hw/vfio/pci.c

   1 /*
   2  * vfio based device assignment support
   3  *
   4  * Copyright Red Hat, Inc. 2012
   5  *
   6  * Authors:
   7  *  Alex Williamson <alex.williamson@redhat.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  * Based on qemu-kvm device-assignment:
  13  *  Adapted for KVM by Qumranet.
  14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
  15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
  16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
  17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
  18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
  19  */
  20
  21 #include <dirent.h>
  22 #include <linux/vfio.h>
  23 #include <sys/ioctl.h>
  24 #include <sys/mman.h>
  25 #include <sys/stat.h>
  26 #include <sys/types.h>
  27 #include <unistd.h>
  28
  29 #include "config.h"
  30 #include "exec/address-spaces.h"
  31 #include "exec/memory.h"
  32 #include "hw/pci/msi.h"
  33 #include "hw/pci/msix.h"
  34 #include "hw/pci/pci.h"
  35 #include "qemu-common.h"
  36 #include "qemu/error-report.h"
  37 #include "qemu/event_notifier.h"
  38 #include "qemu/queue.h"
  39 #include "qemu/range.h"
  40 #include "sysemu/kvm.h"
  41 #include "sysemu/sysemu.h"
  42 #include "trace.h"
  43 #include "hw/vfio/vfio.h"
  44
/*
 * Extra debugging, trap acceleration paths for more logging.
 *
 * Setting a switch to 0 makes the code that tests it skip the matching
 * fast path: VFIO_ALLOW_KVM_INTX is checked by vfio_enable_intx_kvm() and
 * VFIO_ALLOW_KVM_MSI / VFIO_ALLOW_KVM_MSIX by vfio_add_kvm_msi_virq().
 * NOTE(review): VFIO_ALLOW_MMAP presumably gates BAR mmaps the same way;
 * confirm against its users elsewhere in the file.
 */
#define VFIO_ALLOW_MMAP 1
#define VFIO_ALLOW_KVM_INTX 1
#define VFIO_ALLOW_KVM_MSI 1
#define VFIO_ALLOW_KVM_MSIX 1
  50
/* Forward declaration: VFIOQuirk points back at the device defined below */
struct VFIOPCIDevice;

/*
 * Descriptor for one device quirk.  Instances are chained through @next
 * onto a QLIST_HEAD(, VFIOQuirk) list such as the ones embedded in VFIOBAR
 * and VFIOVGARegion.
 *
 * NOTE(review): the exact meaning of the individual @data fields is set by
 * the quirk handlers that consume them; confirm against those handlers
 * before relying on the field names alone.
 */
typedef struct VFIOQuirk {
    MemoryRegion mem;
    struct VFIOPCIDevice *vdev; /* device this quirk refers to */
    QLIST_ENTRY(VFIOQuirk) next; /* linkage on a list of quirks */
    struct {
        /* bit-fields sized TARGET_PAGE_BITS can hold an in-page offset */
        uint32_t base_offset:TARGET_PAGE_BITS;
        uint32_t address_offset:TARGET_PAGE_BITS;
        uint32_t address_size:3;
        uint32_t bar:3;

        uint32_t address_match;
        uint32_t address_mask;

        uint32_t address_val:TARGET_PAGE_BITS;
        uint32_t data_offset:TARGET_PAGE_BITS;
        uint32_t data_size:3;

        uint8_t flags;
        uint8_t read_flags;
        uint8_t write_flags;
    } data;
} VFIOQuirk;
  75
/*
 * State kept for one BAR of an assigned device.  VFIOPCIDevice embeds an
 * array of these (bars[]), one per region except the ROM.
 */
typedef struct VFIOBAR {
    off_t fd_offset; /* offset of BAR within device fd */
    int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
    MemoryRegion mem; /* slow, read/write access */
    MemoryRegion mmap_mem; /* direct mapped access */
    void *mmap; /* NOTE(review): presumably the host mapping behind mmap_mem */
    size_t size;
    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
    uint8_t nr; /* cache the BAR number for debug */
    bool ioport;
    bool mem64;
    QLIST_HEAD(, VFIOQuirk) quirks; /* quirks attached to this BAR */
} VFIOBAR;
  89
/*
 * One legacy VGA range of an assigned device.  VFIOVGA holds
 * QEMU_PCI_VGA_NUM_REGIONS of these.
 */
typedef struct VFIOVGARegion {
    MemoryRegion mem;
    off_t offset;
    int nr;
    QLIST_HEAD(, VFIOQuirk) quirks; /* quirks attached to this region */
} VFIOVGARegion;
  96
/*
 * VGA state of an assigned device: a file descriptor and offset plus the
 * array of per-range VFIOVGARegion entries.  Embedded in VFIOPCIDevice as
 * the vga member.
 */
typedef struct VFIOVGA {
    off_t fd_offset;
    int fd;
    VFIOVGARegion region[QEMU_PCI_VGA_NUM_REGIONS];
} VFIOVGA;
 102
/*
 * INTx (legacy interrupt) state of an assigned device, embedded in
 * VFIOPCIDevice as the intx member.
 */
typedef struct VFIOINTx {
    bool pending; /* interrupt pending */
    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
    uint8_t pin; /* which pin to pull for qemu_set_irq */
    EventNotifier interrupt; /* eventfd triggered on interrupt */
    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
    PCIINTxRoute route; /* routing info for QEMU bypass */
    uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
    QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
} VFIOINTx;
 113
/* Per-vector state for MSI and MSI-X; stored in VFIOPCIDevice.msi_vectors */
typedef struct VFIOMSIVector {
    /*
     * Two interrupt paths are configured per vector.  The first, is only used
     * for interrupts injected via QEMU.  This is typically the non-accel path,
     * but may also be used when we want QEMU to handle masking and pending
     * bits.  The KVM path bypasses QEMU and is therefore higher performance,
     * but requires masking at the device.  virq is used to track the MSI route
     * through KVM, thus kvm_interrupt is only available when virq is set to a
     * valid (>= 0) value.
     */
    EventNotifier interrupt;
    EventNotifier kvm_interrupt;
    struct VFIOPCIDevice *vdev; /* back pointer to device */
    int virq; /* KVM route, -1 when none is set up */
    bool use; /* vector initialised; unused vectors are given fd -1 */
} VFIOMSIVector;
 130
 131 enum {
 132     VFIO_INT_NONE = 0,
 133     VFIO_INT_INTx = 1,
 134     VFIO_INT_MSI  = 2,
 135     VFIO_INT_MSIX = 3,
 136 };
 137
/*
 * Ties an AddressSpace to the list of VFIO containers associated with it.
 * Entries are linked through @list.
 */
typedef struct VFIOAddressSpace {
    AddressSpace *as;
    QLIST_HEAD(, VFIOContainer) containers;
    QLIST_ENTRY(VFIOAddressSpace) list;
} VFIOAddressSpace;

/* File-wide list of VFIOAddressSpace entries, initially empty */
static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
    QLIST_HEAD_INITIALIZER(vfio_address_spaces);
 146
/* Forward declaration: VFIOContainer keeps a list of groups defined below */
struct VFIOGroup;

/*
 * Backend-specific container data; it is the only member of the
 * iommu_data union in VFIOContainer.
 * NOTE(review): @error and @initialized are presumably maintained by the
 * memory listener callbacks; confirm against the listener code.
 */
typedef struct VFIOType1 {
    MemoryListener listener;
    int error;
    bool initialized;
} VFIOType1;
 154
/*
 * A VFIO container: the /dev/vfio/vfio file descriptor together with the
 * groups attached to it and the address space it belongs to.
 */
typedef struct VFIOContainer {
    VFIOAddressSpace *space;
    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
    struct {
        /* enable abstraction to support various iommu backends */
        union {
            VFIOType1 type1;
        };
        /* backend hook taking the container itself as its argument */
        void (*release)(struct VFIOContainer *);
    } iommu_data;
    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
    QLIST_HEAD(, VFIOGroup) group_list;
    QLIST_ENTRY(VFIOContainer) next;
} VFIOContainer;
 169
/*
 * Associates a guest IOMMU memory region and a Notifier with a container.
 * Entries are linked through @giommu_next; VFIOContainer.giommu_list is a
 * list of this type.
 */
typedef struct VFIOGuestIOMMU {
    VFIOContainer *container;
    MemoryRegion *iommu;
    Notifier n;
    QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
} VFIOGuestIOMMU;
 176
/*
 * Cache of MSI-X setup plus extra mmap and memory region for split BAR map.
 * @entries is the vector count used to size VFIOPCIDevice.msi_vectors in
 * vfio_enable_msix().
 */
typedef struct VFIOMSIXInfo {
    uint8_t table_bar;
    uint8_t pba_bar;
    uint16_t entries;
    uint32_t table_offset;
    uint32_t pba_offset;
    MemoryRegion mmap_mem;
    void *mmap;
} VFIOMSIXInfo;
 187
/*
 * Per-device state of an assigned PCI device.  The embedded PCIDevice is
 * the member named pdev, which is what DO_UPCAST(VFIOPCIDevice, pdev, ...)
 * relies on to convert a PCIDevice pointer back to this structure.
 */
typedef struct VFIOPCIDevice {
    PCIDevice pdev;
    int fd; /* device fd, target of the VFIO_DEVICE_* ioctls */
    VFIOINTx intx;
    unsigned int config_size;
    uint8_t *emulated_config_bits; /* QEMU emulated bits, little-endian */
    off_t config_offset; /* Offset of config space region within device fd */
    unsigned int rom_size;
    off_t rom_offset; /* Offset of ROM region within device fd */
    void *rom;
    int msi_cap_size;
    VFIOMSIVector *msi_vectors; /* array allocated when MSI/MSI-X is enabled */
    VFIOMSIXInfo *msix;
    int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
    int interrupt; /* Current interrupt type */
    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
    VFIOVGA vga; /* 0xa0000, 0x3b0, 0x3c0 */
    PCIHostDeviceAddress host; /* host domain/bus/slot/function, used in traces */
    QLIST_ENTRY(VFIOPCIDevice) next;
    struct VFIOGroup *group;
    EventNotifier err_notifier;
    uint32_t features; /* bitmask of VFIO_FEATURE_* flags */
#define VFIO_FEATURE_ENABLE_VGA_BIT 0
#define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
    int32_t bootindex;
    uint8_t pm_cap;
    bool reset_works;
    bool has_vga;
    bool pci_aer;
    bool has_flr;
    bool has_pm_reset;
    bool needs_reset;
    bool rom_read_failed;
} VFIOPCIDevice;
 222
/*
 * A VFIO group: its file descriptor and id, the container it is attached
 * to and the devices that belong to it.  The structure carries two list
 * linkages, @next and @container_next.
 * NOTE(review): @container_next presumably links the group onto
 * VFIOContainer.group_list and @next onto the file-wide group_list;
 * confirm against the list insertion code.
 */
typedef struct VFIOGroup {
    int fd;
    int groupid;
    VFIOContainer *container;
    QLIST_HEAD(, VFIOPCIDevice) device_list;
    QLIST_ENTRY(VFIOGroup) next;
    QLIST_ENTRY(VFIOGroup) container_next;
} VFIOGroup;
 231
/* One vendor id / device id pair; element type of romblacklist[] */
typedef struct VFIORomBlacklistEntry {
    uint16_t vendor_id;
    uint16_t device_id;
} VFIORomBlacklistEntry;
 236
 237 /*
 238  * List of device ids/vendor ids for which to disable
 239  * option rom loading. This avoids the guest hangs during rom
 240  * execution as noticed with the BCM 57810 card for lack of a
 241  * more better way to handle such issues.
 242  * The  user can still override by specifying a romfile or
 243  * rombar=1.
 244  * Please see https://bugs.launchpad.net/qemu/+bug/1284874
 245  * for an analysis of the 57810 card hang. When adding
 246  * a new vendor id/device id combination below, please also add
 247  * your card/environment details and information that could
 248  * help in debugging to the bug tracking this issue
 249  */
 250 static const VFIORomBlacklistEntry romblacklist[] = {
 251     /* Broadcom BCM 57810 */
 252     { 0x14e4, 0x168e }
 253 };
 254
/*
 * NOTE(review): presumably the byte length of the MSI-X capability in
 * config space; confirm against its users.
 */
#define MSIX_CAP_LENGTH 12

/* File-wide list of VFIOGroup entries, initially empty */
static QLIST_HEAD(, VFIOGroup)
    group_list = QLIST_HEAD_INITIALIZER(group_list);
 259
#ifdef CONFIG_KVM
/*
 * We have a single VFIO pseudo device per KVM VM.  Once created it lives
 * for the life of the VM.  Closing the file descriptor only drops our
 * reference to it and the device's reference to kvm.  Therefore once
 * initialized, this file descriptor is only released on QEMU exit and
 * we'll re-use it should another vfio device be attached before then.
 *
 * The variable starts out as -1, i.e. no descriptor has been stored yet.
 */
static int vfio_kvm_device_fd = -1;
#endif
 270
/*
 * Prototypes for static helpers that the interrupt code below calls (or
 * that belong with them) before their definitions appear in this file.
 */
static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                  uint32_t val, int len);
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
 276
 277 /*
 278  * Common VFIO interrupt disable
 279  */
 280 static void vfio_disable_irqindex(VFIOPCIDevice *vdev, int index)
 281 {
 282     struct vfio_irq_set irq_set = {
 283         .argsz = sizeof(irq_set),
 284         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
 285         .index = index,
 286         .start = 0,
 287         .count = 0,
 288     };
 289
 290     ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 291 }
 292
 293 /*
 294  * INTx
 295  */
 296 static void vfio_unmask_single_irqindex(VFIOPCIDevice *vdev, int index)
 297 {
 298     struct vfio_irq_set irq_set = {
 299         .argsz = sizeof(irq_set),
 300         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
 301         .index = index,
 302         .start = 0,
 303         .count = 1,
 304     };
 305
 306     ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 307 }
 308
 309 #ifdef CONFIG_KVM /* Unused outside of CONFIG_KVM code */
 310 static void vfio_mask_single_irqindex(VFIOPCIDevice *vdev, int index)
 311 {
 312     struct vfio_irq_set irq_set = {
 313         .argsz = sizeof(irq_set),
 314         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
 315         .index = index,
 316         .start = 0,
 317         .count = 1,
 318     };
 319
 320     ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 321 }
 322 #endif
 323
 324 /*
 325  * Disabling BAR mmaping can be slow, but toggling it around INTx can
 326  * also be a huge overhead.  We try to get the best of both worlds by
 327  * waiting until an interrupt to disable mmaps (subsequent transitions
 328  * to the same state are effectively no overhead).  If the interrupt has
 329  * been serviced and the time gap is long enough, we re-enable mmaps for
 330  * performance.  This works well for things like graphics cards, which
 331  * may not use their interrupt at all and are penalized to an unusable
 332  * level by read/write BAR traps.  Other devices, like NICs, have more
 333  * regular interrupts and see much better latency by staying in non-mmap
 334  * mode.  We therefore set the default mmap_timeout such that a ping
 335  * is just enough to keep the mmap disabled.  Users can experiment with
 336  * other options with the x-intx-mmap-timeout-ms parameter (a value of
 337  * zero disables the timer).
 338  */
 339 static void vfio_intx_mmap_enable(void *opaque)
 340 {
 341     VFIOPCIDevice *vdev = opaque;
 342
 343     if (vdev->intx.pending) {
 344         timer_mod(vdev->intx.mmap_timer,
 345                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
 346         return;
 347     }
 348
 349     vfio_mmap_set_enabled(vdev, true);
 350 }
 351
 352 static void vfio_intx_interrupt(void *opaque)
 353 {
 354     VFIOPCIDevice *vdev = opaque;
 355
 356     if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
 357         return;
 358     }
 359
 360     trace_vfio_intx_interrupt(vdev->host.domain, vdev->host.bus,
 361                               vdev->host.slot, vdev->host.function,
 362                               'A' + vdev->intx.pin);
 363
 364     vdev->intx.pending = true;
 365     pci_irq_assert(&vdev->pdev);
 366     vfio_mmap_set_enabled(vdev, false);
 367     if (vdev->intx.mmap_timeout) {
 368         timer_mod(vdev->intx.mmap_timer,
 369                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
 370     }
 371 }
 372
 373 static void vfio_eoi(VFIOPCIDevice *vdev)
 374 {
 375     if (!vdev->intx.pending) {
 376         return;
 377     }
 378
 379     trace_vfio_eoi(vdev->host.domain, vdev->host.bus,
 380                    vdev->host.slot, vdev->host.function);
 381
 382     vdev->intx.pending = false;
 383     pci_irq_deassert(&vdev->pdev);
 384     vfio_unmask_single_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
 385 }
 386
/*
 * Try to deliver INTx through a KVM irqfd with a resample eventfd, so that
 * interrupts and their unmask bypass QEMU.
 *
 * Returns without touching anything when VFIO_ALLOW_KVM_INTX is 0, KVM
 * irqfds or resamplefds are not enabled, or the INTx route is not
 * PCI_INTX_ENABLED.  On success intx.kvm_accel is set.  On any setup
 * failure the steps already taken are undone, the QEMU fd handler is
 * reinstalled and the interrupt is unmasked again, so delivery falls back
 * to vfio_intx_interrupt().  Without CONFIG_KVM the body is empty.
 */
static void vfio_enable_intx_kvm(VFIOPCIDevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_RESAMPLE,
    };
    struct vfio_irq_set *irq_set;
    int ret, argsz;
    int32_t *pfd;

    if (!VFIO_ALLOW_KVM_INTX || !kvm_irqfds_enabled() ||
        vdev->intx.route.mode != PCI_INTX_ENABLED ||
        !kvm_resamplefds_enabled()) {
        return;
    }

    /* Get to a known interrupt state */
    /* (QEMU stops listening, device masked, QEMU-side IRQ de-asserted) */
    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
    vfio_mask_single_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Get an eventfd for resample/unmask */
    if (event_notifier_init(&vdev->intx.unmask, 0)) {
        error_report("vfio: Error: event_notifier_init failed eoi");
        goto fail;
    }

    /* KVM triggers it, VFIO listens for it */
    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);

    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to setup resample irqfd: %m");
        goto fail_irqfd;
    }

    /* Request header followed by a single int32_t eventfd */
    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = irqfd.resamplefd;

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx unmask fd: %m");
        goto fail_vfio;
    }

    /* Let'em rip */
    vfio_unmask_single_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);

    vdev->intx.kvm_accel = true;

    trace_vfio_enable_intx_kvm(vdev->host.domain, vdev->host.bus,
                               vdev->host.slot, vdev->host.function);

    return;

    /* Unwind in reverse order of setup; each label undoes one step */
fail_vfio:
    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
fail_irqfd:
    event_notifier_cleanup(&vdev->intx.unmask);
fail:
    /* Hand the interrupt eventfd back to QEMU and unmask the device */
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
    vfio_unmask_single_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
#endif
}
 464
 465 static void vfio_disable_intx_kvm(VFIOPCIDevice *vdev)
 466 {
 467 #ifdef CONFIG_KVM
 468     struct kvm_irqfd irqfd = {
 469         .fd = event_notifier_get_fd(&vdev->intx.interrupt),
 470         .gsi = vdev->intx.route.irq,
 471         .flags = KVM_IRQFD_FLAG_DEASSIGN,
 472     };
 473
 474     if (!vdev->intx.kvm_accel) {
 475         return;
 476     }
 477
 478     /*
 479      * Get to a known state, hardware masked, QEMU ready to accept new
 480      * interrupts, QEMU IRQ de-asserted.
 481      */
 482     vfio_mask_single_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
 483     vdev->intx.pending = false;
 484     pci_irq_deassert(&vdev->pdev);
 485
 486     /* Tell KVM to stop listening for an INTx irqfd */
 487     if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
 488         error_report("vfio: Error: Failed to disable INTx irqfd: %m");
 489     }
 490
 491     /* We only need to close the eventfd for VFIO to cleanup the kernel side */
 492     event_notifier_cleanup(&vdev->intx.unmask);
 493
 494     /* QEMU starts listening for interrupt events. */
 495     qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
 496
 497     vdev->intx.kvm_accel = false;
 498
 499     /* If we've missed an event, let it re-fire through QEMU */
 500     vfio_unmask_single_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
 501
 502     trace_vfio_disable_intx_kvm(vdev->host.domain, vdev->host.bus,
 503                                 vdev->host.slot, vdev->host.function);
 504 #endif
 505 }
 506
 507 static void vfio_update_irq(PCIDevice *pdev)
 508 {
 509     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
 510     PCIINTxRoute route;
 511
 512     if (vdev->interrupt != VFIO_INT_INTx) {
 513         return;
 514     }
 515
 516     route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
 517
 518     if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
 519         return; /* Nothing changed */
 520     }
 521
 522     trace_vfio_update_irq(vdev->host.domain, vdev->host.bus,
 523                           vdev->host.slot, vdev->host.function,
 524                           vdev->intx.route.irq, route.irq);
 525
 526     vfio_disable_intx_kvm(vdev);
 527
 528     vdev->intx.route = route;
 529
 530     if (route.mode != PCI_INTX_ENABLED) {
 531         return;
 532     }
 533
 534     vfio_enable_intx_kvm(vdev);
 535
 536     /* Re-enable the interrupt in cased we missed an EOI */
 537     vfio_eoi(vdev);
 538 }
 539
 540 static int vfio_enable_intx(VFIOPCIDevice *vdev)
 541 {
 542     uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
 543     int ret, argsz;
 544     struct vfio_irq_set *irq_set;
 545     int32_t *pfd;
 546
 547     if (!pin) {
 548         return 0;
 549     }
 550
 551     vfio_disable_interrupts(vdev);
 552
 553     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
 554     pci_config_set_interrupt_pin(vdev->pdev.config, pin);
 555
 556 #ifdef CONFIG_KVM
 557     /*
 558      * Only conditional to avoid generating error messages on platforms
 559      * where we won't actually use the result anyway.
 560      */
 561     if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
 562         vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
 563                                                         vdev->intx.pin);
 564     }
 565 #endif
 566
 567     ret = event_notifier_init(&vdev->intx.interrupt, 0);
 568     if (ret) {
 569         error_report("vfio: Error: event_notifier_init failed");
 570         return ret;
 571     }
 572
 573     argsz = sizeof(*irq_set) + sizeof(*pfd);
 574
 575     irq_set = g_malloc0(argsz);
 576     irq_set->argsz = argsz;
 577     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 578     irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 579     irq_set->start = 0;
 580     irq_set->count = 1;
 581     pfd = (int32_t *)&irq_set->data;
 582
 583     *pfd = event_notifier_get_fd(&vdev->intx.interrupt);
 584     qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);
 585
 586     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
 587     g_free(irq_set);
 588     if (ret) {
 589         error_report("vfio: Error: Failed to setup INTx fd: %m");
 590         qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
 591         event_notifier_cleanup(&vdev->intx.interrupt);
 592         return -errno;
 593     }
 594
 595     vfio_enable_intx_kvm(vdev);
 596
 597     vdev->interrupt = VFIO_INT_INTx;
 598
 599     trace_vfio_enable_intx(vdev->host.domain, vdev->host.bus,
 600                            vdev->host.slot, vdev->host.function);
 601
 602     return 0;
 603 }
 604
 605 static void vfio_disable_intx(VFIOPCIDevice *vdev)
 606 {
 607     int fd;
 608
 609     timer_del(vdev->intx.mmap_timer);
 610     vfio_disable_intx_kvm(vdev);
 611     vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
 612     vdev->intx.pending = false;
 613     pci_irq_deassert(&vdev->pdev);
 614     vfio_mmap_set_enabled(vdev, true);
 615
 616     fd = event_notifier_get_fd(&vdev->intx.interrupt);
 617     qemu_set_fd_handler(fd, NULL, NULL, vdev);
 618     event_notifier_cleanup(&vdev->intx.interrupt);
 619
 620     vdev->interrupt = VFIO_INT_NONE;
 621
 622     trace_vfio_disable_intx(vdev->host.domain, vdev->host.bus,
 623                             vdev->host.slot, vdev->host.function);
 624 }
 625
 626 /*
 627  * MSI/X
 628  */
 629 static void vfio_msi_interrupt(void *opaque)
 630 {
 631     VFIOMSIVector *vector = opaque;
 632     VFIOPCIDevice *vdev = vector->vdev;
 633     int nr = vector - vdev->msi_vectors;
 634
 635     if (!event_notifier_test_and_clear(&vector->interrupt)) {
 636         return;
 637     }
 638
 639 #ifdef DEBUG_VFIO
 640     MSIMessage msg;
 641
 642     if (vdev->interrupt == VFIO_INT_MSIX) {
 643         msg = msix_get_message(&vdev->pdev, nr);
 644     } else if (vdev->interrupt == VFIO_INT_MSI) {
 645         msg = msi_get_message(&vdev->pdev, nr);
 646     } else {
 647         abort();
 648     }
 649
 650     trace_vfio_msi_interrupt(vdev->host.domain, vdev->host.bus,
 651                              vdev->host.slot, vdev->host.function,
 652                              nr, msg.address, msg.data);
 653 #endif
 654
 655     if (vdev->interrupt == VFIO_INT_MSIX) {
 656         msix_notify(&vdev->pdev, nr);
 657     } else if (vdev->interrupt == VFIO_INT_MSI) {
 658         msi_notify(&vdev->pdev, nr);
 659     } else {
 660         error_report("vfio: MSI interrupt receieved, but not enabled?");
 661     }
 662 }
 663
 664 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
 665 {
 666     struct vfio_irq_set *irq_set;
 667     int ret = 0, i, argsz;
 668     int32_t *fds;
 669
 670     argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
 671
 672     irq_set = g_malloc0(argsz);
 673     irq_set->argsz = argsz;
 674     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 675     irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
 676     irq_set->start = 0;
 677     irq_set->count = vdev->nr_vectors;
 678     fds = (int32_t *)&irq_set->data;
 679
 680     for (i = 0; i < vdev->nr_vectors; i++) {
 681         int fd = -1;
 682
 683         /*
 684          * MSI vs MSI-X - The guest has direct access to MSI mask and pending
 685          * bits, therefore we always use the KVM signaling path when setup.
 686          * MSI-X mask and pending bits are emulated, so we want to use the
 687          * KVM signaling path only when configured and unmasked.
 688          */
 689         if (vdev->msi_vectors[i].use) {
 690             if (vdev->msi_vectors[i].virq < 0 ||
 691                 (msix && msix_is_masked(&vdev->pdev, i))) {
 692                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
 693             } else {
 694                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
 695             }
 696         }
 697
 698         fds[i] = fd;
 699     }
 700
 701     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
 702
 703     g_free(irq_set);
 704
 705     return ret;
 706 }
 707
 708 static void vfio_add_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage *msg,
 709                                   bool msix)
 710 {
 711     int virq;
 712
 713     if ((msix && !VFIO_ALLOW_KVM_MSIX) ||
 714         (!msix && !VFIO_ALLOW_KVM_MSI) || !msg) {
 715         return;
 716     }
 717
 718     if (event_notifier_init(&vector->kvm_interrupt, 0)) {
 719         return;
 720     }
 721
 722     virq = kvm_irqchip_add_msi_route(kvm_state, *msg);
 723     if (virq < 0) {
 724         event_notifier_cleanup(&vector->kvm_interrupt);
 725         return;
 726     }
 727
 728     if (kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->kvm_interrupt,
 729                                        NULL, virq) < 0) {
 730         kvm_irqchip_release_virq(kvm_state, virq);
 731         event_notifier_cleanup(&vector->kvm_interrupt);
 732         return;
 733     }
 734
 735     vector->virq = virq;
 736 }
 737
 738 static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
 739 {
 740     kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->kvm_interrupt,
 741                                       vector->virq);
 742     kvm_irqchip_release_virq(kvm_state, vector->virq);
 743     vector->virq = -1;
 744     event_notifier_cleanup(&vector->kvm_interrupt);
 745 }
 746
 747 static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg)
 748 {
 749     kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
 750 }
 751
/*
 * Bring MSI-X vector @nr into use and point the VFIO device at the right
 * eventfd for it.
 *
 * @pdev:    PCI device embedded in a VFIOPCIDevice
 * @nr:      vector number, index into vdev->msi_vectors
 * @msg:     MSI message for the KVM route; NULL means no KVM route (an
 *           existing one is removed, none is created)
 * @handler: fd handler installed on the vector's userspace eventfd
 *
 * Always returns 0; failures are only reported through error_report().
 */
static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                                   MSIMessage *msg, IOHandler *handler)
{
    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
    VFIOMSIVector *vector;
    int ret;

    trace_vfio_msix_vector_do_use(vdev->host.domain, vdev->host.bus,
                                  vdev->host.slot, vdev->host.function,
                                  nr);

    vector = &vdev->msi_vectors[nr];

    /* First use of this vector: one-time initialisation */
    if (!vector->use) {
        vector->vdev = vdev;
        vector->virq = -1;
        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }
        vector->use = true;
        msix_vector_use(pdev, nr);
    }

    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                        handler, NULL, vector);

    /*
     * Attempt to enable route through KVM irqchip,
     * default to userspace handling if unavailable.
     */
    if (vector->virq >= 0) {
        if (!msg) {
            vfio_remove_kvm_msi_virq(vector);
        } else {
            vfio_update_kvm_msi_virq(vector, *msg);
        }
    } else {
        vfio_add_kvm_msi_virq(vector, msg, true);
    }

    /*
     * We don't want to have the host allocate all possible MSI vectors
     * for a device if they're not in use, so we shutdown and incrementally
     * increase them as needed.
     */
    if (vdev->nr_vectors < nr + 1) {
        /* Growing: disable the index and re-enable with the larger count */
        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
        vdev->nr_vectors = nr + 1;
        ret = vfio_enable_vectors(vdev, true);
        if (ret) {
            error_report("vfio: failed to enable vectors, %d", ret);
        }
    } else {
        /* Already within the enabled range: retarget just this vector */
        int argsz;
        struct vfio_irq_set *irq_set;
        int32_t *pfd;

        argsz = sizeof(*irq_set) + sizeof(*pfd);

        irq_set = g_malloc0(argsz);
        irq_set->argsz = argsz;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = nr;
        irq_set->count = 1;
        pfd = (int32_t *)&irq_set->data;

        /* Prefer the KVM eventfd when a route exists */
        if (vector->virq >= 0) {
            *pfd = event_notifier_get_fd(&vector->kvm_interrupt);
        } else {
            *pfd = event_notifier_get_fd(&vector->interrupt);
        }

        ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
        g_free(irq_set);
        if (ret) {
            error_report("vfio: failed to modify vector, %d", ret);
        }
    }

    return 0;
}
 835
 836 static int vfio_msix_vector_use(PCIDevice *pdev,
 837                                 unsigned int nr, MSIMessage msg)
 838 {
 839     return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
 840 }
 841
 842 static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
 843 {
 844     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
 845     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
 846
 847     trace_vfio_msix_vector_release(vdev->host.domain, vdev->host.bus,
 848                                    vdev->host.slot, vdev->host.function,
 849                                    nr);
 850
 851     /*
 852      * There are still old guests that mask and unmask vectors on every
 853      * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
 854      * the KVM setup in place, simply switch VFIO to use the non-bypass
 855      * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
 856      * core will mask the interrupt and set pending bits, allowing it to
 857      * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
 858      */
 859     if (vector->virq >= 0) {
 860         int argsz;
 861         struct vfio_irq_set *irq_set;
 862         int32_t *pfd;
 863
 864         argsz = sizeof(*irq_set) + sizeof(*pfd);
 865
 866         irq_set = g_malloc0(argsz);
 867         irq_set->argsz = argsz;
 868         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
 869                          VFIO_IRQ_SET_ACTION_TRIGGER;
 870         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 871         irq_set->start = nr;
 872         irq_set->count = 1;
 873         pfd = (int32_t *)&irq_set->data;
 874
 875         *pfd = event_notifier_get_fd(&vector->interrupt);
 876
 877         ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
 878
 879         g_free(irq_set);
 880     }
 881 }
 882
 883 static void vfio_enable_msix(VFIOPCIDevice *vdev)
 884 {
 885     vfio_disable_interrupts(vdev);
 886
 887     vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(VFIOMSIVector));
 888
 889     vdev->interrupt = VFIO_INT_MSIX;
 890
 891     /*
 892      * Some communication channels between VF & PF or PF & fw rely on the
 893      * physical state of the device and expect that enabling MSI-X from the
 894      * guest enables the same on the host.  When our guest is Linux, the
 895      * guest driver call to pci_enable_msix() sets the enabling bit in the
 896      * MSI-X capability, but leaves the vector table masked.  We therefore
 897      * can't rely on a vector_use callback (from request_irq() in the guest)
 898      * to switch the physical device into MSI-X mode because that may come a
 899      * long time after pci_enable_msix().  This code enables vector 0 with
 900      * triggering to userspace, then immediately release the vector, leaving
 901      * the physical device with no vectors enabled, but MSI-X enabled, just
 902      * like the guest view.
 903      */
 904     vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
 905     vfio_msix_vector_release(&vdev->pdev, 0);
 906
 907     if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
 908                                   vfio_msix_vector_release, NULL)) {
 909         error_report("vfio: msix_set_vector_notifiers failed");
 910     }
 911
 912     trace_vfio_enable_msix(vdev->host.domain, vdev->host.bus,
 913                            vdev->host.slot, vdev->host.function);
 914 }
 915
 916 static void vfio_enable_msi(VFIOPCIDevice *vdev)
 917 {
 918     int ret, i;
 919
 920     vfio_disable_interrupts(vdev);
 921
 922     vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
 923 retry:
 924     vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));
 925
 926     for (i = 0; i < vdev->nr_vectors; i++) {
 927         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 928         MSIMessage msg = msi_get_message(&vdev->pdev, i);
 929
 930         vector->vdev = vdev;
 931         vector->virq = -1;
 932         vector->use = true;
 933
 934         if (event_notifier_init(&vector->interrupt, 0)) {
 935             error_report("vfio: Error: event_notifier_init failed");
 936         }
 937
 938         qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 939                             vfio_msi_interrupt, NULL, vector);
 940
 941         /*
 942          * Attempt to enable route through KVM irqchip,
 943          * default to userspace handling if unavailable.
 944          */
 945         vfio_add_kvm_msi_virq(vector, &msg, false);
 946     }
 947
 948     /* Set interrupt type prior to possible interrupts */
 949     vdev->interrupt = VFIO_INT_MSI;
 950
 951     ret = vfio_enable_vectors(vdev, false);
 952     if (ret) {
 953         if (ret < 0) {
 954             error_report("vfio: Error: Failed to setup MSI fds: %m");
 955         } else if (ret != vdev->nr_vectors) {
 956             error_report("vfio: Error: Failed to enable %d "
 957                          "MSI vectors, retry with %d", vdev->nr_vectors, ret);
 958         }
 959
 960         for (i = 0; i < vdev->nr_vectors; i++) {
 961             VFIOMSIVector *vector = &vdev->msi_vectors[i];
 962             if (vector->virq >= 0) {
 963                 vfio_remove_kvm_msi_virq(vector);
 964             }
 965             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 966                                 NULL, NULL, NULL);
 967             event_notifier_cleanup(&vector->interrupt);
 968         }
 969
 970         g_free(vdev->msi_vectors);
 971
 972         if (ret > 0 && ret != vdev->nr_vectors) {
 973             vdev->nr_vectors = ret;
 974             goto retry;
 975         }
 976         vdev->nr_vectors = 0;
 977
 978         /*
 979          * Failing to setup MSI doesn't really fall within any specification.
 980          * Let's try leaving interrupts disabled and hope the guest figures
 981          * out to fall back to INTx for this device.
 982          */
 983         error_report("vfio: Error: Failed to enable MSI");
 984         vdev->interrupt = VFIO_INT_NONE;
 985
 986         return;
 987     }
 988
 989     trace_vfio_enable_msi(vdev->host.domain, vdev->host.bus,
 990                           vdev->host.slot, vdev->host.function,
 991                           vdev->nr_vectors);
 992 }
 993
 994 static void vfio_disable_msi_common(VFIOPCIDevice *vdev)
 995 {
 996     int i;
 997
 998     for (i = 0; i < vdev->nr_vectors; i++) {
 999         VFIOMSIVector *vector = &vdev->msi_vectors[i];
1000         if (vdev->msi_vectors[i].use) {
1001             if (vector->virq >= 0) {
1002                 vfio_remove_kvm_msi_virq(vector);
1003             }
1004             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
1005                                 NULL, NULL, NULL);
1006             event_notifier_cleanup(&vector->interrupt);
1007         }
1008     }
1009
1010     g_free(vdev->msi_vectors);
1011     vdev->msi_vectors = NULL;
1012     vdev->nr_vectors = 0;
1013     vdev->interrupt = VFIO_INT_NONE;
1014
1015     vfio_enable_intx(vdev);
1016 }
1017
1018 static void vfio_disable_msix(VFIOPCIDevice *vdev)
1019 {
1020     int i;
1021
1022     msix_unset_vector_notifiers(&vdev->pdev);
1023
1024     /*
1025      * MSI-X will only release vectors if MSI-X is still enabled on the
1026      * device, check through the rest and release it ourselves if necessary.
1027      */
1028     for (i = 0; i < vdev->nr_vectors; i++) {
1029         if (vdev->msi_vectors[i].use) {
1030             vfio_msix_vector_release(&vdev->pdev, i);
1031             msix_vector_unuse(&vdev->pdev, i);
1032         }
1033     }
1034
1035     if (vdev->nr_vectors) {
1036         vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
1037     }
1038
1039     vfio_disable_msi_common(vdev);
1040
1041     trace_vfio_disable_msix(vdev->host.domain, vdev->host.bus,
1042                             vdev->host.slot, vdev->host.function);
1043 }
1044
/*
 * Leave MSI mode: disable the MSI IRQ index, then run the teardown shared
 * with MSI-X (vfio_disable_msi_common()).
 */
static void vfio_disable_msi(VFIOPCIDevice *vdev)
{
    vfio_disable_irqindex(vdev, VFIO_PCI_MSI_IRQ_INDEX);
    vfio_disable_msi_common(vdev);

    trace_vfio_disable_msi(vdev->host.domain, vdev->host.bus,
                           vdev->host.slot, vdev->host.function);
}
1053
1054 static void vfio_update_msi(VFIOPCIDevice *vdev)
1055 {
1056     int i;
1057
1058     for (i = 0; i < vdev->nr_vectors; i++) {
1059         VFIOMSIVector *vector = &vdev->msi_vectors[i];
1060         MSIMessage msg;
1061
1062         if (!vector->use || vector->virq < 0) {
1063             continue;
1064         }
1065
1066         msg = msi_get_message(&vdev->pdev, i);
1067         vfio_update_kvm_msi_virq(vector, msg);
1068     }
1069 }
1070
1071 /*
1072  * IO Port/MMIO - Beware of the endians, VFIO is always little endian
1073  */
1074 static void vfio_bar_write(void *opaque, hwaddr addr,
1075                            uint64_t data, unsigned size)
1076 {
1077     VFIOBAR *bar = opaque;
1078     union {
1079         uint8_t byte;
1080         uint16_t word;
1081         uint32_t dword;
1082         uint64_t qword;
1083     } buf;
1084
1085     switch (size) {
1086     case 1:
1087         buf.byte = data;
1088         break;
1089     case 2:
1090         buf.word = cpu_to_le16(data);
1091         break;
1092     case 4:
1093         buf.dword = cpu_to_le32(data);
1094         break;
1095     default:
1096         hw_error("vfio: unsupported write size, %d bytes", size);
1097         break;
1098     }
1099
1100     if (pwrite(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
1101         error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
1102                      __func__, addr, data, size);
1103     }
1104
1105 #ifdef DEBUG_VFIO
1106     {
1107         VFIOPCIDevice *vdev = container_of(bar, VFIOPCIDevice, bars[bar->nr]);
1108
1109         trace_vfio_bar_write(vdev->host.domain, vdev->host.bus,
1110                              vdev->host.slot, vdev->host.function,
1111                              region->nr, addr, data, size);
1112     }
1113 #endif
1114
1115     /*
1116      * A read or write to a BAR always signals an INTx EOI.  This will
1117      * do nothing if not pending (including not in INTx mode).  We assume
1118      * that a BAR access is in response to an interrupt and that BAR
1119      * accesses will service the interrupt.  Unfortunately, we don't know
1120      * which access will service the interrupt, so we're potentially
1121      * getting quite a few host interrupts per guest interrupt.
1122      */
1123     vfio_eoi(container_of(bar, VFIOPCIDevice, bars[bar->nr]));
1124 }
1125
1126 static uint64_t vfio_bar_read(void *opaque,
1127                               hwaddr addr, unsigned size)
1128 {
1129     VFIOBAR *bar = opaque;
1130     union {
1131         uint8_t byte;
1132         uint16_t word;
1133         uint32_t dword;
1134         uint64_t qword;
1135     } buf;
1136     uint64_t data = 0;
1137
1138     if (pread(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
1139         error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1140                      __func__, addr, size);
1141         return (uint64_t)-1;
1142     }
1143
1144     switch (size) {
1145     case 1:
1146         data = buf.byte;
1147         break;
1148     case 2:
1149         data = le16_to_cpu(buf.word);
1150         break;
1151     case 4:
1152         data = le32_to_cpu(buf.dword);
1153         break;
1154     default:
1155         hw_error("vfio: unsupported read size, %d bytes", size);
1156         break;
1157     }
1158
1159 #ifdef DEBUG_VFIO
1160     {
1161         VFIOPCIDevice *vdev = container_of(bar, VFIOPCIDevice, bars[bar->nr]);
1162
1163         trace_vfio_bar_read(vdev->host.domain, vdev->host.bus,
1164                             vdev->host.slot, vdev->host.function,
1165                             region->nr, addr, size, data);
1166     }
1167 #endif
1168
1169     /* Same as write above */
1170     vfio_eoi(container_of(bar, VFIOPCIDevice, bars[bar->nr]));
1171
1172     return data;
1173 }
1174
1175 static const MemoryRegionOps vfio_bar_ops = {
1176     .read = vfio_bar_read,
1177     .write = vfio_bar_write,
1178     .endianness = DEVICE_LITTLE_ENDIAN,
1179 };
1180
1181 static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
1182 {
1183     struct vfio_region_info reg_info = {
1184         .argsz = sizeof(reg_info),
1185         .index = VFIO_PCI_ROM_REGION_INDEX
1186     };
1187     uint64_t size;
1188     off_t off = 0;
1189     size_t bytes;
1190
1191     if (ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info)) {
1192         error_report("vfio: Error getting ROM info: %m");
1193         return;
1194     }
1195
1196     trace_vfio_pci_load_rom(vdev->host.domain, vdev->host.bus,
1197                             vdev->host.slot, vdev->host.function,
1198                             (unsigned long)reg_info.size,
1199                             (unsigned long)reg_info.offset,
1200                             (unsigned long)reg_info.flags);
1201
1202     vdev->rom_size = size = reg_info.size;
1203     vdev->rom_offset = reg_info.offset;
1204
1205     if (!vdev->rom_size) {
1206         vdev->rom_read_failed = true;
1207         error_report("vfio-pci: Cannot read device rom at "
1208                     "%04x:%02x:%02x.%x",
1209                     vdev->host.domain, vdev->host.bus, vdev->host.slot,
1210                     vdev->host.function);
1211         error_printf("Device option ROM contents are probably invalid "
1212                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
1213                     "or load from file with romfile=\n");
1214         return;
1215     }
1216
1217     vdev->rom = g_malloc(size);
1218     memset(vdev->rom, 0xff, size);
1219
1220     while (size) {
1221         bytes = pread(vdev->fd, vdev->rom + off, size, vdev->rom_offset + off);
1222         if (bytes == 0) {
1223             break;
1224         } else if (bytes > 0) {
1225             off += bytes;
1226             size -= bytes;
1227         } else {
1228             if (errno == EINTR || errno == EAGAIN) {
1229                 continue;
1230             }
1231             error_report("vfio: Error reading device ROM: %m");
1232             break;
1233         }
1234     }
1235 }
1236
1237 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
1238 {
1239     VFIOPCIDevice *vdev = opaque;
1240     union {
1241         uint8_t byte;
1242         uint16_t word;
1243         uint32_t dword;
1244         uint64_t qword;
1245     } val;
1246     uint64_t data = 0;
1247
1248     /* Load the ROM lazily when the guest tries to read it */
1249     if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
1250         vfio_pci_load_rom(vdev);
1251     }
1252
1253     memcpy(&val, vdev->rom + addr,
1254            (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
1255
1256     switch (size) {
1257     case 1:
1258         data = val.byte;
1259         break;
1260     case 2:
1261         data = le16_to_cpu(val.word);
1262         break;
1263     case 4:
1264         data = le32_to_cpu(val.dword);
1265         break;
1266     default:
1267         hw_error("vfio: unsupported read size, %d bytes\n", size);
1268         break;
1269     }
1270
1271     trace_vfio_rom_read(vdev->host.domain, vdev->host.bus,
1272                         vdev->host.slot, vdev->host.function,
1273                         addr, size, data);
1274
1275     return data;
1276 }
1277
/* Write handler for the option ROM BAR: writes are silently discarded. */
static void vfio_rom_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
}
1282
1283 static const MemoryRegionOps vfio_rom_ops = {
1284     .read = vfio_rom_read,
1285     .write = vfio_rom_write,
1286     .endianness = DEVICE_LITTLE_ENDIAN,
1287 };
1288
1289 static bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev)
1290 {
1291     PCIDevice *pdev = &vdev->pdev;
1292     uint16_t vendor_id, device_id;
1293     int count = 0;
1294
1295     vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
1296     device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
1297
1298     while (count < ARRAY_SIZE(romblacklist)) {
1299         if (romblacklist[count].vendor_id == vendor_id &&
1300             romblacklist[count].device_id == device_id) {
1301                 return true;
1302         }
1303         count++;
1304     }
1305
1306     return false;
1307 }
1308
1309 static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
1310 {
1311     uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
1312     off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
1313     DeviceState *dev = DEVICE(vdev);
1314     char name[32];
1315
1316     if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
1317         /* Since pci handles romfile, just print a message and return */
1318         if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) {
1319             error_printf("Warning : Device at %04x:%02x:%02x.%x "
1320                          "is known to cause system instability issues during "
1321                          "option rom execution. "
1322                          "Proceeding anyway since user specified romfile\n",
1323                          vdev->host.domain, vdev->host.bus, vdev->host.slot,
1324                          vdev->host.function);
1325         }
1326         return;
1327     }
1328
1329     /*
1330      * Use the same size ROM BAR as the physical device.  The contents
1331      * will get filled in later when the guest tries to read it.
1332      */
1333     if (pread(vdev->fd, &orig, 4, offset) != 4 ||
1334         pwrite(vdev->fd, &size, 4, offset) != 4 ||
1335         pread(vdev->fd, &size, 4, offset) != 4 ||
1336         pwrite(vdev->fd, &orig, 4, offset) != 4) {
1337         error_report("%s(%04x:%02x:%02x.%x) failed: %m",
1338                      __func__, vdev->host.domain, vdev->host.bus,
1339                      vdev->host.slot, vdev->host.function);
1340         return;
1341     }
1342
1343     size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
1344
1345     if (!size) {
1346         return;
1347     }
1348
1349     if (vfio_blacklist_opt_rom(vdev)) {
1350         if (dev->opts && qemu_opt_get(dev->opts, "rombar")) {
1351             error_printf("Warning : Device at %04x:%02x:%02x.%x "
1352                          "is known to cause system instability issues during "
1353                          "option rom execution. "
1354                          "Proceeding anyway since user specified non zero value for "
1355                          "rombar\n",
1356                          vdev->host.domain, vdev->host.bus, vdev->host.slot,
1357                          vdev->host.function);
1358         } else {
1359             error_printf("Warning : Rom loading for device at "
1360                          "%04x:%02x:%02x.%x has been disabled due to "
1361                          "system instability issues. "
1362                          "Specify rombar=1 or romfile to force\n",
1363                          vdev->host.domain, vdev->host.bus, vdev->host.slot,
1364                          vdev->host.function);
1365             return;
1366         }
1367     }
1368
1369     trace_vfio_pci_size_rom(vdev->host.domain, vdev->host.bus,
1370                             vdev->host.slot, vdev->host.function,
1371                             size);
1372
1373     snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
1374              vdev->host.domain, vdev->host.bus, vdev->host.slot,
1375              vdev->host.function);
1376
1377     memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
1378                           &vfio_rom_ops, vdev, name, size);
1379
1380     pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
1381                      PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
1382
1383     vdev->pdev.has_rom = true;
1384     vdev->rom_read_failed = false;
1385 }
1386
1387 static void vfio_vga_write(void *opaque, hwaddr addr,
1388                            uint64_t data, unsigned size)
1389 {
1390     VFIOVGARegion *region = opaque;
1391     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1392     union {
1393         uint8_t byte;
1394         uint16_t word;
1395         uint32_t dword;
1396         uint64_t qword;
1397     } buf;
1398     off_t offset = vga->fd_offset + region->offset + addr;
1399
1400     switch (size) {
1401     case 1:
1402         buf.byte = data;
1403         break;
1404     case 2:
1405         buf.word = cpu_to_le16(data);
1406         break;
1407     case 4:
1408         buf.dword = cpu_to_le32(data);
1409         break;
1410     default:
1411         hw_error("vfio: unsupported write size, %d bytes", size);
1412         break;
1413     }
1414
1415     if (pwrite(vga->fd, &buf, size, offset) != size) {
1416         error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
1417                      __func__, region->offset + addr, data, size);
1418     }
1419
1420     trace_vfio_vga_write(region->offset + addr, data, size);
1421 }
1422
1423 static uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
1424 {
1425     VFIOVGARegion *region = opaque;
1426     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1427     union {
1428         uint8_t byte;
1429         uint16_t word;
1430         uint32_t dword;
1431         uint64_t qword;
1432     } buf;
1433     uint64_t data = 0;
1434     off_t offset = vga->fd_offset + region->offset + addr;
1435
1436     if (pread(vga->fd, &buf, size, offset) != size) {
1437         error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1438                      __func__, region->offset + addr, size);
1439         return (uint64_t)-1;
1440     }
1441
1442     switch (size) {
1443     case 1:
1444         data = buf.byte;
1445         break;
1446     case 2:
1447         data = le16_to_cpu(buf.word);
1448         break;
1449     case 4:
1450         data = le32_to_cpu(buf.dword);
1451         break;
1452     default:
1453         hw_error("vfio: unsupported read size, %d bytes", size);
1454         break;
1455     }
1456
1457     trace_vfio_vga_read(region->offset + addr, size, data);
1458
1459     return data;
1460 }
1461
1462 static const MemoryRegionOps vfio_vga_ops = {
1463     .read = vfio_vga_read,
1464     .write = vfio_vga_write,
1465     .endianness = DEVICE_LITTLE_ENDIAN,
1466 };
1467
1468 /*
1469  * Device specific quirks
1470  */
1471
/*
 * Is range1 fully contained within range2?
 *
 * Each range is given as a first address and a length in bytes.  The test
 * is written with subtractions only, so it stays correct when
 * first + len would wrap around the 64-bit address space (including a
 * range2 that ends exactly at 2^64).
 */
static bool vfio_range_contained(uint64_t first1, uint64_t len1,
                                 uint64_t first2, uint64_t len2)
{
    if (first1 < first2 || len1 > len2) {
        return false;
    }

    /* Distance into range2 must leave room for all of range1 */
    return first1 - first2 <= len2 - len1;
}
1477
/*
 * Return true when mask is non-zero and every bit of mask is set in flags.
 * An empty mask never counts as enabled.
 */
static bool vfio_flags_enabled(uint8_t flags, uint8_t mask)
{
    if (mask == 0) {
        return false;
    }

    return (flags & mask) == mask;
}
1482
1483 static uint64_t vfio_generic_window_quirk_read(void *opaque,
1484                                                hwaddr addr, unsigned size)
1485 {
1486     VFIOQuirk *quirk = opaque;
1487     VFIOPCIDevice *vdev = quirk->vdev;
1488     uint64_t data;
1489
1490     if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
1491         ranges_overlap(addr, size,
1492                        quirk->data.data_offset, quirk->data.data_size)) {
1493         hwaddr offset = addr - quirk->data.data_offset;
1494
1495         if (!vfio_range_contained(addr, size, quirk->data.data_offset,
1496                                   quirk->data.data_size)) {
1497             hw_error("%s: window data read not fully contained: %s",
1498                      __func__, memory_region_name(&quirk->mem));
1499         }
1500
1501         data = vfio_pci_read_config(&vdev->pdev,
1502                                     quirk->data.address_val + offset, size);
1503
1504         trace_vfio_generic_window_quirk_read(memory_region_name(&quirk->mem),
1505                                              vdev->host.domain,
1506                                              vdev->host.bus,
1507                                              vdev->host.slot,
1508                                              vdev->host.function,
1509                                              quirk->data.bar,
1510                                              addr, size, data);
1511     } else {
1512         data = vfio_bar_read(&vdev->bars[quirk->data.bar],
1513                              addr + quirk->data.base_offset, size);
1514     }
1515
1516     return data;
1517 }
1518
1519 static void vfio_generic_window_quirk_write(void *opaque, hwaddr addr,
1520                                             uint64_t data, unsigned size)
1521 {
1522     VFIOQuirk *quirk = opaque;
1523     VFIOPCIDevice *vdev = quirk->vdev;
1524
1525     if (ranges_overlap(addr, size,
1526                        quirk->data.address_offset, quirk->data.address_size)) {
1527
1528         if (addr != quirk->data.address_offset) {
1529             hw_error("%s: offset write into address window: %s",
1530                      __func__, memory_region_name(&quirk->mem));
1531         }
1532
1533         if ((data & ~quirk->data.address_mask) == quirk->data.address_match) {
1534             quirk->data.flags |= quirk->data.write_flags |
1535                                  quirk->data.read_flags;
1536             quirk->data.address_val = data & quirk->data.address_mask;
1537         } else {
1538             quirk->data.flags &= ~(quirk->data.write_flags |
1539                                    quirk->data.read_flags);
1540         }
1541     }
1542
1543     if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
1544         ranges_overlap(addr, size,
1545                        quirk->data.data_offset, quirk->data.data_size)) {
1546         hwaddr offset = addr - quirk->data.data_offset;
1547
1548         if (!vfio_range_contained(addr, size, quirk->data.data_offset,
1549                                   quirk->data.data_size)) {
1550             hw_error("%s: window data write not fully contained: %s",
1551                      __func__, memory_region_name(&quirk->mem));
1552         }
1553
1554         vfio_pci_write_config(&vdev->pdev,
1555                               quirk->data.address_val + offset, data, size);
1556
1557         trace_vfio_generic_window_quirk_write(memory_region_name(&quirk->mem),
1558                                              vdev->host.domain,
1559                                              vdev->host.bus,
1560                                              vdev->host.slot,
1561                                              vdev->host.function,
1562                                              quirk->data.bar,
1563                                              addr, data, size);
1564         return;
1565     }
1566
1567     vfio_bar_write(&vdev->bars[quirk->data.bar],
1568                    addr + quirk->data.base_offset, data, size);
1569 }
1570
1571 static const MemoryRegionOps vfio_generic_window_quirk = {
1572     .read = vfio_generic_window_quirk_read,
1573     .write = vfio_generic_window_quirk_write,
1574     .endianness = DEVICE_LITTLE_ENDIAN,
1575 };
1576
1577 static uint64_t vfio_generic_quirk_read(void *opaque,
1578                                         hwaddr addr, unsigned size)
1579 {
1580     VFIOQuirk *quirk = opaque;
1581     VFIOPCIDevice *vdev = quirk->vdev;
1582     hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
1583     hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
1584     uint64_t data;
1585
1586     if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
1587         ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
1588         if (!vfio_range_contained(addr, size, offset,
1589                                   quirk->data.address_mask + 1)) {
1590             hw_error("%s: read not fully contained: %s",
1591                      __func__, memory_region_name(&quirk->mem));
1592         }
1593
1594         data = vfio_pci_read_config(&vdev->pdev, addr - offset, size);
1595
1596         trace_vfio_generic_quirk_read(memory_region_name(&quirk->mem),
1597                                       vdev->host.domain,
1598                                       vdev->host.bus,
1599                                       vdev->host.slot,
1600                                       vdev->host.function,
1601                                       quirk->data.bar,
1602                                       addr + base, size, data);
1603     } else {
1604         data = vfio_bar_read(&vdev->bars[quirk->data.bar], addr + base, size);
1605     }
1606
1607     return data;
1608 }
1609
1610 static void vfio_generic_quirk_write(void *opaque, hwaddr addr,
1611                                      uint64_t data, unsigned size)
1612 {
1613     VFIOQuirk *quirk = opaque;
1614     VFIOPCIDevice *vdev = quirk->vdev;
1615     hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
1616     hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
1617
1618     if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
1619         ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
1620         if (!vfio_range_contained(addr, size, offset,
1621                                   quirk->data.address_mask + 1)) {
1622             hw_error("%s: write not fully contained: %s",
1623                      __func__, memory_region_name(&quirk->mem));
1624         }
1625
1626         vfio_pci_write_config(&vdev->pdev, addr - offset, data, size);
1627
1628         trace_vfio_generic_quirk_write(memory_region_name(&quirk->mem),
1629                                        vdev->host.domain,
1630                                        vdev->host.bus,
1631                                        vdev->host.slot,
1632                                        vdev->host.function,
1633                                        quirk->data.bar,
1634                                        addr + base, data, size);
1635     } else {
1636         vfio_bar_write(&vdev->bars[quirk->data.bar], addr + base, data, size);
1637     }
1638 }
1639
1640 static const MemoryRegionOps vfio_generic_quirk = {
1641     .read = vfio_generic_quirk_read,
1642     .write = vfio_generic_quirk_write,
1643     .endianness = DEVICE_LITTLE_ENDIAN,
1644 };
1645
/* Vendor ID the ATI quirk probes compare against PCI_VENDOR_ID */
#define PCI_VENDOR_ID_ATI               0x1002
1647
1648 /*
1649  * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
1650  * through VGA register 0x3c3.  On newer cards, the I/O port BAR is always
1651  * BAR4 (older cards like the X550 used BAR1, but we don't care to support
1652  * those).  Note that on bare metal, a read of 0x3c3 doesn't always return the
1653  * I/O port BAR address.  Originally this was coded to return the virtual BAR
1654  * address only if the physical register read returns the actual BAR address,
1655  * but users have reported greater success if we return the virtual address
1656  * unconditionally.
1657  */
1658 static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
1659                                         hwaddr addr, unsigned size)
1660 {
1661     VFIOQuirk *quirk = opaque;
1662     VFIOPCIDevice *vdev = quirk->vdev;
1663     uint64_t data = vfio_pci_read_config(&vdev->pdev,
1664                                          PCI_BASE_ADDRESS_0 + (4 * 4) + 1,
1665                                          size);
1666     trace_vfio_ati_3c3_quirk_read(data);
1667
1668     return data;
1669 }
1670
1671 static const MemoryRegionOps vfio_ati_3c3_quirk = {
1672     .read = vfio_ati_3c3_quirk_read,
1673     .endianness = DEVICE_LITTLE_ENDIAN,
1674 };
1675
1676 static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
1677 {
1678     PCIDevice *pdev = &vdev->pdev;
1679     VFIOQuirk *quirk;
1680
1681     if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1682         return;
1683     }
1684
1685     /*
1686      * As long as the BAR is >= 256 bytes it will be aligned such that the
1687      * lower byte is always zero.  Filter out anything else, if it exists.
1688      */
1689     if (!vdev->bars[4].ioport || vdev->bars[4].size < 256) {
1690         return;
1691     }
1692
1693     quirk = g_malloc0(sizeof(*quirk));
1694     quirk->vdev = vdev;
1695
1696     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, quirk,
1697                           "vfio-ati-3c3-quirk", 1);
1698     memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
1699                                 3 /* offset 3 bytes from 0x3c0 */, &quirk->mem);
1700
1701     QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
1702                       quirk, next);
1703
1704     trace_vfio_vga_probe_ati_3c3_quirk(vdev->host.domain, vdev->host.bus,
1705                                        vdev->host.slot, vdev->host.function);
1706 }
1707
1708 /*
1709  * Newer ATI/AMD devices, including HD5450 and HD7850, have a window to PCI
1710  * config space through MMIO BAR2 at offset 0x4000.  Nothing seems to access
1711  * the MMIO space directly, but a window to this space is provided through
1712  * I/O port BAR4.  Offset 0x0 is the address register and offset 0x4 is the
1713  * data register.  When the address is programmed to a range of 0x4000-0x4fff
1714  * PCI configuration space is available.  Experimentation seems to indicate
1715  * that only read-only access is provided, but we drop writes when the window
1716  * is enabled to config space nonetheless.
1717  */
1718 static void vfio_probe_ati_bar4_window_quirk(VFIOPCIDevice *vdev, int nr)
1719 {
1720     PCIDevice *pdev = &vdev->pdev;
1721     VFIOQuirk *quirk;
1722
1723     if (!vdev->has_vga || nr != 4 ||
1724         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1725         return;
1726     }
1727
1728     quirk = g_malloc0(sizeof(*quirk));
1729     quirk->vdev = vdev;
1730     quirk->data.address_size = 4;
1731     quirk->data.data_offset = 4;
1732     quirk->data.data_size = 4;
1733     quirk->data.address_match = 0x4000;
1734     quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
1735     quirk->data.bar = nr;
1736     quirk->data.read_flags = quirk->data.write_flags = 1;
1737
1738     memory_region_init_io(&quirk->mem, OBJECT(vdev),
1739                           &vfio_generic_window_quirk, quirk,
1740                           "vfio-ati-bar4-window-quirk", 8);
1741     memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
1742                           quirk->data.base_offset, &quirk->mem, 1);
1743
1744     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1745
1746     trace_vfio_probe_ati_bar4_window_quirk(vdev->host.domain,
1747                                            vdev->host.bus,
1748                                            vdev->host.slot,
1749                                            vdev->host.function);
1750 }
1751
/* Vendor ID the RTL8168 quirk probe compares against PCI_VENDOR_ID */
#define PCI_VENDOR_ID_REALTEK 0x10ec
1753
1754 /*
1755  * RTL8168 devices have a backdoor that can access the MSI-X table.  At BAR2
1756  * offset 0x70 there is a dword data register, offset 0x74 is a dword address
1757  * register.  According to the Linux r8169 driver, the MSI-X table is addressed
1758  * when the "type" portion of the address register is set to 0x1.  This appears
1759  * to be bits 16:30.  Bit 31 is both a write indicator and some sort of
1760  * "address latched" indicator.  Bits 12:15 are a mask field, which we can
1761  * ignore because the MSI-X table should always be accessed as a dword (full
1762  * mask).  Bits 0:11 is offset within the type.
1763  *
1764  * Example trace:
1765  *
1766  * Read from MSI-X table offset 0
1767  * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
1768  * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
1769  * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
1770  *
1771  * Write 0xfee00000 to MSI-X table offset 0
1772  * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
1773  * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
1774  * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
1775  */
1776
1777 static uint64_t vfio_rtl8168_window_quirk_read(void *opaque,
1778                                                hwaddr addr, unsigned size)
1779 {
1780     VFIOQuirk *quirk = opaque;
1781     VFIOPCIDevice *vdev = quirk->vdev;
1782
1783     switch (addr) {
1784     case 4: /* address */
1785         if (quirk->data.flags) {
1786             trace_vfio_rtl8168_window_quirk_read_fake(
1787                     memory_region_name(&quirk->mem),
1788                     vdev->host.domain, vdev->host.bus,
1789                     vdev->host.slot, vdev->host.function);
1790
1791             return quirk->data.address_match ^ 0x10000000U;
1792         }
1793         break;
1794     case 0: /* data */
1795         if (quirk->data.flags) {
1796             uint64_t val;
1797
1798             trace_vfio_rtl8168_window_quirk_read_table(
1799                     memory_region_name(&quirk->mem),
1800                     vdev->host.domain, vdev->host.bus,
1801                     vdev->host.slot, vdev->host.function
1802                );
1803
1804             if (!(vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
1805                 return 0;
1806             }
1807
1808             io_mem_read(&vdev->pdev.msix_table_mmio,
1809                         (hwaddr)(quirk->data.address_match & 0xfff),
1810                         &val, size);
1811             return val;
1812         }
1813     }
1814
1815     trace_vfio_rtl8168_window_quirk_read_direct(
1816                         memory_region_name(&quirk->mem),
1817                         vdev->host.domain, vdev->host.bus,
1818                         vdev->host.slot, vdev->host.function);
1819
1820     return vfio_bar_read(&vdev->bars[quirk->data.bar], addr + 0x70, size);
1821 }
1822
1823 static void vfio_rtl8168_window_quirk_write(void *opaque, hwaddr addr,
1824                                             uint64_t data, unsigned size)
1825 {
1826     VFIOQuirk *quirk = opaque;
1827     VFIOPCIDevice *vdev = quirk->vdev;
1828
1829     switch (addr) {
1830     case 4: /* address */
1831         if ((data & 0x7fff0000) == 0x10000) {
1832             if (data & 0x10000000U &&
1833                 vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
1834
1835                 trace_vfio_rtl8168_window_quirk_write_table(
1836                         memory_region_name(&quirk->mem),
1837                         vdev->host.domain, vdev->host.bus,
1838                         vdev->host.slot, vdev->host.function);
1839
1840                 io_mem_write(&vdev->pdev.msix_table_mmio,
1841                              (hwaddr)(quirk->data.address_match & 0xfff),
1842                              data, size);
1843             }
1844
1845             quirk->data.flags = 1;
1846             quirk->data.address_match = data;
1847
1848             return;
1849         }
1850         quirk->data.flags = 0;
1851         break;
1852     case 0: /* data */
1853         quirk->data.address_mask = data;
1854         break;
1855     }
1856
1857     trace_vfio_rtl8168_window_quirk_write_direct(
1858             memory_region_name(&quirk->mem),
1859             vdev->host.domain, vdev->host.bus,
1860             vdev->host.slot, vdev->host.function);
1861
1862     vfio_bar_write(&vdev->bars[quirk->data.bar], addr + 0x70, data, size);
1863 }
1864
1865 static const MemoryRegionOps vfio_rtl8168_window_quirk = {
1866     .read = vfio_rtl8168_window_quirk_read,
1867     .write = vfio_rtl8168_window_quirk_write,
1868     .valid = {
1869         .min_access_size = 4,
1870         .max_access_size = 4,
1871         .unaligned = false,
1872     },
1873     .endianness = DEVICE_LITTLE_ENDIAN,
1874 };
1875
1876 static void vfio_probe_rtl8168_bar2_window_quirk(VFIOPCIDevice *vdev, int nr)
1877 {
1878     PCIDevice *pdev = &vdev->pdev;
1879     VFIOQuirk *quirk;
1880
1881     if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_REALTEK ||
1882         pci_get_word(pdev->config + PCI_DEVICE_ID) != 0x8168 || nr != 2) {
1883         return;
1884     }
1885
1886     quirk = g_malloc0(sizeof(*quirk));
1887     quirk->vdev = vdev;
1888     quirk->data.bar = nr;
1889
1890     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_rtl8168_window_quirk,
1891                           quirk, "vfio-rtl8168-window-quirk", 8);
1892     memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
1893                                         0x70, &quirk->mem, 1);
1894
1895     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1896
1897     trace_vfio_probe_rtl8168_bar2_window_quirk(vdev->host.domain,
1898                                                vdev->host.bus,
1899                                                vdev->host.slot,
1900                                                vdev->host.function);
1901 }
1902 /*
1903  * Trap the BAR2 MMIO window to config space as well.
1904  */
1905 static void vfio_probe_ati_bar2_4000_quirk(VFIOPCIDevice *vdev, int nr)
1906 {
1907     PCIDevice *pdev = &vdev->pdev;
1908     VFIOQuirk *quirk;
1909
1910     /* Only enable on newer devices where BAR2 is 64bit */
1911     if (!vdev->has_vga || nr != 2 || !vdev->bars[2].mem64 ||
1912         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1913         return;
1914     }
1915
1916     quirk = g_malloc0(sizeof(*quirk));
1917     quirk->vdev = vdev;
1918     quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
1919     quirk->data.address_match = 0x4000;
1920     quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
1921     quirk->data.bar = nr;
1922
1923     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
1924                           "vfio-ati-bar2-4000-quirk",
1925                           TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
1926     memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
1927                           quirk->data.address_match & TARGET_PAGE_MASK,
1928                           &quirk->mem, 1);
1929
1930     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1931
1932     trace_vfio_probe_ati_bar2_4000_quirk(vdev->host.domain,
1933                                          vdev->host.bus,
1934                                          vdev->host.slot,
1935                                          vdev->host.function);
1936 }
1937
1938 /*
1939  * Older ATI/AMD cards like the X550 have a similar window to that above.
1940  * I/O port BAR1 provides a window to a mirror of PCI config space located
1941  * in BAR2 at offset 0xf00.  We don't care to support such older cards, but
1942  * note it for future reference.
1943  */
1944
/* Vendor ID the Nvidia quirk probes compare against PCI_VENDOR_ID */
#define PCI_VENDOR_ID_NVIDIA                    0x10de
1946
1947 /*
1948  * Nvidia has several different methods to get to config space, the
 * nouveau project has several of these documented here:
1950  * https://github.com/pathscale/envytools/tree/master/hwdocs
1951  *
1952  * The first quirk is actually not documented in envytools and is found
1953  * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]).  This is an
1954  * NV46 chipset.  The backdoor uses the legacy VGA I/O ports to access
1955  * the mirror of PCI config space found at BAR0 offset 0x1800.  The access
1956  * sequence first writes 0x338 to I/O port 0x3d4.  The target offset is
1957  * then written to 0x3d0.  Finally 0x538 is written for a read and 0x738
1958  * is written for a write to 0x3d4.  The BAR0 offset is then accessible
1959  * through 0x3d0.  This quirk doesn't seem to be necessary on newer cards
1960  * that use the I/O port BAR5 window but it doesn't hurt to leave it.
1961  */
/* States of the 0x3d0/0x3d4 backdoor sequence, kept in quirk->data.flags */
enum {
    NV_3D0_NONE   = 0, /* idle, no sequence in progress */
    NV_3D0_SELECT = 1, /* 0x338 was written to the address port */
    NV_3D0_WINDOW = 2, /* a matching target offset was written */
    NV_3D0_READ   = 3, /* 0x538 was written: next data read is windowed */
    NV_3D0_WRITE  = 4, /* 0x738 was written: next data write is windowed */
};
1969
1970 static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
1971                                            hwaddr addr, unsigned size)
1972 {
1973     VFIOQuirk *quirk = opaque;
1974     VFIOPCIDevice *vdev = quirk->vdev;
1975     PCIDevice *pdev = &vdev->pdev;
1976     uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
1977                                   addr + quirk->data.base_offset, size);
1978
1979     if (quirk->data.flags == NV_3D0_READ && addr == quirk->data.data_offset) {
1980         data = vfio_pci_read_config(pdev, quirk->data.address_val, size);
1981         trace_vfio_nvidia_3d0_quirk_read(size, data);
1982     }
1983
1984     quirk->data.flags = NV_3D0_NONE;
1985
1986     return data;
1987 }
1988
1989 static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
1990                                         uint64_t data, unsigned size)
1991 {
1992     VFIOQuirk *quirk = opaque;
1993     VFIOPCIDevice *vdev = quirk->vdev;
1994     PCIDevice *pdev = &vdev->pdev;
1995
1996     switch (quirk->data.flags) {
1997     case NV_3D0_NONE:
1998         if (addr == quirk->data.address_offset && data == 0x338) {
1999             quirk->data.flags = NV_3D0_SELECT;
2000         }
2001         break;
2002     case NV_3D0_SELECT:
2003         quirk->data.flags = NV_3D0_NONE;
2004         if (addr == quirk->data.data_offset &&
2005             (data & ~quirk->data.address_mask) == quirk->data.address_match) {
2006             quirk->data.flags = NV_3D0_WINDOW;
2007             quirk->data.address_val = data & quirk->data.address_mask;
2008         }
2009         break;
2010     case NV_3D0_WINDOW:
2011         quirk->data.flags = NV_3D0_NONE;
2012         if (addr == quirk->data.address_offset) {
2013             if (data == 0x538) {
2014                 quirk->data.flags = NV_3D0_READ;
2015             } else if (data == 0x738) {
2016                 quirk->data.flags = NV_3D0_WRITE;
2017             }
2018         }
2019         break;
2020     case NV_3D0_WRITE:
2021         quirk->data.flags = NV_3D0_NONE;
2022         if (addr == quirk->data.data_offset) {
2023             vfio_pci_write_config(pdev, quirk->data.address_val, data, size);
2024             trace_vfio_nvidia_3d0_quirk_write(data, size);
2025             return;
2026         }
2027         break;
2028     }
2029
2030     vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
2031                    addr + quirk->data.base_offset, data, size);
2032 }
2033
2034 static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
2035     .read = vfio_nvidia_3d0_quirk_read,
2036     .write = vfio_nvidia_3d0_quirk_write,
2037     .endianness = DEVICE_LITTLE_ENDIAN,
2038 };
2039
2040 static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
2041 {
2042     PCIDevice *pdev = &vdev->pdev;
2043     VFIOQuirk *quirk;
2044
2045     if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA ||
2046         !vdev->bars[1].size) {
2047         return;
2048     }
2049
2050     quirk = g_malloc0(sizeof(*quirk));
2051     quirk->vdev = vdev;
2052     quirk->data.base_offset = 0x10;
2053     quirk->data.address_offset = 4;
2054     quirk->data.address_size = 2;
2055     quirk->data.address_match = 0x1800;
2056     quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
2057     quirk->data.data_offset = 0;
2058     quirk->data.data_size = 4;
2059
2060     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_3d0_quirk,
2061                           quirk, "vfio-nvidia-3d0-quirk", 6);
2062     memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
2063                                 quirk->data.base_offset, &quirk->mem);
2064
2065     QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
2066                       quirk, next);
2067
2068     trace_vfio_vga_probe_nvidia_3d0_quirk(vdev->host.domain,
2069                                           vdev->host.bus,
2070                                           vdev->host.slot,
2071                                           vdev->host.function);
2072 }
2073
2074 /*
2075  * The second quirk is documented in envytools.  The I/O port BAR5 is just
2076  * a set of address/data ports to the MMIO BARs.  The BAR we care about is
2077  * again BAR0.  This backdoor is apparently a bit newer than the one above
2078  * so we need to not only trap 256 bytes @0x1800, but all of PCI config
2079  * space, including extended space is available at the 4k @0x88000.
2080  */
/* Flag bits tracked in quirk->data.flags for the BAR5 window */
enum {
    NV_BAR5_ADDRESS = 1 << 0, /* address register points at config space */
    NV_BAR5_ENABLE  = 1 << 1, /* bit 0 of the register at offset 0x4 */
    NV_BAR5_MASTER  = 1 << 2, /* bit 0 of the register at offset 0x0 */
    /* All three must be set for the window to be redirected */
    NV_BAR5_VALID   = NV_BAR5_ADDRESS | NV_BAR5_ENABLE | NV_BAR5_MASTER,
};
2087
2088 static void vfio_nvidia_bar5_window_quirk_write(void *opaque, hwaddr addr,
2089                                                 uint64_t data, unsigned size)
2090 {
2091     VFIOQuirk *quirk = opaque;
2092
2093     switch (addr) {
2094     case 0x0:
2095         if (data & 0x1) {
2096             quirk->data.flags |= NV_BAR5_MASTER;
2097         } else {
2098             quirk->data.flags &= ~NV_BAR5_MASTER;
2099         }
2100         break;
2101     case 0x4:
2102         if (data & 0x1) {
2103             quirk->data.flags |= NV_BAR5_ENABLE;
2104         } else {
2105             quirk->data.flags &= ~NV_BAR5_ENABLE;
2106         }
2107         break;
2108     case 0x8:
2109         if (quirk->data.flags & NV_BAR5_MASTER) {
2110             if ((data & ~0xfff) == 0x88000) {
2111                 quirk->data.flags |= NV_BAR5_ADDRESS;
2112                 quirk->data.address_val = data & 0xfff;
2113             } else if ((data & ~0xff) == 0x1800) {
2114                 quirk->data.flags |= NV_BAR5_ADDRESS;
2115                 quirk->data.address_val = data & 0xff;
2116             } else {
2117                 quirk->data.flags &= ~NV_BAR5_ADDRESS;
2118             }
2119         }
2120         break;
2121     }
2122
2123     vfio_generic_window_quirk_write(opaque, addr, data, size);
2124 }
2125
2126 static const MemoryRegionOps vfio_nvidia_bar5_window_quirk = {
2127     .read = vfio_generic_window_quirk_read,
2128     .write = vfio_nvidia_bar5_window_quirk_write,
2129     .valid.min_access_size = 4,
2130     .endianness = DEVICE_LITTLE_ENDIAN,
2131 };
2132
2133 static void vfio_probe_nvidia_bar5_window_quirk(VFIOPCIDevice *vdev, int nr)
2134 {
2135     PCIDevice *pdev = &vdev->pdev;
2136     VFIOQuirk *quirk;
2137
2138     if (!vdev->has_vga || nr != 5 ||
2139         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
2140         return;
2141     }
2142
2143     quirk = g_malloc0(sizeof(*quirk));
2144     quirk->vdev = vdev;
2145     quirk->data.read_flags = quirk->data.write_flags = NV_BAR5_VALID;
2146     quirk->data.address_offset = 0x8;
2147     quirk->data.address_size = 0; /* actually 4, but avoids generic code */
2148     quirk->data.data_offset = 0xc;
2149     quirk->data.data_size = 4;
2150     quirk->data.bar = nr;
2151
2152     memory_region_init_io(&quirk->mem, OBJECT(vdev),
2153                           &vfio_nvidia_bar5_window_quirk, quirk,
2154                           "vfio-nvidia-bar5-window-quirk", 16);
2155     memory_region_add_subregion_overlap(&vdev->bars[nr].mem, 0, &quirk->mem, 1);
2156
2157     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
2158
2159     trace_vfio_probe_nvidia_bar5_window_quirk(vdev->host.domain,
2160                                               vdev->host.bus,
2161                                               vdev->host.slot,
2162                                               vdev->host.function);
2163 }
2164
2165 static void vfio_nvidia_88000_quirk_write(void *opaque, hwaddr addr,
2166                                           uint64_t data, unsigned size)
2167 {
2168     VFIOQuirk *quirk = opaque;
2169     VFIOPCIDevice *vdev = quirk->vdev;
2170     PCIDevice *pdev = &vdev->pdev;
2171     hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
2172
2173     vfio_generic_quirk_write(opaque, addr, data, size);
2174
2175     /*
2176      * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
2177      * MSI capability ID register.  Both the ID and next register are
2178      * read-only, so we allow writes covering either of those to real hw.
2179      * NB - only fixed for the 0x88000 MMIO window.
2180      */
2181     if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
2182         vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
2183         vfio_bar_write(&vdev->bars[quirk->data.bar], addr + base, data, size);
2184     }
2185 }
2186
2187 static const MemoryRegionOps vfio_nvidia_88000_quirk = {
2188     .read = vfio_generic_quirk_read,
2189     .write = vfio_nvidia_88000_quirk_write,
2190     .endianness = DEVICE_LITTLE_ENDIAN,
2191 };
2192
2193 /*
2194  * Finally, BAR0 itself.  We want to redirect any accesses to either
2195  * 0x1800 or 0x88000 through the PCI config space access functions.
2196  *
2197  * NB - quirk at a page granularity or else they don't seem to work when
2198  *      BARs are mmap'd
2199  *
2200  * Here's offset 0x88000...
2201  */
2202 static void vfio_probe_nvidia_bar0_88000_quirk(VFIOPCIDevice *vdev, int nr)
2203 {
2204     PCIDevice *pdev = &vdev->pdev;
2205     VFIOQuirk *quirk;
2206     uint16_t vendor, class;
2207
2208     vendor = pci_get_word(pdev->config + PCI_VENDOR_ID);
2209     class = pci_get_word(pdev->config + PCI_CLASS_DEVICE);
2210
2211     if (nr != 0 || vendor != PCI_VENDOR_ID_NVIDIA ||
2212         class != PCI_CLASS_DISPLAY_VGA) {
2213         return;
2214     }
2215
2216     quirk = g_malloc0(sizeof(*quirk));
2217     quirk->vdev = vdev;
2218     quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
2219     quirk->data.address_match = 0x88000;
2220     quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
2221     quirk->data.bar = nr;
2222
2223     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_88000_quirk,
2224                           quirk, "vfio-nvidia-bar0-88000-quirk",
2225                           TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
2226     memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
2227                           quirk->data.address_match & TARGET_PAGE_MASK,
2228                           &quirk->mem, 1);
2229
2230     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
2231
2232     trace_vfio_probe_nvidia_bar0_88000_quirk(vdev->host.domain,
2233                                              vdev->host.bus,
2234                                              vdev->host.slot,
2235                                              vdev->host.function);
2236 }
2237
2238 /*
2239  * And here's the same for BAR0 offset 0x1800...
2240  */
2241 static void vfio_probe_nvidia_bar0_1800_quirk(VFIOPCIDevice *vdev, int nr)
2242 {
2243     PCIDevice *pdev = &vdev->pdev;
2244     VFIOQuirk *quirk;
2245
2246     if (!vdev->has_vga || nr != 0 ||
2247         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
2248         return;
2249     }
2250
2251     /* Log the chipset ID */
2252     trace_vfio_probe_nvidia_bar0_1800_quirk_id(
2253             (unsigned int)(vfio_bar_read(&vdev->bars[0], 0, 4) >> 20) & 0xff);
2254
2255     quirk = g_malloc0(sizeof(*quirk));
2256     quirk->vdev = vdev;
2257     quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
2258     quirk->data.address_match = 0x1800;
2259     quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
2260     quirk->data.bar = nr;
2261
2262     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
2263                           "vfio-nvidia-bar0-1800-quirk",
2264                           TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
2265     memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
2266                           quirk->data.address_match & TARGET_PAGE_MASK,
2267                           &quirk->mem, 1);
2268
2269     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
2270
2271     trace_vfio_probe_nvidia_bar0_1800_quirk(vdev->host.domain,
2272                                             vdev->host.bus,
2273                                             vdev->host.slot,
2274                                             vdev->host.function);
2275 }
2276
2277 /*
2278  * TODO - Some Nvidia devices provide config access to their companion HDA
2279  * device and even to their parent bridge via these config space mirrors.
2280  * Add quirks for those regions.
2281  */
2282
2283 /*
2284  * Common quirk probe entry points.
2285  */
/*
 * Run the VGA region quirk probes for @vdev: the ATI 0x3c3 probe first,
 * then the Nvidia 0x3d0 probe.
 */
static void vfio_vga_quirk_setup(VFIOPCIDevice *vdev)
{
    vfio_vga_probe_ati_3c3_quirk(vdev);
    vfio_vga_probe_nvidia_3d0_quirk(vdev);
}
2291
2292 static void vfio_vga_quirk_teardown(VFIOPCIDevice *vdev)
2293 {
2294     int i;
2295
2296     for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) {
2297         while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) {
2298             VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks);
2299             memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem);
2300             object_unparent(OBJECT(&quirk->mem));
2301             QLIST_REMOVE(quirk, next);
2302             g_free(quirk);
2303         }
2304     }
2305 }
2306
2307 static void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
2308 {
2309     vfio_probe_ati_bar4_window_quirk(vdev, nr);
2310     vfio_probe_ati_bar2_4000_quirk(vdev, nr);
2311     vfio_probe_nvidia_bar5_window_quirk(vdev, nr);
2312     vfio_probe_nvidia_bar0_88000_quirk(vdev, nr);
2313     vfio_probe_nvidia_bar0_1800_quirk(vdev, nr);
2314     vfio_probe_rtl8168_bar2_window_quirk(vdev, nr);
2315 }
2316
2317 static void vfio_bar_quirk_teardown(VFIOPCIDevice *vdev, int nr)
2318 {
2319     VFIOBAR *bar = &vdev->bars[nr];
2320
2321     while (!QLIST_EMPTY(&bar->quirks)) {
2322         VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
2323         memory_region_del_subregion(&bar->mem, &quirk->mem);
2324         object_unparent(OBJECT(&quirk->mem));
2325         QLIST_REMOVE(quirk, next);
2326         g_free(quirk);
2327     }
2328 }
2329
2330 /*
2331  * PCI config space
2332  */
2333 static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
2334 {
2335     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
2336     uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
2337
2338     memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
2339     emu_bits = le32_to_cpu(emu_bits);
2340
2341     if (emu_bits) {
2342         emu_val = pci_default_read_config(pdev, addr, len);
2343     }
2344
2345     if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
2346         ssize_t ret;
2347
2348         ret = pread(vdev->fd, &phys_val, len, vdev->config_offset + addr);
2349         if (ret != len) {
2350             error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m",
2351                          __func__, vdev->host.domain, vdev->host.bus,
2352                          vdev->host.slot, vdev->host.function, addr, len);
2353             return -errno;
2354         }
2355         phys_val = le32_to_cpu(phys_val);
2356     }
2357
2358     val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
2359
2360     trace_vfio_pci_read_config(vdev->host.domain, vdev->host.bus,
2361                                vdev->host.slot, vdev->host.function,
2362                                addr, len, val);
2363
2364     return val;
2365 }
2366
2367 static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
2368                                   uint32_t val, int len)
2369 {
2370     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
2371     uint32_t val_le = cpu_to_le32(val);
2372
2373     trace_vfio_pci_write_config(vdev->host.domain, vdev->host.bus,
2374                                 vdev->host.slot, vdev->host.function,
2375                                 addr, val, len);
2376
2377     /* Write everything to VFIO, let it filter out what we can't write */
2378     if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len) {
2379         error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m",
2380                      __func__, vdev->host.domain, vdev->host.bus,
2381                      vdev->host.slot, vdev->host.function, addr, val, len);
2382     }
2383
2384     /* MSI/MSI-X Enabling/Disabling */
2385     if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
2386         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
2387         int is_enabled, was_enabled = msi_enabled(pdev);
2388
2389         pci_default_write_config(pdev, addr, val, len);
2390
2391         is_enabled = msi_enabled(pdev);
2392
2393         if (!was_enabled) {
2394             if (is_enabled) {
2395                 vfio_enable_msi(vdev);
2396             }
2397         } else {
2398             if (!is_enabled) {
2399                 vfio_disable_msi(vdev);
2400             } else {
2401                 vfio_update_msi(vdev);
2402             }
2403         }
2404     } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
2405         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
2406         int is_enabled, was_enabled = msix_enabled(pdev);
2407
2408         pci_default_write_config(pdev, addr, val, len);
2409
2410         is_enabled = msix_enabled(pdev);
2411
2412         if (!was_enabled && is_enabled) {
2413             vfio_enable_msix(vdev);
2414         } else if (was_enabled && !is_enabled) {
2415             vfio_disable_msix(vdev);
2416         }
2417     } else {
2418         /* Write everything to QEMU to keep emulated bits correct */
2419         pci_default_write_config(pdev, addr, val, len);
2420     }
2421 }
2422
2423 /*
2424  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
2425  */
2426 static int vfio_dma_unmap(VFIOContainer *container,
2427                           hwaddr iova, ram_addr_t size)
2428 {
2429     struct vfio_iommu_type1_dma_unmap unmap = {
2430         .argsz = sizeof(unmap),
2431         .flags = 0,
2432         .iova = iova,
2433         .size = size,
2434     };
2435
2436     if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
2437         error_report("VFIO_UNMAP_DMA: %d\n", -errno);
2438         return -errno;
2439     }
2440
2441     return 0;
2442 }
2443
2444 static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
2445                         ram_addr_t size, void *vaddr, bool readonly)
2446 {
2447     struct vfio_iommu_type1_dma_map map = {
2448         .argsz = sizeof(map),
2449         .flags = VFIO_DMA_MAP_FLAG_READ,
2450         .vaddr = (__u64)(uintptr_t)vaddr,
2451         .iova = iova,
2452         .size = size,
2453     };
2454
2455     if (!readonly) {
2456         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
2457     }
2458
2459     /*
2460      * Try the mapping, if it fails with EBUSY, unmap the region and try
2461      * again.  This shouldn't be necessary, but we sometimes see it in
2462      * the the VGA ROM space.
2463      */
2464     if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
2465         (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
2466          ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
2467         return 0;
2468     }
2469
2470     error_report("VFIO_MAP_DMA: %d\n", -errno);
2471     return -errno;
2472 }
2473
2474 static bool vfio_listener_skipped_section(MemoryRegionSection *section)
2475 {
2476     return (!memory_region_is_ram(section->mr) &&
2477             !memory_region_is_iommu(section->mr)) ||
2478            /*
2479             * Sizing an enabled 64-bit BAR can cause spurious mappings to
2480             * addresses in the upper part of the 64-bit address space.  These
2481             * are never accessed by the CPU and beyond the address width of
2482             * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
2483             */
2484            section->offset_within_address_space & (1ULL << 63);
2485 }
2486
2487 static void vfio_iommu_map_notify(Notifier *n, void *data)
2488 {
2489     VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
2490     VFIOContainer *container = giommu->container;
2491     IOMMUTLBEntry *iotlb = data;
2492     MemoryRegion *mr;
2493     hwaddr xlat;
2494     hwaddr len = iotlb->addr_mask + 1;
2495     void *vaddr;
2496     int ret;
2497
2498     trace_vfio_iommu_map_notify(iotlb->iova,
2499                                 iotlb->iova + iotlb->addr_mask);
2500
2501     /*
2502      * The IOMMU TLB entry we have just covers translation through
2503      * this IOMMU to its immediate target.  We need to translate
2504      * it the rest of the way through to memory.
2505      */
2506     mr = address_space_translate(&address_space_memory,
2507                                  iotlb->translated_addr,
2508                                  &xlat, &len, iotlb->perm & IOMMU_WO);
2509     if (!memory_region_is_ram(mr)) {
2510         error_report("iommu map to non memory area %"HWADDR_PRIx"\n",
2511                 xlat);
2512         return;
2513     }
2514     /*
2515      * Translation truncates length to the IOMMU page size,
2516      * check that it did not truncate too much.
2517      */
2518     if (len & iotlb->addr_mask) {
2519         error_report("iommu has granularity incompatible with target AS\n");
2520         return;
2521     }
2522
2523     if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
2524         vaddr = memory_region_get_ram_ptr(mr) + xlat;
2525
2526         ret = vfio_dma_map(container, iotlb->iova,
2527                            iotlb->addr_mask + 1, vaddr,
2528                            !(iotlb->perm & IOMMU_WO) || mr->readonly);
2529         if (ret) {
2530             error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
2531                          "0x%"HWADDR_PRIx", %p) = %d (%m)",
2532                          container, iotlb->iova,
2533                          iotlb->addr_mask + 1, vaddr, ret);
2534         }
2535     } else {
2536         ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1);
2537         if (ret) {
2538             error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
2539                          "0x%"HWADDR_PRIx") = %d (%m)",
2540                          container, iotlb->iova,
2541                          iotlb->addr_mask + 1, ret);
2542         }
2543     }
2544 }
2545
2546 static void vfio_listener_region_add(MemoryListener *listener,
2547                                      MemoryRegionSection *section)
2548 {
2549     VFIOContainer *container = container_of(listener, VFIOContainer,
2550                                             iommu_data.type1.listener);
2551     hwaddr iova, end;
2552     Int128 llend;
2553     void *vaddr;
2554     int ret;
2555
2556     if (vfio_listener_skipped_section(section)) {
2557         trace_vfio_listener_region_add_skip(
2558                 section->offset_within_address_space,
2559                 section->offset_within_address_space +
2560                 int128_get64(int128_sub(section->size, int128_one())));
2561         return;
2562     }
2563
2564     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
2565                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
2566         error_report("%s received unaligned region", __func__);
2567         return;
2568     }
2569
2570     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
2571     llend = int128_make64(section->offset_within_address_space);
2572     llend = int128_add(llend, section->size);
2573     llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
2574
2575     if (int128_ge(int128_make64(iova), llend)) {
2576         return;
2577     }
2578
2579     memory_region_ref(section->mr);
2580
2581     if (memory_region_is_iommu(section->mr)) {
2582         VFIOGuestIOMMU *giommu;
2583
2584         trace_vfio_listener_region_add_iommu(iova,
2585                     int128_get64(int128_sub(llend, int128_one())));
2586         /*
2587          * FIXME: We should do some checking to see if the
2588          * capabilities of the host VFIO IOMMU are adequate to model
2589          * the guest IOMMU
2590          *
2591          * FIXME: For VFIO iommu types which have KVM acceleration to
2592          * avoid bouncing all map/unmaps through qemu this way, this
2593          * would be the right place to wire that up (tell the KVM
2594          * device emulation the VFIO iommu handles to use).
2595          */
2596         /*
2597          * This assumes that the guest IOMMU is empty of
2598          * mappings at this point.
2599          *
2600          * One way of doing this is:
2601          * 1. Avoid sharing IOMMUs between emulated devices or different
2602          * IOMMU groups.
2603          * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if
2604          * there are some mappings in IOMMU.
2605          *
2606          * VFIO on SPAPR does that. Other IOMMU models may do that different,
2607          * they must make sure there are no existing mappings or
2608          * loop through existing mappings to map them into VFIO.
2609          */
2610         giommu = g_malloc0(sizeof(*giommu));
2611         giommu->iommu = section->mr;
2612         giommu->container = container;
2613         giommu->n.notify = vfio_iommu_map_notify;
2614         QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
2615         memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
2616
2617         return;
2618     }
2619
2620     /* Here we assume that memory_region_is_ram(section->mr)==true */
2621
2622     end = int128_get64(llend);
2623     vaddr = memory_region_get_ram_ptr(section->mr) +
2624             section->offset_within_region +
2625             (iova - section->offset_within_address_space);
2626
2627     trace_vfio_listener_region_add_ram(iova, end - 1, vaddr);
2628
2629     ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
2630     if (ret) {
2631         error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
2632                      "0x%"HWADDR_PRIx", %p) = %d (%m)",
2633                      container, iova, end - iova, vaddr, ret);
2634
2635         /*
2636          * On the initfn path, store the first error in the container so we
2637          * can gracefully fail.  Runtime, there's not much we can do other
2638          * than throw a hardware error.
2639          */
2640         if (!container->iommu_data.type1.initialized) {
2641             if (!container->iommu_data.type1.error) {
2642                 container->iommu_data.type1.error = ret;
2643             }
2644         } else {
2645             hw_error("vfio: DMA mapping failed, unable to continue");
2646         }
2647     }
2648 }
2649
2650 static void vfio_listener_region_del(MemoryListener *listener,
2651                                      MemoryRegionSection *section)
2652 {
2653     VFIOContainer *container = container_of(listener, VFIOContainer,
2654                                             iommu_data.type1.listener);
2655     hwaddr iova, end;
2656     int ret;
2657
2658     if (vfio_listener_skipped_section(section)) {
2659         trace_vfio_listener_region_del_skip(
2660                 section->offset_within_address_space,
2661                 section->offset_within_address_space +
2662                 int128_get64(int128_sub(section->size, int128_one())));
2663         return;
2664     }
2665
2666     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
2667                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
2668         error_report("%s received unaligned region", __func__);
2669         return;
2670     }
2671
2672     if (memory_region_is_iommu(section->mr)) {
2673         VFIOGuestIOMMU *giommu;
2674
2675         QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
2676             if (giommu->iommu == section->mr) {
2677                 memory_region_unregister_iommu_notifier(&giommu->n);
2678                 QLIST_REMOVE(giommu, giommu_next);
2679                 g_free(giommu);
2680                 break;
2681             }
2682         }
2683
2684         /*
2685          * FIXME: We assume the one big unmap below is adequate to
2686          * remove any individual page mappings in the IOMMU which
2687          * might have been copied into VFIO. This works for a page table
2688          * based IOMMU where a big unmap flattens a large range of IO-PTEs.
2689          * That may not be true for all IOMMU types.
2690          */
2691     }
2692
2693     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
2694     end = (section->offset_within_address_space + int128_get64(section->size)) &
2695           TARGET_PAGE_MASK;
2696
2697     if (iova >= end) {
2698         return;
2699     }
2700
2701     trace_vfio_listener_region_del(iova, end - 1);
2702
2703     ret = vfio_dma_unmap(container, iova, end - iova);
2704     memory_region_unref(section->mr);
2705     if (ret) {
2706         error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
2707                      "0x%"HWADDR_PRIx") = %d (%m)",
2708                      container, iova, end - iova, ret);
2709     }
2710 }
2711
2712 static MemoryListener vfio_memory_listener = {
2713     .region_add = vfio_listener_region_add,
2714     .region_del = vfio_listener_region_del,
2715 };
2716
2717 static void vfio_listener_release(VFIOContainer *container)
2718 {
2719     memory_listener_unregister(&container->iommu_data.type1.listener);
2720 }
2721
2722 /*
2723  * Interrupt setup
2724  */
2725 static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
2726 {
2727     switch (vdev->interrupt) {
2728     case VFIO_INT_INTx:
2729         vfio_disable_intx(vdev);
2730         break;
2731     case VFIO_INT_MSI:
2732         vfio_disable_msi(vdev);
2733         break;
2734     case VFIO_INT_MSIX:
2735         vfio_disable_msix(vdev);
2736         break;
2737     }
2738 }
2739
2740 static int vfio_setup_msi(VFIOPCIDevice *vdev, int pos)
2741 {
2742     uint16_t ctrl;
2743     bool msi_64bit, msi_maskbit;
2744     int ret, entries;
2745
2746     if (pread(vdev->fd, &ctrl, sizeof(ctrl),
2747               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
2748         return -errno;
2749     }
2750     ctrl = le16_to_cpu(ctrl);
2751
2752     msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
2753     msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
2754     entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
2755
2756     trace_vfio_setup_msi(vdev->host.domain, vdev->host.bus,
2757                          vdev->host.slot, vdev->host.function, pos);
2758
2759     ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
2760     if (ret < 0) {
2761         if (ret == -ENOTSUP) {
2762             return 0;
2763         }
2764         error_report("vfio: msi_init failed");
2765         return ret;
2766     }
2767     vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
2768
2769     return 0;
2770 }
2771
2772 /*
2773  * We don't have any control over how pci_add_capability() inserts
2774  * capabilities into the chain.  In order to setup MSI-X we need a
2775  * MemoryRegion for the BAR.  In order to setup the BAR and not
2776  * attempt to mmap the MSI-X table area, which VFIO won't allow, we
2777  * need to first look for where the MSI-X table lives.  So we
2778  * unfortunately split MSI-X setup across two functions.
2779  */
2780 static int vfio_early_setup_msix(VFIOPCIDevice *vdev)
2781 {
2782     uint8_t pos;
2783     uint16_t ctrl;
2784     uint32_t table, pba;
2785
2786     pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
2787     if (!pos) {
2788         return 0;
2789     }
2790
2791     if (pread(vdev->fd, &ctrl, sizeof(ctrl),
2792               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
2793         return -errno;
2794     }
2795
2796     if (pread(vdev->fd, &table, sizeof(table),
2797               vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
2798         return -errno;
2799     }
2800
2801     if (pread(vdev->fd, &pba, sizeof(pba),
2802               vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
2803         return -errno;
2804     }
2805
2806     ctrl = le16_to_cpu(ctrl);
2807     table = le32_to_cpu(table);
2808     pba = le32_to_cpu(pba);
2809
2810     vdev->msix = g_malloc0(sizeof(*(vdev->msix)));
2811     vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
2812     vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
2813     vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
2814     vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
2815     vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
2816
2817     trace_vfio_early_setup_msix(vdev->host.domain, vdev->host.bus,
2818                                 vdev->host.slot, vdev->host.function,
2819                                 pos, vdev->msix->table_bar,
2820                                 vdev->msix->table_offset,
2821                                 vdev->msix->entries);
2822
2823     return 0;
2824 }
2825
2826 static int vfio_setup_msix(VFIOPCIDevice *vdev, int pos)
2827 {
2828     int ret;
2829
2830     ret = msix_init(&vdev->pdev, vdev->msix->entries,
2831                     &vdev->bars[vdev->msix->table_bar].mem,
2832                     vdev->msix->table_bar, vdev->msix->table_offset,
2833                     &vdev->bars[vdev->msix->pba_bar].mem,
2834                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
2835     if (ret < 0) {
2836         if (ret == -ENOTSUP) {
2837             return 0;
2838         }
2839         error_report("vfio: msix_init failed");
2840         return ret;
2841     }
2842
2843     return 0;
2844 }
2845
2846 static void vfio_teardown_msi(VFIOPCIDevice *vdev)
2847 {
2848     msi_uninit(&vdev->pdev);
2849
2850     if (vdev->msix) {
2851         msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
2852                     &vdev->bars[vdev->msix->pba_bar].mem);
2853     }
2854 }
2855
2856 /*
2857  * Resource setup
2858  */
2859 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
2860 {
2861     int i;
2862
2863     for (i = 0; i < PCI_ROM_SLOT; i++) {
2864         VFIOBAR *bar = &vdev->bars[i];
2865
2866         if (!bar->size) {
2867             continue;
2868         }
2869
2870         memory_region_set_enabled(&bar->mmap_mem, enabled);
2871         if (vdev->msix && vdev->msix->table_bar == i) {
2872             memory_region_set_enabled(&vdev->msix->mmap_mem, enabled);
2873         }
2874     }
2875 }
2876
2877 static void vfio_unmap_bar(VFIOPCIDevice *vdev, int nr)
2878 {
2879     VFIOBAR *bar = &vdev->bars[nr];
2880
2881     if (!bar->size) {
2882         return;
2883     }
2884
2885     vfio_bar_quirk_teardown(vdev, nr);
2886
2887     memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
2888     munmap(bar->mmap, memory_region_size(&bar->mmap_mem));
2889
2890     if (vdev->msix && vdev->msix->table_bar == nr) {
2891         memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
2892         munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
2893     }
2894 }
2895
2896 static int vfio_mmap_bar(VFIOPCIDevice *vdev, VFIOBAR *bar,
2897                          MemoryRegion *mem, MemoryRegion *submem,
2898                          void **map, size_t size, off_t offset,
2899                          const char *name)
2900 {
2901     int ret = 0;
2902
2903     if (VFIO_ALLOW_MMAP && size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) {
2904         int prot = 0;
2905
2906         if (bar->flags & VFIO_REGION_INFO_FLAG_READ) {
2907             prot |= PROT_READ;
2908         }
2909
2910         if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) {
2911             prot |= PROT_WRITE;
2912         }
2913
2914         *map = mmap(NULL, size, prot, MAP_SHARED,
2915                     bar->fd, bar->fd_offset + offset);
2916         if (*map == MAP_FAILED) {
2917             *map = NULL;
2918             ret = -errno;
2919             goto empty_region;
2920         }
2921
2922         memory_region_init_ram_ptr(submem, OBJECT(vdev), name, size, *map);
2923         memory_region_set_skip_dump(submem);
2924     } else {
2925 empty_region:
2926         /* Create a zero sized sub-region to make cleanup easy. */
2927         memory_region_init(submem, OBJECT(vdev), name, 0);
2928     }
2929
2930     memory_region_add_subregion(mem, offset, submem);
2931
2932     return ret;
2933 }
2934
2935 static void vfio_map_bar(VFIOPCIDevice *vdev, int nr)
2936 {
2937     VFIOBAR *bar = &vdev->bars[nr];
2938     unsigned size = bar->size;
2939     char name[64];
2940     uint32_t pci_bar;
2941     uint8_t type;
2942     int ret;
2943
2944     /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
2945     if (!size) {
2946         return;
2947     }
2948
2949     snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
2950              vdev->host.domain, vdev->host.bus, vdev->host.slot,
2951              vdev->host.function, nr);
2952
2953     /* Determine what type of BAR this is for registration */
2954     ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
2955                 vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
2956     if (ret != sizeof(pci_bar)) {
2957         error_report("vfio: Failed to read BAR %d (%m)", nr);
2958         return;
2959     }
2960
2961     pci_bar = le32_to_cpu(pci_bar);
2962     bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
2963     bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
2964     type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
2965                                     ~PCI_BASE_ADDRESS_MEM_MASK);
2966
2967     /* A "slow" read/write mapping underlies all BARs */
2968     memory_region_init_io(&bar->mem, OBJECT(vdev), &vfio_bar_ops,
2969                           bar, name, size);
2970     pci_register_bar(&vdev->pdev, nr, type, &bar->mem);
2971
2972     /*
2973      * We can't mmap areas overlapping the MSIX vector table, so we
2974      * potentially insert a direct-mapped subregion before and after it.
2975      */
2976     if (vdev->msix && vdev->msix->table_bar == nr) {
2977         size = vdev->msix->table_offset & qemu_host_page_mask;
2978     }
2979
2980     strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
2981     if (vfio_mmap_bar(vdev, bar, &bar->mem,
2982                       &bar->mmap_mem, &bar->mmap, size, 0, name)) {
2983         error_report("%s unsupported. Performance may be slow", name);
2984     }
2985
2986     if (vdev->msix && vdev->msix->table_bar == nr) {
2987         unsigned start;
2988
2989         start = HOST_PAGE_ALIGN(vdev->msix->table_offset +
2990                                 (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
2991
2992         size = start < bar->size ? bar->size - start : 0;
2993         strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
2994         /* VFIOMSIXInfo contains another MemoryRegion for this mapping */
2995         if (vfio_mmap_bar(vdev, bar, &bar->mem, &vdev->msix->mmap_mem,
2996                           &vdev->msix->mmap, size, start, name)) {
2997             error_report("%s unsupported. Performance may be slow", name);
2998         }
2999     }
3000
3001     vfio_bar_quirk_setup(vdev, nr);
3002 }
3003
3004 static void vfio_map_bars(VFIOPCIDevice *vdev)
3005 {
3006     int i;
3007
3008     for (i = 0; i < PCI_ROM_SLOT; i++) {
3009         vfio_map_bar(vdev, i);
3010     }
3011
3012     if (vdev->has_vga) {
3013         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
3014                               OBJECT(vdev), &vfio_vga_ops,
3015                               &vdev->vga.region[QEMU_PCI_VGA_MEM],
3016                               "vfio-vga-mmio@0xa0000",
3017                               QEMU_PCI_VGA_MEM_SIZE);
3018         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
3019                               OBJECT(vdev), &vfio_vga_ops,
3020                               &vdev->vga.region[QEMU_PCI_VGA_IO_LO],
3021                               "vfio-vga-io@0x3b0",
3022                               QEMU_PCI_VGA_IO_LO_SIZE);
3023         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
3024                               OBJECT(vdev), &vfio_vga_ops,
3025                               &vdev->vga.region[QEMU_PCI_VGA_IO_HI],
3026                               "vfio-vga-io@0x3c0",
3027                               QEMU_PCI_VGA_IO_HI_SIZE);
3028
3029         pci_register_vga(&vdev->pdev, &vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
3030                          &vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
3031                          &vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);
3032         vfio_vga_quirk_setup(vdev);
3033     }
3034 }
3035
3036 static void vfio_unmap_bars(VFIOPCIDevice *vdev)
3037 {
3038     int i;
3039
3040     for (i = 0; i < PCI_ROM_SLOT; i++) {
3041         vfio_unmap_bar(vdev, i);
3042     }
3043
3044     if (vdev->has_vga) {
3045         vfio_vga_quirk_teardown(vdev);
3046         pci_unregister_vga(&vdev->pdev);
3047     }
3048 }
3049
3050 /*
3051  * General setup
3052  */
3053 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
3054 {
3055     uint8_t tmp, next = 0xff;
3056
3057     for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
3058          tmp = pdev->config[tmp + 1]) {
3059         if (tmp > pos && tmp < next) {
3060             next = tmp;
3061         }
3062     }
3063
3064     return next - pos;
3065 }
3066
/*
 * Read-modify-write the 16-bit word at @buf: clear the bits in @mask, then
 * OR in @val.
 */
static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
{
    uint16_t kept = pci_get_word(buf) & ~mask;

    pci_set_word(buf, kept | val);
}
3071
3072 static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
3073                                    uint16_t val, uint16_t mask)
3074 {
3075     vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
3076     vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
3077     vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
3078 }
3079
/*
 * Read-modify-write the 32-bit value at @buf: clear the bits in @mask,
 * then OR in @val.
 */
static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
{
    uint32_t kept = pci_get_long(buf) & ~mask;

    pci_set_long(buf, kept | val);
}
3084
3085 static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
3086                                    uint32_t val, uint32_t mask)
3087 {
3088     vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
3089     vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
3090     vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
3091 }
3092
3093 static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size)
3094 {
3095     uint16_t flags;
3096     uint8_t type;
3097
3098     flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
3099     type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
3100
3101     if (type != PCI_EXP_TYPE_ENDPOINT &&
3102         type != PCI_EXP_TYPE_LEG_END &&
3103         type != PCI_EXP_TYPE_RC_END) {
3104
3105         error_report("vfio: Assignment of PCIe type 0x%x "
3106                      "devices is not currently supported", type);
3107         return -EINVAL;
3108     }
3109
3110     if (!pci_bus_is_express(vdev->pdev.bus)) {
3111         /*
3112          * Use express capability as-is on PCI bus.  It doesn't make much
3113          * sense to even expose, but some drivers (ex. tg3) depend on it
3114          * and guests don't seem to be particular about it.  We'll need
3115          * to revist this or force express devices to express buses if we
3116          * ever expose an IOMMU to the guest.
3117          */
3118     } else if (pci_bus_is_root(vdev->pdev.bus)) {
3119         /*
3120          * On a Root Complex bus Endpoints become Root Complex Integrated
3121          * Endpoints, which changes the type and clears the LNK & LNK2 fields.
3122          */
3123         if (type == PCI_EXP_TYPE_ENDPOINT) {
3124             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
3125                                    PCI_EXP_TYPE_RC_END << 4,
3126                                    PCI_EXP_FLAGS_TYPE);
3127
3128             /* Link Capabilities, Status, and Control goes away */
3129             if (size > PCI_EXP_LNKCTL) {
3130                 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
3131                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
3132                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);
3133
3134 #ifndef PCI_EXP_LNKCAP2
3135 #define PCI_EXP_LNKCAP2 44
3136 #endif
3137 #ifndef PCI_EXP_LNKSTA2
3138 #define PCI_EXP_LNKSTA2 50
3139 #endif
3140                 /* Link 2 Capabilities, Status, and Control goes away */
3141                 if (size > PCI_EXP_LNKCAP2) {
3142                     vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
3143                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
3144                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
3145                 }
3146             }
3147
3148         } else if (type == PCI_EXP_TYPE_LEG_END) {
3149             /*
3150              * Legacy endpoints don't belong on the root complex.  Windows
3151              * seems to be happier with devices if we skip the capability.
3152              */
3153             return 0;
3154         }
3155
3156     } else {
3157         /*
3158          * Convert Root Complex Integrated Endpoints to regular endpoints.
3159          * These devices don't support LNK/LNK2 capabilities, so make them up.
3160          */
3161         if (type == PCI_EXP_TYPE_RC_END) {
3162             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
3163                                    PCI_EXP_TYPE_ENDPOINT << 4,
3164                                    PCI_EXP_FLAGS_TYPE);
3165             vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
3166                                    PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25, ~0);
3167             vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
3168         }
3169
3170         /* Mark the Link Status bits as emulated to allow virtual negotiation */
3171         vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA,
3172                                pci_get_word(vdev->pdev.config + pos +
3173                                             PCI_EXP_LNKSTA),
3174                                PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
3175     }
3176
3177     pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size);
3178     if (pos >= 0) {
3179         vdev->pdev.exp.exp_cap = pos;
3180     }
3181
3182     return pos;
3183 }
3184
3185 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
3186 {
3187     uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
3188
3189     if (cap & PCI_EXP_DEVCAP_FLR) {
3190         trace_vfio_check_pcie_flr(vdev->host.domain, vdev->host.bus,
3191                                   vdev->host.slot, vdev->host.function);
3192         vdev->has_flr = true;
3193     }
3194 }
3195
3196 static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
3197 {
3198     uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
3199
3200     if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
3201         trace_vfio_check_pm_reset(vdev->host.domain, vdev->host.bus,
3202                                   vdev->host.slot, vdev->host.function);
3203         vdev->has_pm_reset = true;
3204     }
3205 }
3206
3207 static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
3208 {
3209     uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
3210
3211     if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
3212         trace_vfio_check_af_flr(vdev->host.domain, vdev->host.bus,
3213                                 vdev->host.slot, vdev->host.function);
3214         vdev->has_flr = true;
3215     }
3216 }
3217
3218 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
3219 {
3220     PCIDevice *pdev = &vdev->pdev;
3221     uint8_t cap_id, next, size;
3222     int ret;
3223
3224     cap_id = pdev->config[pos];
3225     next = pdev->config[pos + 1];
3226
3227     /*
3228      * If it becomes important to configure capabilities to their actual
3229      * size, use this as the default when it's something we don't recognize.
3230      * Since QEMU doesn't actually handle many of the config accesses,
3231      * exact size doesn't seem worthwhile.
3232      */
3233     size = vfio_std_cap_max_size(pdev, pos);
3234
3235     /*
3236      * pci_add_capability always inserts the new capability at the head
3237      * of the chain.  Therefore to end up with a chain that matches the
3238      * physical device, we insert from the end by making this recursive.
3239      * This is also why we pre-caclulate size above as cached config space
3240      * will be changed as we unwind the stack.
3241      */
3242     if (next) {
3243         ret = vfio_add_std_cap(vdev, next);
3244         if (ret) {
3245             return ret;
3246         }
3247     } else {
3248         /* Begin the rebuild, use QEMU emulated list bits */
3249         pdev->config[PCI_CAPABILITY_LIST] = 0;
3250         vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
3251         vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
3252     }
3253
3254     /* Use emulated next pointer to allow dropping caps */
3255     pci_set_byte(vdev->emulated_config_bits + pos + 1, 0xff);
3256
3257     switch (cap_id) {
3258     case PCI_CAP_ID_MSI:
3259         ret = vfio_setup_msi(vdev, pos);
3260         break;
3261     case PCI_CAP_ID_EXP:
3262         vfio_check_pcie_flr(vdev, pos);
3263         ret = vfio_setup_pcie_cap(vdev, pos, size);
3264         break;
3265     case PCI_CAP_ID_MSIX:
3266         ret = vfio_setup_msix(vdev, pos);
3267         break;
3268     case PCI_CAP_ID_PM:
3269         vfio_check_pm_reset(vdev, pos);
3270         vdev->pm_cap = pos;
3271         ret = pci_add_capability(pdev, cap_id, pos, size);
3272         break;
3273     case PCI_CAP_ID_AF:
3274         vfio_check_af_flr(vdev, pos);
3275         ret = pci_add_capability(pdev, cap_id, pos, size);
3276         break;
3277     default:
3278         ret = pci_add_capability(pdev, cap_id, pos, size);
3279         break;
3280     }
3281
3282     if (ret < 0) {
3283         error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
3284                      "0x%x[0x%x]@0x%x: %d", vdev->host.domain,
3285                      vdev->host.bus, vdev->host.slot, vdev->host.function,
3286                      cap_id, size, pos, ret);
3287         return ret;
3288     }
3289
3290     return 0;
3291 }
3292
3293 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
3294 {
3295     PCIDevice *pdev = &vdev->pdev;
3296
3297     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
3298         !pdev->config[PCI_CAPABILITY_LIST]) {
3299         return 0; /* Nothing to add */
3300     }
3301
3302     return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
3303 }
3304
3305 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
3306 {
3307     PCIDevice *pdev = &vdev->pdev;
3308     uint16_t cmd;
3309
3310     vfio_disable_interrupts(vdev);
3311
3312     /* Make sure the device is in D0 */
3313     if (vdev->pm_cap) {
3314         uint16_t pmcsr;
3315         uint8_t state;
3316
3317         pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
3318         state = pmcsr & PCI_PM_CTRL_STATE_MASK;
3319         if (state) {
3320             pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
3321             vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
3322             /* vfio handles the necessary delay here */
3323             pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
3324             state = pmcsr & PCI_PM_CTRL_STATE_MASK;
3325             if (state) {
3326                 error_report("vfio: Unable to power on device, stuck in D%d",
3327                              state);
3328             }
3329         }
3330     }
3331
3332     /*
3333      * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master.
3334      * Also put INTx Disable in known state.
3335      */
3336     cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
3337     cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
3338              PCI_COMMAND_INTX_DISABLE);
3339     vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
3340 }
3341
3342 static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
3343 {
3344     vfio_enable_intx(vdev);
3345 }
3346
3347 static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
3348                                 PCIHostDeviceAddress *host2)
3349 {
3350     return (host1->domain == host2->domain && host1->bus == host2->bus &&
3351             host1->slot == host2->slot && host1->function == host2->function);
3352 }
3353
3354 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
3355 {
3356     VFIOGroup *group;
3357     struct vfio_pci_hot_reset_info *info;
3358     struct vfio_pci_dependent_device *devices;
3359     struct vfio_pci_hot_reset *reset;
3360     int32_t *fds;
3361     int ret, i, count;
3362     bool multi = false;
3363
3364     trace_vfio_pci_hot_reset(vdev->host.domain, vdev->host.bus,
3365                              vdev->host.slot, vdev->host.function,
3366                              single ? "one" : "multi");
3367
3368     vfio_pci_pre_reset(vdev);
3369     vdev->needs_reset = false;
3370
3371     info = g_malloc0(sizeof(*info));
3372     info->argsz = sizeof(*info);
3373
3374     ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
3375     if (ret && errno != ENOSPC) {
3376         ret = -errno;
3377         if (!vdev->has_pm_reset) {
3378             error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
3379                          "no available reset mechanism.", vdev->host.domain,
3380                          vdev->host.bus, vdev->host.slot, vdev->host.function);
3381         }
3382         goto out_single;
3383     }
3384
3385     count = info->count;
3386     info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
3387     info->argsz = sizeof(*info) + (count * sizeof(*devices));
3388     devices = &info->devices[0];
3389
3390     ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
3391     if (ret) {
3392         ret = -errno;
3393         error_report("vfio: hot reset info failed: %m");
3394         goto out_single;
3395     }
3396
3397     trace_vfio_pci_hot_reset_has_dep_devices(vdev->host.domain,
3398                                              vdev->host.bus,
3399                                              vdev->host.slot,
3400                                              vdev->host.function);
3401
3402     /* Verify that we have all the groups required */
3403     for (i = 0; i < info->count; i++) {
3404         PCIHostDeviceAddress host;
3405         VFIOPCIDevice *tmp;
3406
3407         host.domain = devices[i].segment;
3408         host.bus = devices[i].bus;
3409         host.slot = PCI_SLOT(devices[i].devfn);
3410         host.function = PCI_FUNC(devices[i].devfn);
3411
3412         trace_vfio_pci_hot_reset_dep_devices(host.domain,
3413                 host.bus, host.slot, host.function, devices[i].group_id);
3414
3415         if (vfio_pci_host_match(&host, &vdev->host)) {
3416             continue;
3417         }
3418
3419         QLIST_FOREACH(group, &group_list, next) {
3420             if (group->groupid == devices[i].group_id) {
3421                 break;
3422             }
3423         }
3424
3425         if (!group) {
3426             if (!vdev->has_pm_reset) {
3427                 error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
3428                              "depends on group %d which is not owned.",
3429                              vdev->host.domain, vdev->host.bus, vdev->host.slot,
3430                              vdev->host.function, devices[i].group_id);
3431             }
3432             ret = -EPERM;
3433             goto out;
3434         }
3435
3436         /* Prep dependent devices for reset and clear our marker. */
3437         QLIST_FOREACH(tmp, &group->device_list, next) {
3438             if (vfio_pci_host_match(&host, &tmp->host)) {
3439                 if (single) {
3440                     error_report("vfio: found another in-use device "
3441                             "%04x:%02x:%02x.%x\n", host.domain, host.bus,
3442                             host.slot, host.function);
3443                     ret = -EINVAL;
3444                     goto out_single;
3445                 }
3446                 vfio_pci_pre_reset(tmp);
3447                 tmp->needs_reset = false;
3448                 multi = true;
3449                 break;
3450             }
3451         }
3452     }
3453
3454     if (!single && !multi) {
3455         error_report("vfio: No other in-use devices for multi hot reset\n");
3456         ret = -EINVAL;
3457         goto out_single;
3458     }
3459
3460     /* Determine how many group fds need to be passed */
3461     count = 0;
3462     QLIST_FOREACH(group, &group_list, next) {
3463         for (i = 0; i < info->count; i++) {
3464             if (group->groupid == devices[i].group_id) {
3465                 count++;
3466                 break;
3467             }
3468         }
3469     }
3470
3471     reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
3472     reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
3473     fds = &reset->group_fds[0];
3474
3475     /* Fill in group fds */
3476     QLIST_FOREACH(group, &group_list, next) {
3477         for (i = 0; i < info->count; i++) {
3478             if (group->groupid == devices[i].group_id) {
3479                 fds[reset->count++] = group->fd;
3480                 break;
3481             }
3482         }
3483     }
3484
3485     /* Bus reset! */
3486     ret = ioctl(vdev->fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
3487     g_free(reset);
3488
3489     trace_vfio_pci_hot_reset_result(vdev->host.domain,
3490                                     vdev->host.bus,
3491                                     vdev->host.slot,
3492                                     vdev->host.function,
3493                                     ret ? "%m" : "Success");
3494
3495 out:
3496     /* Re-enable INTx on affected devices */
3497     for (i = 0; i < info->count; i++) {
3498         PCIHostDeviceAddress host;
3499         VFIOPCIDevice *tmp;
3500
3501         host.domain = devices[i].segment;
3502         host.bus = devices[i].bus;
3503         host.slot = PCI_SLOT(devices[i].devfn);
3504         host.function = PCI_FUNC(devices[i].devfn);
3505
3506         if (vfio_pci_host_match(&host, &vdev->host)) {
3507             continue;
3508         }
3509
3510         QLIST_FOREACH(group, &group_list, next) {
3511             if (group->groupid == devices[i].group_id) {
3512                 break;
3513             }
3514         }
3515
3516         if (!group) {
3517             break;
3518         }
3519
3520         QLIST_FOREACH(tmp, &group->device_list, next) {
3521             if (vfio_pci_host_match(&host, &tmp->host)) {
3522                 vfio_pci_post_reset(tmp);
3523                 break;
3524             }
3525         }
3526     }
3527 out_single:
3528     vfio_pci_post_reset(vdev);
3529     g_free(info);
3530
3531     return ret;
3532 }
3533
3534 /*
3535  * We want to differentiate hot reset of mulitple in-use devices vs hot reset
3536  * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
3537  * of doing hot resets when there is only a single device per bus.  The in-use
3538  * here refers to how many VFIODevices are affected.  A hot reset that affects
3539  * multiple devices, but only a single in-use device, means that we can call
3540  * it from our bus ->reset() callback since the extent is effectively a single
3541  * device.  This allows us to make use of it in the hotplug path.  When there
3542  * are multiple in-use devices, we can only trigger the hot reset during a
3543  * system reset and thus from our reset handler.  We separate _one vs _multi
3544  * here so that we don't overlap and do a double reset on the system reset
3545  * path where both our reset handler and ->reset() callback are used.  Calling
3546  * _one() will only do a hot reset for the one in-use devices case, calling
3547  * _multi() will do nothing if a _one() would have been sufficient.
3548  */
3549 static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
3550 {
3551     return vfio_pci_hot_reset(vdev, true);
3552 }
3553
3554 static int vfio_pci_hot_reset_multi(VFIOPCIDevice *vdev)
3555 {
3556     return vfio_pci_hot_reset(vdev, false);
3557 }
3558
3559 static void vfio_pci_reset_handler(void *opaque)
3560 {
3561     VFIOGroup *group;
3562     VFIOPCIDevice *vdev;
3563
3564     QLIST_FOREACH(group, &group_list, next) {
3565         QLIST_FOREACH(vdev, &group->device_list, next) {
3566             if (!vdev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
3567                 vdev->needs_reset = true;
3568             }
3569         }
3570     }
3571
3572     QLIST_FOREACH(group, &group_list, next) {
3573         QLIST_FOREACH(vdev, &group->device_list, next) {
3574             if (vdev->needs_reset) {
3575                 vfio_pci_hot_reset_multi(vdev);
3576             }
3577         }
3578     }
3579 }
3580
3581 static void vfio_kvm_device_add_group(VFIOGroup *group)
3582 {
3583 #ifdef CONFIG_KVM
3584     struct kvm_device_attr attr = {
3585         .group = KVM_DEV_VFIO_GROUP,
3586         .attr = KVM_DEV_VFIO_GROUP_ADD,
3587         .addr = (uint64_t)(unsigned long)&group->fd,
3588     };
3589
3590     if (!kvm_enabled()) {
3591         return;
3592     }
3593
3594     if (vfio_kvm_device_fd < 0) {
3595         struct kvm_create_device cd = {
3596             .type = KVM_DEV_TYPE_VFIO,
3597         };
3598
3599         if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
3600             error_report("KVM_CREATE_DEVICE: %m\n");
3601             return;
3602         }
3603
3604         vfio_kvm_device_fd = cd.fd;
3605     }
3606
3607     if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
3608         error_report("Failed to add group %d to KVM VFIO device: %m",
3609                      group->groupid);
3610     }
3611 #endif
3612 }
3613
3614 static void vfio_kvm_device_del_group(VFIOGroup *group)
3615 {
3616 #ifdef CONFIG_KVM
3617     struct kvm_device_attr attr = {
3618         .group = KVM_DEV_VFIO_GROUP,
3619         .attr = KVM_DEV_VFIO_GROUP_DEL,
3620         .addr = (uint64_t)(unsigned long)&group->fd,
3621     };
3622
3623     if (vfio_kvm_device_fd < 0) {
3624         return;
3625     }
3626
3627     if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
3628         error_report("Failed to remove group %d from KVM VFIO device: %m",
3629                      group->groupid);
3630     }
3631 #endif
3632 }
3633
3634 static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
3635 {
3636     VFIOAddressSpace *space;
3637
3638     QLIST_FOREACH(space, &vfio_address_spaces, list) {
3639         if (space->as == as) {
3640             return space;
3641         }
3642     }
3643
3644     /* No suitable VFIOAddressSpace, create a new one */
3645     space = g_malloc0(sizeof(*space));
3646     space->as = as;
3647     QLIST_INIT(&space->containers);
3648
3649     QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
3650
3651     return space;
3652 }
3653
3654 static void vfio_put_address_space(VFIOAddressSpace *space)
3655 {
3656     if (QLIST_EMPTY(&space->containers)) {
3657         QLIST_REMOVE(space, list);
3658         g_free(space);
3659     }
3660 }
3661
3662 static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
3663 {
3664     VFIOContainer *container;
3665     int ret, fd;
3666     VFIOAddressSpace *space;
3667
3668     space = vfio_get_address_space(as);
3669
3670     QLIST_FOREACH(container, &space->containers, next) {
3671         if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
3672             group->container = container;
3673             QLIST_INSERT_HEAD(&container->group_list, group, container_next);
3674             return 0;
3675         }
3676     }
3677
3678     fd = qemu_open("/dev/vfio/vfio", O_RDWR);
3679     if (fd < 0) {
3680         error_report("vfio: failed to open /dev/vfio/vfio: %m");
3681         ret = -errno;
3682         goto put_space_exit;
3683     }
3684
3685     ret = ioctl(fd, VFIO_GET_API_VERSION);
3686     if (ret != VFIO_API_VERSION) {
3687         error_report("vfio: supported vfio version: %d, "
3688                      "reported version: %d", VFIO_API_VERSION, ret);
3689         ret = -EINVAL;
3690         goto close_fd_exit;
3691     }
3692
3693     container = g_malloc0(sizeof(*container));
3694     container->space = space;
3695     container->fd = fd;
3696
3697     if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
3698         ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
3699         if (ret) {
3700             error_report("vfio: failed to set group container: %m");
3701             ret = -errno;
3702             goto free_container_exit;
3703         }
3704
3705         ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
3706         if (ret) {
3707             error_report("vfio: failed to set iommu for container: %m");
3708             ret = -errno;
3709             goto free_container_exit;
3710         }
3711
3712         container->iommu_data.type1.listener = vfio_memory_listener;
3713         container->iommu_data.release = vfio_listener_release;
3714
3715         memory_listener_register(&container->iommu_data.type1.listener,
3716                                  container->space->as);
3717
3718         if (container->iommu_data.type1.error) {
3719             ret = container->iommu_data.type1.error;
3720             error_report("vfio: memory listener initialization failed for container");
3721             goto listener_release_exit;
3722         }
3723
3724         container->iommu_data.type1.initialized = true;
3725
3726     } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
3727         ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
3728         if (ret) {
3729             error_report("vfio: failed to set group container: %m");
3730             ret = -errno;
3731             goto free_container_exit;
3732         }
3733
3734         ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
3735         if (ret) {
3736             error_report("vfio: failed to set iommu for container: %m");
3737             ret = -errno;
3738             goto free_container_exit;
3739         }
3740
3741         /*
3742          * The host kernel code implementing VFIO_IOMMU_DISABLE is called
3743          * when container fd is closed so we do not call it explicitly
3744          * in this file.
3745          */
3746         ret = ioctl(fd, VFIO_IOMMU_ENABLE);
3747         if (ret) {
3748             error_report("vfio: failed to enable container: %m");
3749             ret = -errno;
3750             goto free_container_exit;
3751         }
3752
3753         container->iommu_data.type1.listener = vfio_memory_listener;
3754         container->iommu_data.release = vfio_listener_release;
3755
3756         memory_listener_register(&container->iommu_data.type1.listener,
3757                                  container->space->as);
3758
3759     } else {
3760         error_report("vfio: No available IOMMU models");
3761         ret = -EINVAL;
3762         goto free_container_exit;
3763     }
3764
3765     QLIST_INIT(&container->group_list);
3766     QLIST_INSERT_HEAD(&space->containers, container, next);
3767
3768     group->container = container;
3769     QLIST_INSERT_HEAD(&container->group_list, group, container_next);
3770
3771     return 0;
3772
3773 listener_release_exit:
3774     vfio_listener_release(container);
3775
3776 free_container_exit:
3777     g_free(container);
3778
3779 close_fd_exit:
3780     close(fd);
3781
3782 put_space_exit:
3783     vfio_put_address_space(space);
3784
3785     return ret;
3786 }
3787
3788 static void vfio_disconnect_container(VFIOGroup *group)
3789 {
3790     VFIOContainer *container = group->container;
3791
3792     if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
3793         error_report("vfio: error disconnecting group %d from container",
3794                      group->groupid);
3795     }
3796
3797     QLIST_REMOVE(group, container_next);
3798     group->container = NULL;
3799
3800     if (QLIST_EMPTY(&container->group_list)) {
3801         VFIOAddressSpace *space = container->space;
3802
3803         if (container->iommu_data.release) {
3804             container->iommu_data.release(container);
3805         }
3806         QLIST_REMOVE(container, next);
3807         trace_vfio_disconnect_container(container->fd);
3808         close(container->fd);
3809         g_free(container);
3810
3811         vfio_put_address_space(space);
3812     }
3813 }
3814
3815 static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as)
3816 {
3817     VFIOGroup *group;
3818     char path[32];
3819     struct vfio_group_status status = { .argsz = sizeof(status) };
3820
3821     QLIST_FOREACH(group, &group_list, next) {
3822         if (group->groupid == groupid) {
3823             /* Found it.  Now is it already in the right context? */
3824             if (group->container->space->as == as) {
3825                 return group;
3826             } else {
3827                 error_report("vfio: group %d used in multiple address spaces",
3828                              group->groupid);
3829                 return NULL;
3830             }
3831         }
3832     }
3833
3834     group = g_malloc0(sizeof(*group));
3835
3836     snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
3837     group->fd = qemu_open(path, O_RDWR);
3838     if (group->fd < 0) {
3839         error_report("vfio: error opening %s: %m", path);
3840         goto free_group_exit;
3841     }
3842
3843     if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
3844         error_report("vfio: error getting group status: %m");
3845         goto close_fd_exit;
3846     }
3847
3848     if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
3849         error_report("vfio: error, group %d is not viable, please ensure "
3850                      "all devices within the iommu_group are bound to their "
3851                      "vfio bus driver.", groupid);
3852         goto close_fd_exit;
3853     }
3854
3855     group->groupid = groupid;
3856     QLIST_INIT(&group->device_list);
3857
3858     if (vfio_connect_container(group, as)) {
3859         error_report("vfio: failed to setup container for group %d", groupid);
3860         goto close_fd_exit;
3861     }
3862
3863     if (QLIST_EMPTY(&group_list)) {
3864         qemu_register_reset(vfio_pci_reset_handler, NULL);
3865     }
3866
3867     QLIST_INSERT_HEAD(&group_list, group, next);
3868
3869     vfio_kvm_device_add_group(group);
3870
3871     return group;
3872
3873 close_fd_exit:
3874     close(group->fd);
3875
3876 free_group_exit:
3877     g_free(group);
3878
3879     return NULL;
3880 }
3881
3882 static void vfio_put_group(VFIOGroup *group)
3883 {
3884     if (!QLIST_EMPTY(&group->device_list)) {
3885         return;
3886     }
3887
3888     vfio_kvm_device_del_group(group);
3889     vfio_disconnect_container(group);
3890     QLIST_REMOVE(group, next);
3891     trace_vfio_put_group(group->fd);
3892     close(group->fd);
3893     g_free(group);
3894
3895     if (QLIST_EMPTY(&group_list)) {
3896         qemu_unregister_reset(vfio_pci_reset_handler, NULL);
3897     }
3898 }
3899
3900 static int vfio_get_device(VFIOGroup *group, const char *name,
3901                            VFIOPCIDevice *vdev)
3902 {
3903     struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
3904     struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
3905     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
3906     int ret, i;
3907
3908     ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
3909     if (ret < 0) {
3910         error_report("vfio: error getting device %s from group %d: %m",
3911                      name, group->groupid);
3912         error_printf("Verify all devices in group %d are bound to vfio-pci "
3913                      "or pci-stub and not already in use\n", group->groupid);
3914         return ret;
3915     }
3916
3917     vdev->fd = ret;
3918     vdev->group = group;
3919     QLIST_INSERT_HEAD(&group->device_list, vdev, next);
3920
3921     /* Sanity check device */
3922     ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
3923     if (ret) {
3924         error_report("vfio: error getting device info: %m");
3925         goto error;
3926     }
3927
3928     trace_vfio_get_device_irq(name, dev_info.flags,
3929                               dev_info.num_regions, dev_info.num_irqs);
3930
3931     if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
3932         error_report("vfio: Um, this isn't a PCI device");
3933         goto error;
3934     }
3935
3936     vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
3937
3938     if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
3939         error_report("vfio: unexpected number of io regions %u",
3940                      dev_info.num_regions);
3941         goto error;
3942     }
3943
3944     if (dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
3945         error_report("vfio: unexpected number of irqs %u", dev_info.num_irqs);
3946         goto error;
3947     }
3948
3949     for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
3950         reg_info.index = i;
3951
3952         ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
3953         if (ret) {
3954             error_report("vfio: Error getting region %d info: %m", i);
3955             goto error;
3956         }
3957
3958         trace_vfio_get_device_region(name, i,
3959                                      (unsigned long)reg_info.size,
3960                                      (unsigned long)reg_info.offset,
3961                                      (unsigned long)reg_info.flags);
3962
3963         vdev->bars[i].flags = reg_info.flags;
3964         vdev->bars[i].size = reg_info.size;
3965         vdev->bars[i].fd_offset = reg_info.offset;
3966         vdev->bars[i].fd = vdev->fd;
3967         vdev->bars[i].nr = i;
3968         QLIST_INIT(&vdev->bars[i].quirks);
3969     }
3970
3971     reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
3972
3973     ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
3974     if (ret) {
3975         error_report("vfio: Error getting config info: %m");
3976         goto error;
3977     }
3978
3979     trace_vfio_get_device_config(name, (unsigned long)reg_info.size,
3980                                  (unsigned long)reg_info.offset,
3981                                  (unsigned long)reg_info.flags);
3982
3983     vdev->config_size = reg_info.size;
3984     if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
3985         vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
3986     }
3987     vdev->config_offset = reg_info.offset;
3988
3989     if ((vdev->features & VFIO_FEATURE_ENABLE_VGA) &&
3990         dev_info.num_regions > VFIO_PCI_VGA_REGION_INDEX) {
3991         struct vfio_region_info vga_info = {
3992             .argsz = sizeof(vga_info),
3993             .index = VFIO_PCI_VGA_REGION_INDEX,
3994          };
3995
3996         ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &vga_info);
3997         if (ret) {
3998             error_report(
3999                 "vfio: Device does not support requested feature x-vga");
4000             goto error;
4001         }
4002
4003         if (!(vga_info.flags & VFIO_REGION_INFO_FLAG_READ) ||
4004             !(vga_info.flags & VFIO_REGION_INFO_FLAG_WRITE) ||
4005             vga_info.size < 0xbffff + 1) {
4006             error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx",
4007                          (unsigned long)vga_info.flags,
4008                          (unsigned long)vga_info.size);
4009             goto error;
4010         }
4011
4012         vdev->vga.fd_offset = vga_info.offset;
4013         vdev->vga.fd = vdev->fd;
4014
4015         vdev->vga.region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
4016         vdev->vga.region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
4017         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_MEM].quirks);
4018
4019         vdev->vga.region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
4020         vdev->vga.region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
4021         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].quirks);
4022
4023         vdev->vga.region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
4024         vdev->vga.region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
4025         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks);
4026
4027         vdev->has_vga = true;
4028     }
4029     irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
4030
4031     ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
4032     if (ret) {
4033         /* This can fail for an old kernel or legacy PCI dev */
4034         trace_vfio_get_device_get_irq_info_failure();
4035         ret = 0;
4036     } else if (irq_info.count == 1) {
4037         vdev->pci_aer = true;
4038     } else {
4039         error_report("vfio: %04x:%02x:%02x.%x "
4040                      "Could not enable error recovery for the device",
4041                      vdev->host.domain, vdev->host.bus, vdev->host.slot,
4042                      vdev->host.function);
4043     }
4044
4045 error:
4046     if (ret) {
4047         QLIST_REMOVE(vdev, next);
4048         vdev->group = NULL;
4049         close(vdev->fd);
4050     }
4051     return ret;
4052 }
4053
4054 static void vfio_put_device(VFIOPCIDevice *vdev)
4055 {
4056     QLIST_REMOVE(vdev, next);
4057     vdev->group = NULL;
4058     trace_vfio_put_device(vdev->fd);
4059     close(vdev->fd);
4060     if (vdev->msix) {
4061         g_free(vdev->msix);
4062         vdev->msix = NULL;
4063     }
4064 }
4065
4066 static void vfio_err_notifier_handler(void *opaque)
4067 {
4068     VFIOPCIDevice *vdev = opaque;
4069
4070     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
4071         return;
4072     }
4073
4074     /*
4075      * TBD. Retrieve the error details and decide what action
4076      * needs to be taken. One of the actions could be to pass
4077      * the error to the guest and have the guest driver recover
4078      * from the error. This requires that PCIe capabilities be
4079      * exposed to the guest. For now, we just terminate the
4080      * guest to contain the error.
4081      */
4082
4083     error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
4084                  "Please collect any data possible and then kill the guest",
4085                  __func__, vdev->host.domain, vdev->host.bus,
4086                  vdev->host.slot, vdev->host.function);
4087
4088     vm_stop(RUN_STATE_INTERNAL_ERROR);
4089 }
4090
4091 /*
4092  * Registers error notifier for devices supporting error recovery.
4093  * If we encounter a failure in this function, we report an error
4094  * and continue after disabling error recovery support for the
4095  * device.
4096  */
4097 static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
4098 {
4099     int ret;
4100     int argsz;
4101     struct vfio_irq_set *irq_set;
4102     int32_t *pfd;
4103
4104     if (!vdev->pci_aer) {
4105         return;
4106     }
4107
4108     if (event_notifier_init(&vdev->err_notifier, 0)) {
4109         error_report("vfio: Unable to init event notifier for error detection");
4110         vdev->pci_aer = false;
4111         return;
4112     }
4113
4114     argsz = sizeof(*irq_set) + sizeof(*pfd);
4115
4116     irq_set = g_malloc0(argsz);
4117     irq_set->argsz = argsz;
4118     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
4119                      VFIO_IRQ_SET_ACTION_TRIGGER;
4120     irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
4121     irq_set->start = 0;
4122     irq_set->count = 1;
4123     pfd = (int32_t *)&irq_set->data;
4124
4125     *pfd = event_notifier_get_fd(&vdev->err_notifier);
4126     qemu_set_fd_handler(*pfd, vfio_err_notifier_handler, NULL, vdev);
4127
4128     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
4129     if (ret) {
4130         error_report("vfio: Failed to set up error notification");
4131         qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
4132         event_notifier_cleanup(&vdev->err_notifier);
4133         vdev->pci_aer = false;
4134     }
4135     g_free(irq_set);
4136 }
4137
4138 static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
4139 {
4140     int argsz;
4141     struct vfio_irq_set *irq_set;
4142     int32_t *pfd;
4143     int ret;
4144
4145     if (!vdev->pci_aer) {
4146         return;
4147     }
4148
4149     argsz = sizeof(*irq_set) + sizeof(*pfd);
4150
4151     irq_set = g_malloc0(argsz);
4152     irq_set->argsz = argsz;
4153     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
4154                      VFIO_IRQ_SET_ACTION_TRIGGER;
4155     irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
4156     irq_set->start = 0;
4157     irq_set->count = 1;
4158     pfd = (int32_t *)&irq_set->data;
4159     *pfd = -1;
4160
4161     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
4162     if (ret) {
4163         error_report("vfio: Failed to de-assign error fd: %m");
4164     }
4165     g_free(irq_set);
4166     qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
4167                         NULL, NULL, vdev);
4168     event_notifier_cleanup(&vdev->err_notifier);
4169 }
4170
4171 static int vfio_initfn(PCIDevice *pdev)
4172 {
4173     VFIOPCIDevice *pvdev, *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
4174     VFIOGroup *group;
4175     char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
4176     ssize_t len;
4177     struct stat st;
4178     int groupid;
4179     int ret;
4180
4181     /* Check that the host device exists */
4182     snprintf(path, sizeof(path),
4183              "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
4184              vdev->host.domain, vdev->host.bus, vdev->host.slot,
4185              vdev->host.function);
4186     if (stat(path, &st) < 0) {
4187         error_report("vfio: error: no such host device: %s", path);
4188         return -errno;
4189     }
4190
4191     strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
4192
4193     len = readlink(path, iommu_group_path, sizeof(path));
4194     if (len <= 0 || len >= sizeof(path)) {
4195         error_report("vfio: error no iommu_group for device");
4196         return len < 0 ? -errno : ENAMETOOLONG;
4197     }
4198
4199     iommu_group_path[len] = 0;
4200     group_name = basename(iommu_group_path);
4201
4202     if (sscanf(group_name, "%d", &groupid) != 1) {
4203         error_report("vfio: error reading %s: %m", path);
4204         return -errno;
4205     }
4206
4207     trace_vfio_initfn(vdev->host.domain, vdev->host.bus,
4208                       vdev->host.slot, vdev->host.function, groupid);
4209
4210     group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev));
4211     if (!group) {
4212         error_report("vfio: failed to get group %d", groupid);
4213         return -ENOENT;
4214     }
4215
4216     snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
4217             vdev->host.domain, vdev->host.bus, vdev->host.slot,
4218             vdev->host.function);
4219
4220     QLIST_FOREACH(pvdev, &group->device_list, next) {
4221         if (pvdev->host.domain == vdev->host.domain &&
4222             pvdev->host.bus == vdev->host.bus &&
4223             pvdev->host.slot == vdev->host.slot &&
4224             pvdev->host.function == vdev->host.function) {
4225
4226             error_report("vfio: error: device %s is already attached", path);
4227             vfio_put_group(group);
4228             return -EBUSY;
4229         }
4230     }
4231
4232     ret = vfio_get_device(group, path, vdev);
4233     if (ret) {
4234         error_report("vfio: failed to get device %s", path);
4235         vfio_put_group(group);
4236         return ret;
4237     }
4238
4239     /* Get a copy of config space */
4240     ret = pread(vdev->fd, vdev->pdev.config,
4241                 MIN(pci_config_size(&vdev->pdev), vdev->config_size),
4242                 vdev->config_offset);
4243     if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
4244         ret = ret < 0 ? -errno : -EFAULT;
4245         error_report("vfio: Failed to read device config space");
4246         goto out_put;
4247     }
4248
4249     /* vfio emulates a lot for us, but some bits need extra love */
4250     vdev->emulated_config_bits = g_malloc0(vdev->config_size);
4251
4252     /* QEMU can choose to expose the ROM or not */
4253     memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
4254
4255     /* QEMU can change multi-function devices to single function, or reverse */
4256     vdev->emulated_config_bits[PCI_HEADER_TYPE] =
4257                                               PCI_HEADER_TYPE_MULTI_FUNCTION;
4258
4259     /* Restore or clear multifunction, this is always controlled by QEMU */
4260     if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
4261         vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
4262     } else {
4263         vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
4264     }
4265
4266     /*
4267      * Clear host resource mapping info.  If we choose not to register a
4268      * BAR, such as might be the case with the option ROM, we can get
4269      * confusing, unwritable, residual addresses from the host here.
4270      */
4271     memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
4272     memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
4273
4274     vfio_pci_size_rom(vdev);
4275
4276     ret = vfio_early_setup_msix(vdev);
4277     if (ret) {
4278         goto out_put;
4279     }
4280
4281     vfio_map_bars(vdev);
4282
4283     ret = vfio_add_capabilities(vdev);
4284     if (ret) {
4285         goto out_teardown;
4286     }
4287
4288     /* QEMU emulates all of MSI & MSIX */
4289     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
4290         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
4291                MSIX_CAP_LENGTH);
4292     }
4293
4294     if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
4295         memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
4296                vdev->msi_cap_size);
4297     }
4298
4299     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
4300         vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
4301                                                   vfio_intx_mmap_enable, vdev);
4302         pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
4303         ret = vfio_enable_intx(vdev);
4304         if (ret) {
4305             goto out_teardown;
4306         }
4307     }
4308
4309     vfio_register_err_notifier(vdev);
4310
4311     return 0;
4312
4313 out_teardown:
4314     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
4315     vfio_teardown_msi(vdev);
4316     vfio_unmap_bars(vdev);
4317 out_put:
4318     g_free(vdev->emulated_config_bits);
4319     vfio_put_device(vdev);
4320     vfio_put_group(group);
4321     return ret;
4322 }
4323
4324 static void vfio_exitfn(PCIDevice *pdev)
4325 {
4326     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
4327     VFIOGroup *group = vdev->group;
4328
4329     vfio_unregister_err_notifier(vdev);
4330     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
4331     vfio_disable_interrupts(vdev);
4332     if (vdev->intx.mmap_timer) {
4333         timer_free(vdev->intx.mmap_timer);
4334     }
4335     vfio_teardown_msi(vdev);
4336     vfio_unmap_bars(vdev);
4337     g_free(vdev->emulated_config_bits);
4338     g_free(vdev->rom);
4339     vfio_put_device(vdev);
4340     vfio_put_group(group);
4341 }
4342
4343 static void vfio_pci_reset(DeviceState *dev)
4344 {
4345     PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
4346     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
4347
4348     trace_vfio_pci_reset(vdev->host.domain, vdev->host.bus,
4349                          vdev->host.slot, vdev->host.function);
4350
4351     vfio_pci_pre_reset(vdev);
4352
4353     if (vdev->reset_works && (vdev->has_flr || !vdev->has_pm_reset) &&
4354         !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
4355         trace_vfio_pci_reset_flr(vdev->host.domain, vdev->host.bus,
4356                                   vdev->host.slot, vdev->host.function);
4357         goto post_reset;
4358     }
4359
4360     /* See if we can do our own bus reset */
4361     if (!vfio_pci_hot_reset_one(vdev)) {
4362         goto post_reset;
4363     }
4364
4365     /* If nothing else works and the device supports PM reset, use it */
4366     if (vdev->reset_works && vdev->has_pm_reset &&
4367         !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
4368         trace_vfio_pci_reset_pm(vdev->host.domain, vdev->host.bus,
4369                                 vdev->host.slot, vdev->host.function);
4370         goto post_reset;
4371     }
4372
4373 post_reset:
4374     vfio_pci_post_reset(vdev);
4375 }
4376
4377 static void vfio_instance_init(Object *obj)
4378 {
4379     PCIDevice *pci_dev = PCI_DEVICE(obj);
4380     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(obj));
4381
4382     device_add_bootindex_property(obj, &vdev->bootindex,
4383                                   "bootindex", NULL,
4384                                   &pci_dev->qdev, NULL);
4385 }
4386
4387 static Property vfio_pci_dev_properties[] = {
4388     DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
4389     DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
4390                        intx.mmap_timeout, 1100),
4391     DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
4392                     VFIO_FEATURE_ENABLE_VGA_BIT, false),
4393     DEFINE_PROP_INT32("bootindex", VFIOPCIDevice, bootindex, -1),
4394     /*
4395      * TODO - support passed fds... is this necessary?
4396      * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
4397      * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name),
4398      */
4399     DEFINE_PROP_END_OF_LIST(),
4400 };
4401
4402 static const VMStateDescription vfio_pci_vmstate = {
4403     .name = "vfio-pci",
4404     .unmigratable = 1,
4405 };
4406
4407 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
4408 {
4409     DeviceClass *dc = DEVICE_CLASS(klass);
4410     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
4411
4412     dc->reset = vfio_pci_reset;
4413     dc->props = vfio_pci_dev_properties;
4414     dc->vmsd = &vfio_pci_vmstate;
4415     dc->desc = "VFIO-based PCI device assignment";
4416     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
4417     pdc->init = vfio_initfn;
4418     pdc->exit = vfio_exitfn;
4419     pdc->config_read = vfio_pci_read_config;
4420     pdc->config_write = vfio_pci_write_config;
4421     pdc->is_express = 1; /* We might be */
4422 }
4423
4424 static const TypeInfo vfio_pci_dev_info = {
4425     .name = "vfio-pci",
4426     .parent = TYPE_PCI_DEVICE,
4427     .instance_size = sizeof(VFIOPCIDevice),
4428     .class_init = vfio_pci_dev_class_init,
4429     .instance_init = vfio_instance_init,
4430 };
4431
4432 static void register_vfio_pci_dev_type(void)
4433 {
4434     type_register_static(&vfio_pci_dev_info);
4435 }
4436
4437 type_init(register_vfio_pci_dev_type)
4438
4439 static int vfio_container_do_ioctl(AddressSpace *as, int32_t groupid,
4440                                    int req, void *param)
4441 {
4442     VFIOGroup *group;
4443     VFIOContainer *container;
4444     int ret = -1;
4445
4446     group = vfio_get_group(groupid, as);
4447     if (!group) {
4448         error_report("vfio: group %d not registered", groupid);
4449         return ret;
4450     }
4451
4452     container = group->container;
4453     if (group->container) {
4454         ret = ioctl(container->fd, req, param);
4455         if (ret < 0) {
4456             error_report("vfio: failed to ioctl container: ret=%d, %s",
4457                          ret, strerror(errno));
4458         }
4459     }
4460
4461     vfio_put_group(group);
4462
4463     return ret;
4464 }
4465
4466 int vfio_container_ioctl(AddressSpace *as, int32_t groupid,
4467                          int req, void *param)
4468 {
4469     /* We allow only certain ioctls to the container */
4470     switch (req) {
4471     case VFIO_CHECK_EXTENSION:
4472     case VFIO_IOMMU_SPAPR_TCE_GET_INFO:
4473         break;
4474     default:
4475         /* Return an error on unknown requests */
4476         error_report("vfio: unsupported ioctl %X", req);
4477         return -1;
4478     }
4479
4480     return vfio_container_do_ioctl(as, groupid, req, param);
4481 }