hw/misc/vfio.c

   1 /*
   2  * vfio based device assignment support
   3  *
   4  * Copyright Red Hat, Inc. 2012
   5  *
   6  * Authors:
   7  *  Alex Williamson <alex.williamson@redhat.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  * Based on qemu-kvm device-assignment:
  13  *  Adapted for KVM by Qumranet.
  14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
  15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
  16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
  17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
  18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
  19  */
  20
  21 #include <dirent.h>
  22 #include <linux/vfio.h>
  23 #include <sys/ioctl.h>
  24 #include <sys/mman.h>
  25 #include <sys/stat.h>
  26 #include <sys/types.h>
  27 #include <unistd.h>
  28
  29 #include "config.h"
  30 #include "exec/address-spaces.h"
  31 #include "exec/memory.h"
  32 #include "hw/pci/msi.h"
  33 #include "hw/pci/msix.h"
  34 #include "hw/pci/pci.h"
  35 #include "qemu-common.h"
  36 #include "qemu/error-report.h"
  37 #include "qemu/event_notifier.h"
  38 #include "qemu/queue.h"
  39 #include "qemu/range.h"
  40 #include "sysemu/kvm.h"
  41 #include "sysemu/sysemu.h"
  42
  43 /* #define DEBUG_VFIO */
  44 #ifdef DEBUG_VFIO
  45 #define DPRINTF(fmt, ...) \
  46     do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
  47 #else
  48 #define DPRINTF(fmt, ...) \
  49     do { } while (0)
  50 #endif
  51
  52 /* Extra debugging, trap acceleration paths for more logging */
  53 #define VFIO_ALLOW_MMAP 1
  54 #define VFIO_ALLOW_KVM_INTX 1
  55 #define VFIO_ALLOW_KVM_MSI 1
  56 #define VFIO_ALLOW_KVM_MSIX 1
  57
  58 struct VFIODevice;
  59
  60 typedef struct VFIOQuirk {
  61     MemoryRegion mem;
  62     struct VFIODevice *vdev;
  63     QLIST_ENTRY(VFIOQuirk) next;
  64     struct {
  65         uint32_t base_offset:TARGET_PAGE_BITS;
  66         uint32_t address_offset:TARGET_PAGE_BITS;
  67         uint32_t address_size:3;
  68         uint32_t bar:3;
  69
  70         uint32_t address_match;
  71         uint32_t address_mask;
  72
  73         uint32_t address_val:TARGET_PAGE_BITS;
  74         uint32_t data_offset:TARGET_PAGE_BITS;
  75         uint32_t data_size:3;
  76
  77         uint8_t flags;
  78         uint8_t read_flags;
  79         uint8_t write_flags;
  80     } data;
  81 } VFIOQuirk;
  82
  83 typedef struct VFIOBAR {
  84     off_t fd_offset; /* offset of BAR within device fd */
  85     int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
  86     MemoryRegion mem; /* slow, read/write access */
  87     MemoryRegion mmap_mem; /* direct mapped access */
  88     void *mmap;
  89     size_t size;
  90     uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
  91     uint8_t nr; /* cache the BAR number for debug */
  92     bool ioport;
  93     bool mem64;
  94     QLIST_HEAD(, VFIOQuirk) quirks;
  95 } VFIOBAR;
  96
  97 typedef struct VFIOVGARegion {
  98     MemoryRegion mem;
  99     off_t offset;
 100     int nr;
 101     QLIST_HEAD(, VFIOQuirk) quirks;
 102 } VFIOVGARegion;
 103
 104 typedef struct VFIOVGA {
 105     off_t fd_offset;
 106     int fd;
 107     VFIOVGARegion region[QEMU_PCI_VGA_NUM_REGIONS];
 108 } VFIOVGA;
 109
 110 typedef struct VFIOINTx {
 111     bool pending; /* interrupt pending */
 112     bool kvm_accel; /* set when QEMU bypass through KVM enabled */
 113     uint8_t pin; /* which pin to pull for qemu_set_irq */
 114     EventNotifier interrupt; /* eventfd triggered on interrupt */
 115     EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
 116     PCIINTxRoute route; /* routing info for QEMU bypass */
 117     uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
 118     QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
 119 } VFIOINTx;
 120
 121 typedef struct VFIOMSIVector {
 122     EventNotifier interrupt; /* eventfd triggered on interrupt */
 123     struct VFIODevice *vdev; /* back pointer to device */
 124     MSIMessage msg; /* cache the MSI message so we know when it changes */
 125     int virq; /* KVM irqchip route for QEMU bypass */
 126     bool use;
 127 } VFIOMSIVector;
 128
 129 enum {
 130     VFIO_INT_NONE = 0,
 131     VFIO_INT_INTx = 1,
 132     VFIO_INT_MSI  = 2,
 133     VFIO_INT_MSIX = 3,
 134 };
 135
 136 struct VFIOGroup;
 137
 138 typedef struct VFIOContainer {
 139     int fd; /* /dev/vfio/vfio, empowered by the attached groups */
 140     struct {
 141         /* enable abstraction to support various iommu backends */
 142         union {
 143             MemoryListener listener; /* Used by type1 iommu */
 144         };
 145         void (*release)(struct VFIOContainer *);
 146     } iommu_data;
 147     QLIST_HEAD(, VFIOGroup) group_list;
 148     QLIST_ENTRY(VFIOContainer) next;
 149 } VFIOContainer;
 150
 151 /* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
 152 typedef struct VFIOMSIXInfo {
 153     uint8_t table_bar;
 154     uint8_t pba_bar;
 155     uint16_t entries;
 156     uint32_t table_offset;
 157     uint32_t pba_offset;
 158     MemoryRegion mmap_mem;
 159     void *mmap;
 160 } VFIOMSIXInfo;
 161
 162 typedef struct VFIODevice {
 163     PCIDevice pdev;
 164     int fd;
 165     VFIOINTx intx;
 166     unsigned int config_size;
 167     uint8_t *emulated_config_bits; /* QEMU emulated bits, little-endian */
 168     off_t config_offset; /* Offset of config space region within device fd */
 169     unsigned int rom_size;
 170     off_t rom_offset; /* Offset of ROM region within device fd */
 171     void *rom;
 172     int msi_cap_size;
 173     VFIOMSIVector *msi_vectors;
 174     VFIOMSIXInfo *msix;
 175     int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
 176     int interrupt; /* Current interrupt type */
 177     VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
 178     VFIOVGA vga; /* 0xa0000, 0x3b0, 0x3c0 */
 179     PCIHostDeviceAddress host;
 180     QLIST_ENTRY(VFIODevice) next;
 181     struct VFIOGroup *group;
 182     EventNotifier err_notifier;
 183     uint32_t features;
 184 #define VFIO_FEATURE_ENABLE_VGA_BIT 0
 185 #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
 186     int32_t bootindex;
 187     uint8_t pm_cap;
 188     bool reset_works;
 189     bool has_vga;
 190     bool pci_aer;
 191     bool has_flr;
 192     bool has_pm_reset;
 193     bool needs_reset;
 194 } VFIODevice;
 195
 196 typedef struct VFIOGroup {
 197     int fd;
 198     int groupid;
 199     VFIOContainer *container;
 200     QLIST_HEAD(, VFIODevice) device_list;
 201     QLIST_ENTRY(VFIOGroup) next;
 202     QLIST_ENTRY(VFIOGroup) container_next;
 203 } VFIOGroup;
 204
 205 #define MSIX_CAP_LENGTH 12
 206
 207 static QLIST_HEAD(, VFIOContainer)
 208     container_list = QLIST_HEAD_INITIALIZER(container_list);
 209
 210 static QLIST_HEAD(, VFIOGroup)
 211     group_list = QLIST_HEAD_INITIALIZER(group_list);
 212
 213 #ifdef CONFIG_KVM
 214 /*
 215  * We have a single VFIO pseudo device per KVM VM.  Once created it lives
 216  * for the life of the VM.  Closing the file descriptor only drops our
 217  * reference to it and the device's reference to kvm.  Therefore once
 218  * initialized, this file descriptor is only released on QEMU exit and
 219  * we'll re-use it should another vfio device be attached before then.
 220  */
 221 static int vfio_kvm_device_fd = -1;
 222 #endif
 223
 224 static void vfio_disable_interrupts(VFIODevice *vdev);
 225 static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
 226 static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
 227                                   uint32_t val, int len);
 228 static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
 229
 230 /*
 231  * Common VFIO interrupt disable
 232  */
 233 static void vfio_disable_irqindex(VFIODevice *vdev, int index)
 234 {
 235     struct vfio_irq_set irq_set = {
 236         .argsz = sizeof(irq_set),
 237         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
 238         .index = index,
 239         .start = 0,
 240         .count = 0,
 241     };
 242
 243     ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 244 }
 245
 246 /*
 247  * INTx
 248  */
 249 static void vfio_unmask_intx(VFIODevice *vdev)
 250 {
 251     struct vfio_irq_set irq_set = {
 252         .argsz = sizeof(irq_set),
 253         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
 254         .index = VFIO_PCI_INTX_IRQ_INDEX,
 255         .start = 0,
 256         .count = 1,
 257     };
 258
 259     ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 260 }
 261
 262 #ifdef CONFIG_KVM /* Unused outside of CONFIG_KVM code */
 263 static void vfio_mask_intx(VFIODevice *vdev)
 264 {
 265     struct vfio_irq_set irq_set = {
 266         .argsz = sizeof(irq_set),
 267         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
 268         .index = VFIO_PCI_INTX_IRQ_INDEX,
 269         .start = 0,
 270         .count = 1,
 271     };
 272
 273     ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 274 }
 275 #endif
 276
 277 /*
 278  * Disabling BAR mmaping can be slow, but toggling it around INTx can
 279  * also be a huge overhead.  We try to get the best of both worlds by
 280  * waiting until an interrupt to disable mmaps (subsequent transitions
 281  * to the same state are effectively no overhead).  If the interrupt has
 282  * been serviced and the time gap is long enough, we re-enable mmaps for
 283  * performance.  This works well for things like graphics cards, which
 284  * may not use their interrupt at all and are penalized to an unusable
 285  * level by read/write BAR traps.  Other devices, like NICs, have more
 286  * regular interrupts and see much better latency by staying in non-mmap
 287  * mode.  We therefore set the default mmap_timeout such that a ping
 288  * is just enough to keep the mmap disabled.  Users can experiment with
 289  * other options with the x-intx-mmap-timeout-ms parameter (a value of
 290  * zero disables the timer).
 291  */
 292 static void vfio_intx_mmap_enable(void *opaque)
 293 {
 294     VFIODevice *vdev = opaque;
 295
 296     if (vdev->intx.pending) {
 297         timer_mod(vdev->intx.mmap_timer,
 298                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
 299         return;
 300     }
 301
 302     vfio_mmap_set_enabled(vdev, true);
 303 }
 304
 305 static void vfio_intx_interrupt(void *opaque)
 306 {
 307     VFIODevice *vdev = opaque;
 308
 309     if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
 310         return;
 311     }
 312
 313     DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
 314             vdev->host.bus, vdev->host.slot, vdev->host.function,
 315             'A' + vdev->intx.pin);
 316
 317     vdev->intx.pending = true;
 318     pci_irq_assert(&vdev->pdev);
 319     vfio_mmap_set_enabled(vdev, false);
 320     if (vdev->intx.mmap_timeout) {
 321         timer_mod(vdev->intx.mmap_timer,
 322                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
 323     }
 324 }
 325
 326 static void vfio_eoi(VFIODevice *vdev)
 327 {
 328     if (!vdev->intx.pending) {
 329         return;
 330     }
 331
 332     DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", __func__, vdev->host.domain,
 333             vdev->host.bus, vdev->host.slot, vdev->host.function);
 334
 335     vdev->intx.pending = false;
 336     pci_irq_deassert(&vdev->pdev);
 337     vfio_unmask_intx(vdev);
 338 }
 339
 340 static void vfio_enable_intx_kvm(VFIODevice *vdev)
 341 {
 342 #ifdef CONFIG_KVM
 343     struct kvm_irqfd irqfd = {
 344         .fd = event_notifier_get_fd(&vdev->intx.interrupt),
 345         .gsi = vdev->intx.route.irq,
 346         .flags = KVM_IRQFD_FLAG_RESAMPLE,
 347     };
 348     struct vfio_irq_set *irq_set;
 349     int ret, argsz;
 350     int32_t *pfd;
 351
 352     if (!VFIO_ALLOW_KVM_INTX || !kvm_irqfds_enabled() ||
 353         vdev->intx.route.mode != PCI_INTX_ENABLED ||
 354         !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
 355         return;
 356     }
 357
 358     /* Get to a known interrupt state */
 359     qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
 360     vfio_mask_intx(vdev);
 361     vdev->intx.pending = false;
 362     pci_irq_deassert(&vdev->pdev);
 363
 364     /* Get an eventfd for resample/unmask */
 365     if (event_notifier_init(&vdev->intx.unmask, 0)) {
 366         error_report("vfio: Error: event_notifier_init failed eoi");
 367         goto fail;
 368     }
 369
 370     /* KVM triggers it, VFIO listens for it */
 371     irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);
 372
 373     if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
 374         error_report("vfio: Error: Failed to setup resample irqfd: %m");
 375         goto fail_irqfd;
 376     }
 377
 378     argsz = sizeof(*irq_set) + sizeof(*pfd);
 379
 380     irq_set = g_malloc0(argsz);
 381     irq_set->argsz = argsz;
 382     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
 383     irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 384     irq_set->start = 0;
 385     irq_set->count = 1;
 386     pfd = (int32_t *)&irq_set->data;
 387
 388     *pfd = irqfd.resamplefd;
 389
 390     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
 391     g_free(irq_set);
 392     if (ret) {
 393         error_report("vfio: Error: Failed to setup INTx unmask fd: %m");
 394         goto fail_vfio;
 395     }
 396
 397     /* Let'em rip */
 398     vfio_unmask_intx(vdev);
 399
 400     vdev->intx.kvm_accel = true;
 401
 402     DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n",
 403             __func__, vdev->host.domain, vdev->host.bus,
 404             vdev->host.slot, vdev->host.function);
 405
 406     return;
 407
 408 fail_vfio:
 409     irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
 410     kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
 411 fail_irqfd:
 412     event_notifier_cleanup(&vdev->intx.unmask);
 413 fail:
 414     qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
 415     vfio_unmask_intx(vdev);
 416 #endif
 417 }
 418
 419 static void vfio_disable_intx_kvm(VFIODevice *vdev)
 420 {
 421 #ifdef CONFIG_KVM
 422     struct kvm_irqfd irqfd = {
 423         .fd = event_notifier_get_fd(&vdev->intx.interrupt),
 424         .gsi = vdev->intx.route.irq,
 425         .flags = KVM_IRQFD_FLAG_DEASSIGN,
 426     };
 427
 428     if (!vdev->intx.kvm_accel) {
 429         return;
 430     }
 431
 432     /*
 433      * Get to a known state, hardware masked, QEMU ready to accept new
 434      * interrupts, QEMU IRQ de-asserted.
 435      */
 436     vfio_mask_intx(vdev);
 437     vdev->intx.pending = false;
 438     pci_irq_deassert(&vdev->pdev);
 439
 440     /* Tell KVM to stop listening for an INTx irqfd */
 441     if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
 442         error_report("vfio: Error: Failed to disable INTx irqfd: %m");
 443     }
 444
 445     /* We only need to close the eventfd for VFIO to cleanup the kernel side */
 446     event_notifier_cleanup(&vdev->intx.unmask);
 447
 448     /* QEMU starts listening for interrupt events. */
 449     qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
 450
 451     vdev->intx.kvm_accel = false;
 452
 453     /* If we've missed an event, let it re-fire through QEMU */
 454     vfio_unmask_intx(vdev);
 455
 456     DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n",
 457             __func__, vdev->host.domain, vdev->host.bus,
 458             vdev->host.slot, vdev->host.function);
 459 #endif
 460 }
 461
 462 static void vfio_update_irq(PCIDevice *pdev)
 463 {
 464     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
 465     PCIINTxRoute route;
 466
 467     if (vdev->interrupt != VFIO_INT_INTx) {
 468         return;
 469     }
 470
 471     route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
 472
 473     if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
 474         return; /* Nothing changed */
 475     }
 476
 477     DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
 478             vdev->host.domain, vdev->host.bus, vdev->host.slot,
 479             vdev->host.function, vdev->intx.route.irq, route.irq);
 480
 481     vfio_disable_intx_kvm(vdev);
 482
 483     vdev->intx.route = route;
 484
 485     if (route.mode != PCI_INTX_ENABLED) {
 486         return;
 487     }
 488
 489     vfio_enable_intx_kvm(vdev);
 490
 491     /* Re-enable the interrupt in cased we missed an EOI */
 492     vfio_eoi(vdev);
 493 }
 494
 495 static int vfio_enable_intx(VFIODevice *vdev)
 496 {
 497     uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
 498     int ret, argsz;
 499     struct vfio_irq_set *irq_set;
 500     int32_t *pfd;
 501
 502     if (!pin) {
 503         return 0;
 504     }
 505
 506     vfio_disable_interrupts(vdev);
 507
 508     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
 509     pci_config_set_interrupt_pin(vdev->pdev.config, pin);
 510
 511 #ifdef CONFIG_KVM
 512     /*
 513      * Only conditional to avoid generating error messages on platforms
 514      * where we won't actually use the result anyway.
 515      */
 516     if (kvm_irqfds_enabled() &&
 517         kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
 518         vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
 519                                                         vdev->intx.pin);
 520     }
 521 #endif
 522
 523     ret = event_notifier_init(&vdev->intx.interrupt, 0);
 524     if (ret) {
 525         error_report("vfio: Error: event_notifier_init failed");
 526         return ret;
 527     }
 528
 529     argsz = sizeof(*irq_set) + sizeof(*pfd);
 530
 531     irq_set = g_malloc0(argsz);
 532     irq_set->argsz = argsz;
 533     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 534     irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 535     irq_set->start = 0;
 536     irq_set->count = 1;
 537     pfd = (int32_t *)&irq_set->data;
 538
 539     *pfd = event_notifier_get_fd(&vdev->intx.interrupt);
 540     qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);
 541
 542     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
 543     g_free(irq_set);
 544     if (ret) {
 545         error_report("vfio: Error: Failed to setup INTx fd: %m");
 546         qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
 547         event_notifier_cleanup(&vdev->intx.interrupt);
 548         return -errno;
 549     }
 550
 551     vfio_enable_intx_kvm(vdev);
 552
 553     vdev->interrupt = VFIO_INT_INTx;
 554
 555     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
 556             vdev->host.bus, vdev->host.slot, vdev->host.function);
 557
 558     return 0;
 559 }
 560
 561 static void vfio_disable_intx(VFIODevice *vdev)
 562 {
 563     int fd;
 564
 565     timer_del(vdev->intx.mmap_timer);
 566     vfio_disable_intx_kvm(vdev);
 567     vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
 568     vdev->intx.pending = false;
 569     pci_irq_deassert(&vdev->pdev);
 570     vfio_mmap_set_enabled(vdev, true);
 571
 572     fd = event_notifier_get_fd(&vdev->intx.interrupt);
 573     qemu_set_fd_handler(fd, NULL, NULL, vdev);
 574     event_notifier_cleanup(&vdev->intx.interrupt);
 575
 576     vdev->interrupt = VFIO_INT_NONE;
 577
 578     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
 579             vdev->host.bus, vdev->host.slot, vdev->host.function);
 580 }
 581
 582 /*
 583  * MSI/X
 584  */
 585 static void vfio_msi_interrupt(void *opaque)
 586 {
 587     VFIOMSIVector *vector = opaque;
 588     VFIODevice *vdev = vector->vdev;
 589     int nr = vector - vdev->msi_vectors;
 590
 591     if (!event_notifier_test_and_clear(&vector->interrupt)) {
 592         return;
 593     }
 594
 595 #ifdef VFIO_DEBUG
 596     MSIMessage msg;
 597
 598     if (vdev->interrupt == VFIO_INT_MSIX) {
 599         msg = msi_get_message(&vdev->pdev, nr);
 600     } else if (vdev->interrupt == VFIO_INT_MSI) {
 601         msg = msix_get_message(&vdev->pdev, nr);
 602     } else {
 603         abort();
 604     }
 605
 606     DPRINTF("%s(%04x:%02x:%02x.%x) vector %d 0x%"PRIx64"/0x%x\n", __func__,
 607             vdev->host.domain, vdev->host.bus, vdev->host.slot,
 608             vdev->host.function, nr, msg.address, msg.data);
 609 #endif
 610
 611     if (vdev->interrupt == VFIO_INT_MSIX) {
 612         msix_notify(&vdev->pdev, nr);
 613     } else if (vdev->interrupt == VFIO_INT_MSI) {
 614         msi_notify(&vdev->pdev, nr);
 615     } else {
 616         error_report("vfio: MSI interrupt receieved, but not enabled?");
 617     }
 618 }
 619
 620 static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
 621 {
 622     struct vfio_irq_set *irq_set;
 623     int ret = 0, i, argsz;
 624     int32_t *fds;
 625
 626     argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
 627
 628     irq_set = g_malloc0(argsz);
 629     irq_set->argsz = argsz;
 630     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 631     irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
 632     irq_set->start = 0;
 633     irq_set->count = vdev->nr_vectors;
 634     fds = (int32_t *)&irq_set->data;
 635
 636     for (i = 0; i < vdev->nr_vectors; i++) {
 637         if (!vdev->msi_vectors[i].use) {
 638             fds[i] = -1;
 639             continue;
 640         }
 641
 642         fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
 643     }
 644
 645     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
 646
 647     g_free(irq_set);
 648
 649     return ret;
 650 }
 651
 652 static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
 653                                    MSIMessage *msg, IOHandler *handler)
 654 {
 655     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
 656     VFIOMSIVector *vector;
 657     int ret;
 658
 659     DPRINTF("%s(%04x:%02x:%02x.%x) vector %d used\n", __func__,
 660             vdev->host.domain, vdev->host.bus, vdev->host.slot,
 661             vdev->host.function, nr);
 662
 663     vector = &vdev->msi_vectors[nr];
 664     vector->vdev = vdev;
 665     vector->use = true;
 666
 667     msix_vector_use(pdev, nr);
 668
 669     if (event_notifier_init(&vector->interrupt, 0)) {
 670         error_report("vfio: Error: event_notifier_init failed");
 671     }
 672
 673     /*
 674      * Attempt to enable route through KVM irqchip,
 675      * default to userspace handling if unavailable.
 676      */
 677     vector->virq = msg && VFIO_ALLOW_KVM_MSIX ?
 678                    kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
 679     if (vector->virq < 0 ||
 680         kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
 681                                        NULL, vector->virq) < 0) {
 682         if (vector->virq >= 0) {
 683             kvm_irqchip_release_virq(kvm_state, vector->virq);
 684             vector->virq = -1;
 685         }
 686         qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 687                             handler, NULL, vector);
 688     }
 689
 690     /*
 691      * We don't want to have the host allocate all possible MSI vectors
 692      * for a device if they're not in use, so we shutdown and incrementally
 693      * increase them as needed.
 694      */
 695     if (vdev->nr_vectors < nr + 1) {
 696         vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
 697         vdev->nr_vectors = nr + 1;
 698         ret = vfio_enable_vectors(vdev, true);
 699         if (ret) {
 700             error_report("vfio: failed to enable vectors, %d", ret);
 701         }
 702     } else {
 703         int argsz;
 704         struct vfio_irq_set *irq_set;
 705         int32_t *pfd;
 706
 707         argsz = sizeof(*irq_set) + sizeof(*pfd);
 708
 709         irq_set = g_malloc0(argsz);
 710         irq_set->argsz = argsz;
 711         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
 712                          VFIO_IRQ_SET_ACTION_TRIGGER;
 713         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 714         irq_set->start = nr;
 715         irq_set->count = 1;
 716         pfd = (int32_t *)&irq_set->data;
 717
 718         *pfd = event_notifier_get_fd(&vector->interrupt);
 719
 720         ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
 721         g_free(irq_set);
 722         if (ret) {
 723             error_report("vfio: failed to modify vector, %d", ret);
 724         }
 725     }
 726
 727     return 0;
 728 }
 729
 730 static int vfio_msix_vector_use(PCIDevice *pdev,
 731                                 unsigned int nr, MSIMessage msg)
 732 {
 733     return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
 734 }
 735
 736 static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
 737 {
 738     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
 739     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
 740     int argsz;
 741     struct vfio_irq_set *irq_set;
 742     int32_t *pfd;
 743
 744     DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
 745             vdev->host.domain, vdev->host.bus, vdev->host.slot,
 746             vdev->host.function, nr);
 747
 748     /*
 749      * XXX What's the right thing to do here?  This turns off the interrupt
 750      * completely, but do we really just want to switch the interrupt to
 751      * bouncing through userspace and let msix.c drop it?  Not sure.
 752      */
 753     msix_vector_unuse(pdev, nr);
 754
 755     argsz = sizeof(*irq_set) + sizeof(*pfd);
 756
 757     irq_set = g_malloc0(argsz);
 758     irq_set->argsz = argsz;
 759     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
 760                      VFIO_IRQ_SET_ACTION_TRIGGER;
 761     irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 762     irq_set->start = nr;
 763     irq_set->count = 1;
 764     pfd = (int32_t *)&irq_set->data;
 765
 766     *pfd = -1;
 767
 768     ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
 769
 770     g_free(irq_set);
 771
 772     if (vector->virq < 0) {
 773         qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 774                             NULL, NULL, NULL);
 775     } else {
 776         kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
 777                                           vector->virq);
 778         kvm_irqchip_release_virq(kvm_state, vector->virq);
 779         vector->virq = -1;
 780     }
 781
 782     event_notifier_cleanup(&vector->interrupt);
 783     vector->use = false;
 784 }
 785
 786 static void vfio_enable_msix(VFIODevice *vdev)
 787 {
 788     vfio_disable_interrupts(vdev);
 789
 790     vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(VFIOMSIVector));
 791
 792     vdev->interrupt = VFIO_INT_MSIX;
 793
 794     /*
 795      * Some communication channels between VF & PF or PF & fw rely on the
 796      * physical state of the device and expect that enabling MSI-X from the
 797      * guest enables the same on the host.  When our guest is Linux, the
 798      * guest driver call to pci_enable_msix() sets the enabling bit in the
 799      * MSI-X capability, but leaves the vector table masked.  We therefore
 800      * can't rely on a vector_use callback (from request_irq() in the guest)
 801      * to switch the physical device into MSI-X mode because that may come a
 802      * long time after pci_enable_msix().  This code enables vector 0 with
 803      * triggering to userspace, then immediately release the vector, leaving
 804      * the physical device with no vectors enabled, but MSI-X enabled, just
 805      * like the guest view.
 806      */
 807     vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
 808     vfio_msix_vector_release(&vdev->pdev, 0);
 809
 810     if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
 811                                   vfio_msix_vector_release, NULL)) {
 812         error_report("vfio: msix_set_vector_notifiers failed");
 813     }
 814
 815     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
 816             vdev->host.bus, vdev->host.slot, vdev->host.function);
 817 }
 818
 819 static void vfio_enable_msi(VFIODevice *vdev)
 820 {
 821     int ret, i;
 822
 823     vfio_disable_interrupts(vdev);
 824
 825     vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
 826 retry:
 827     vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));
 828
 829     for (i = 0; i < vdev->nr_vectors; i++) {
 830         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 831
 832         vector->vdev = vdev;
 833         vector->use = true;
 834
 835         if (event_notifier_init(&vector->interrupt, 0)) {
 836             error_report("vfio: Error: event_notifier_init failed");
 837         }
 838
 839         vector->msg = msi_get_message(&vdev->pdev, i);
 840
 841         /*
 842          * Attempt to enable route through KVM irqchip,
 843          * default to userspace handling if unavailable.
 844          */
 845         vector->virq = VFIO_ALLOW_KVM_MSI ?
 846                        kvm_irqchip_add_msi_route(kvm_state, vector->msg) : -1;
 847         if (vector->virq < 0 ||
 848             kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
 849                                            NULL, vector->virq) < 0) {
 850             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 851                                 vfio_msi_interrupt, NULL, vector);
 852         }
 853     }
 854
 855     ret = vfio_enable_vectors(vdev, false);
 856     if (ret) {
 857         if (ret < 0) {
 858             error_report("vfio: Error: Failed to setup MSI fds: %m");
 859         } else if (ret != vdev->nr_vectors) {
 860             error_report("vfio: Error: Failed to enable %d "
 861                          "MSI vectors, retry with %d", vdev->nr_vectors, ret);
 862         }
 863
 864         for (i = 0; i < vdev->nr_vectors; i++) {
 865             VFIOMSIVector *vector = &vdev->msi_vectors[i];
 866             if (vector->virq >= 0) {
 867                 kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
 868                                                   vector->virq);
 869                 kvm_irqchip_release_virq(kvm_state, vector->virq);
 870                 vector->virq = -1;
 871             } else {
 872                 qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 873                                     NULL, NULL, NULL);
 874             }
 875             event_notifier_cleanup(&vector->interrupt);
 876         }
 877
 878         g_free(vdev->msi_vectors);
 879
 880         if (ret > 0 && ret != vdev->nr_vectors) {
 881             vdev->nr_vectors = ret;
 882             goto retry;
 883         }
 884         vdev->nr_vectors = 0;
 885
 886         return;
 887     }
 888
 889     vdev->interrupt = VFIO_INT_MSI;
 890
 891     DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
 892             vdev->host.domain, vdev->host.bus, vdev->host.slot,
 893             vdev->host.function, vdev->nr_vectors);
 894 }
 895
 896 static void vfio_disable_msi_common(VFIODevice *vdev)
 897 {
 898     g_free(vdev->msi_vectors);
 899     vdev->msi_vectors = NULL;
 900     vdev->nr_vectors = 0;
 901     vdev->interrupt = VFIO_INT_NONE;
 902
 903     vfio_enable_intx(vdev);
 904 }
 905
 906 static void vfio_disable_msix(VFIODevice *vdev)
 907 {
 908     int i;
 909
 910     msix_unset_vector_notifiers(&vdev->pdev);
 911
 912     /*
 913      * MSI-X will only release vectors if MSI-X is still enabled on the
 914      * device, check through the rest and release it ourselves if necessary.
 915      */
 916     for (i = 0; i < vdev->nr_vectors; i++) {
 917         if (vdev->msi_vectors[i].use) {
 918             vfio_msix_vector_release(&vdev->pdev, i);
 919         }
 920     }
 921
 922     if (vdev->nr_vectors) {
 923         vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
 924     }
 925
 926     vfio_disable_msi_common(vdev);
 927
 928     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
 929             vdev->host.bus, vdev->host.slot, vdev->host.function);
 930 }
 931
 932 static void vfio_disable_msi(VFIODevice *vdev)
 933 {
 934     int i;
 935
 936     vfio_disable_irqindex(vdev, VFIO_PCI_MSI_IRQ_INDEX);
 937
 938     for (i = 0; i < vdev->nr_vectors; i++) {
 939         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 940
 941         if (!vector->use) {
 942             continue;
 943         }
 944
 945         if (vector->virq >= 0) {
 946             kvm_irqchip_remove_irqfd_notifier(kvm_state,
 947                                               &vector->interrupt, vector->virq);
 948             kvm_irqchip_release_virq(kvm_state, vector->virq);
 949             vector->virq = -1;
 950         } else {
 951             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 952                                 NULL, NULL, NULL);
 953         }
 954
 955         event_notifier_cleanup(&vector->interrupt);
 956     }
 957
 958     vfio_disable_msi_common(vdev);
 959
 960     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
 961             vdev->host.bus, vdev->host.slot, vdev->host.function);
 962 }
 963
 964 static void vfio_update_msi(VFIODevice *vdev)
 965 {
 966     int i;
 967
 968     for (i = 0; i < vdev->nr_vectors; i++) {
 969         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 970         MSIMessage msg;
 971
 972         if (!vector->use || vector->virq < 0) {
 973             continue;
 974         }
 975
 976         msg = msi_get_message(&vdev->pdev, i);
 977
 978         if (msg.address != vector->msg.address ||
 979             msg.data != vector->msg.data) {
 980
 981             DPRINTF("%s(%04x:%02x:%02x.%x) MSI vector %d changed\n",
 982                     __func__, vdev->host.domain, vdev->host.bus,
 983                     vdev->host.slot, vdev->host.function, i);
 984
 985             kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
 986             vector->msg = msg;
 987         }
 988     }
 989 }
 990
 991 /*
 992  * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 993  */
 994 static void vfio_bar_write(void *opaque, hwaddr addr,
 995                            uint64_t data, unsigned size)
 996 {
 997     VFIOBAR *bar = opaque;
 998     union {
 999         uint8_t byte;
1000         uint16_t word;
1001         uint32_t dword;
1002         uint64_t qword;
1003     } buf;
1004
1005     switch (size) {
1006     case 1:
1007         buf.byte = data;
1008         break;
1009     case 2:
1010         buf.word = cpu_to_le16(data);
1011         break;
1012     case 4:
1013         buf.dword = cpu_to_le32(data);
1014         break;
1015     default:
1016         hw_error("vfio: unsupported write size, %d bytes\n", size);
1017         break;
1018     }
1019
1020     if (pwrite(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
1021         error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
1022                      __func__, addr, data, size);
1023     }
1024
1025 #ifdef DEBUG_VFIO
1026     {
1027         VFIODevice *vdev = container_of(bar, VFIODevice, bars[bar->nr]);
1028
1029         DPRINTF("%s(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", 0x%"PRIx64
1030                 ", %d)\n", __func__, vdev->host.domain, vdev->host.bus,
1031                 vdev->host.slot, vdev->host.function, bar->nr, addr,
1032                 data, size);
1033     }
1034 #endif
1035
1036     /*
1037      * A read or write to a BAR always signals an INTx EOI.  This will
1038      * do nothing if not pending (including not in INTx mode).  We assume
1039      * that a BAR access is in response to an interrupt and that BAR
1040      * accesses will service the interrupt.  Unfortunately, we don't know
1041      * which access will service the interrupt, so we're potentially
1042      * getting quite a few host interrupts per guest interrupt.
1043      */
1044     vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));
1045 }
1046
1047 static uint64_t vfio_bar_read(void *opaque,
1048                               hwaddr addr, unsigned size)
1049 {
1050     VFIOBAR *bar = opaque;
1051     union {
1052         uint8_t byte;
1053         uint16_t word;
1054         uint32_t dword;
1055         uint64_t qword;
1056     } buf;
1057     uint64_t data = 0;
1058
1059     if (pread(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
1060         error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1061                      __func__, addr, size);
1062         return (uint64_t)-1;
1063     }
1064
1065     switch (size) {
1066     case 1:
1067         data = buf.byte;
1068         break;
1069     case 2:
1070         data = le16_to_cpu(buf.word);
1071         break;
1072     case 4:
1073         data = le32_to_cpu(buf.dword);
1074         break;
1075     default:
1076         hw_error("vfio: unsupported read size, %d bytes\n", size);
1077         break;
1078     }
1079
1080 #ifdef DEBUG_VFIO
1081     {
1082         VFIODevice *vdev = container_of(bar, VFIODevice, bars[bar->nr]);
1083
1084         DPRINTF("%s(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx
1085                 ", %d) = 0x%"PRIx64"\n", __func__, vdev->host.domain,
1086                 vdev->host.bus, vdev->host.slot, vdev->host.function,
1087                 bar->nr, addr, size, data);
1088     }
1089 #endif
1090
1091     /* Same as write above */
1092     vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));
1093
1094     return data;
1095 }
1096
1097 static const MemoryRegionOps vfio_bar_ops = {
1098     .read = vfio_bar_read,
1099     .write = vfio_bar_write,
1100     .endianness = DEVICE_LITTLE_ENDIAN,
1101 };
1102
1103 static void vfio_pci_load_rom(VFIODevice *vdev)
1104 {
1105     struct vfio_region_info reg_info = {
1106         .argsz = sizeof(reg_info),
1107         .index = VFIO_PCI_ROM_REGION_INDEX
1108     };
1109     uint64_t size;
1110     off_t off = 0;
1111     size_t bytes;
1112
1113     if (ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info)) {
1114         error_report("vfio: Error getting ROM info: %m");
1115         return;
1116     }
1117
1118     DPRINTF("Device %04x:%02x:%02x.%x ROM:\n", vdev->host.domain,
1119             vdev->host.bus, vdev->host.slot, vdev->host.function);
1120     DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
1121             (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
1122             (unsigned long)reg_info.flags);
1123
1124     vdev->rom_size = size = reg_info.size;
1125     vdev->rom_offset = reg_info.offset;
1126
1127     if (!vdev->rom_size) {
1128         return;
1129     }
1130
1131     vdev->rom = g_malloc(size);
1132     memset(vdev->rom, 0xff, size);
1133
1134     while (size) {
1135         bytes = pread(vdev->fd, vdev->rom + off, size, vdev->rom_offset + off);
1136         if (bytes == 0) {
1137             break;
1138         } else if (bytes > 0) {
1139             off += bytes;
1140             size -= bytes;
1141         } else {
1142             if (errno == EINTR || errno == EAGAIN) {
1143                 continue;
1144             }
1145             error_report("vfio: Error reading device ROM: %m");
1146             break;
1147         }
1148     }
1149 }
1150
1151 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
1152 {
1153     VFIODevice *vdev = opaque;
1154     uint64_t val = ((uint64_t)1 << (size * 8)) - 1;
1155
1156     /* Load the ROM lazily when the guest tries to read it */
1157     if (unlikely(!vdev->rom)) {
1158         vfio_pci_load_rom(vdev);
1159     }
1160
1161     memcpy(&val, vdev->rom + addr,
1162            (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
1163
1164     DPRINTF("%s(%04x:%02x:%02x.%x, 0x%"HWADDR_PRIx", 0x%x) = 0x%"PRIx64"\n",
1165             __func__, vdev->host.domain, vdev->host.bus, vdev->host.slot,
1166             vdev->host.function, addr, size, val);
1167
1168     return val;
1169 }
1170
/* MemoryRegion write callback for the ROM BAR: writes are silently dropped. */
static void vfio_rom_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
}
1175
1176 static const MemoryRegionOps vfio_rom_ops = {
1177     .read = vfio_rom_read,
1178     .write = vfio_rom_write,
1179     .endianness = DEVICE_LITTLE_ENDIAN,
1180 };
1181
1182 static void vfio_pci_size_rom(VFIODevice *vdev)
1183 {
1184     uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
1185     off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
1186     char name[32];
1187
1188     if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
1189         return;
1190     }
1191
1192     /*
1193      * Use the same size ROM BAR as the physical device.  The contents
1194      * will get filled in later when the guest tries to read it.
1195      */
1196     if (pread(vdev->fd, &orig, 4, offset) != 4 ||
1197         pwrite(vdev->fd, &size, 4, offset) != 4 ||
1198         pread(vdev->fd, &size, 4, offset) != 4 ||
1199         pwrite(vdev->fd, &orig, 4, offset) != 4) {
1200         error_report("%s(%04x:%02x:%02x.%x) failed: %m",
1201                      __func__, vdev->host.domain, vdev->host.bus,
1202                      vdev->host.slot, vdev->host.function);
1203         return;
1204     }
1205
1206     size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
1207
1208     if (!size) {
1209         return;
1210     }
1211
1212     DPRINTF("%04x:%02x:%02x.%x ROM size 0x%x\n", vdev->host.domain,
1213             vdev->host.bus, vdev->host.slot, vdev->host.function, size);
1214
1215     snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
1216              vdev->host.domain, vdev->host.bus, vdev->host.slot,
1217              vdev->host.function);
1218
1219     memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
1220                           &vfio_rom_ops, vdev, name, size);
1221
1222     pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
1223                      PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
1224
1225     vdev->pdev.has_rom = true;
1226 }
1227
1228 static void vfio_vga_write(void *opaque, hwaddr addr,
1229                            uint64_t data, unsigned size)
1230 {
1231     VFIOVGARegion *region = opaque;
1232     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1233     union {
1234         uint8_t byte;
1235         uint16_t word;
1236         uint32_t dword;
1237         uint64_t qword;
1238     } buf;
1239     off_t offset = vga->fd_offset + region->offset + addr;
1240
1241     switch (size) {
1242     case 1:
1243         buf.byte = data;
1244         break;
1245     case 2:
1246         buf.word = cpu_to_le16(data);
1247         break;
1248     case 4:
1249         buf.dword = cpu_to_le32(data);
1250         break;
1251     default:
1252         hw_error("vfio: unsupported write size, %d bytes\n", size);
1253         break;
1254     }
1255
1256     if (pwrite(vga->fd, &buf, size, offset) != size) {
1257         error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
1258                      __func__, region->offset + addr, data, size);
1259     }
1260
1261     DPRINTF("%s(0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n",
1262             __func__, region->offset + addr, data, size);
1263 }
1264
1265 static uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
1266 {
1267     VFIOVGARegion *region = opaque;
1268     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1269     union {
1270         uint8_t byte;
1271         uint16_t word;
1272         uint32_t dword;
1273         uint64_t qword;
1274     } buf;
1275     uint64_t data = 0;
1276     off_t offset = vga->fd_offset + region->offset + addr;
1277
1278     if (pread(vga->fd, &buf, size, offset) != size) {
1279         error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1280                      __func__, region->offset + addr, size);
1281         return (uint64_t)-1;
1282     }
1283
1284     switch (size) {
1285     case 1:
1286         data = buf.byte;
1287         break;
1288     case 2:
1289         data = le16_to_cpu(buf.word);
1290         break;
1291     case 4:
1292         data = le32_to_cpu(buf.dword);
1293         break;
1294     default:
1295         hw_error("vfio: unsupported read size, %d bytes\n", size);
1296         break;
1297     }
1298
1299     DPRINTF("%s(0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n",
1300             __func__, region->offset + addr, size, data);
1301
1302     return data;
1303 }
1304
1305 static const MemoryRegionOps vfio_vga_ops = {
1306     .read = vfio_vga_read,
1307     .write = vfio_vga_write,
1308     .endianness = DEVICE_LITTLE_ENDIAN,
1309 };
1310
1311 /*
1312  * Device specific quirks
1313  */
1314
/*
 * Is range1 [first1, first1 + len1) fully contained within range2
 * [first2, first2 + len2)?
 *
 * The test is phrased with subtractions that cannot wrap, so ranges that end
 * at or near the top of the 64-bit space are handled correctly instead of
 * being misjudged after first + len overflows.
 */
static bool vfio_range_contained(uint64_t first1, uint64_t len1,
                                 uint64_t first2, uint64_t len2)
{
    /* range1 may neither start before range2 nor be longer than it */
    if (first1 < first2 || len1 > len2) {
        return false;
    }

    /* start skew must fit in the slack: first1 + len1 <= first2 + len2 */
    return first1 - first2 <= len2 - len1;
}
1320
/*
 * True when mask is non-zero and every bit set in mask is also set in flags.
 */
static bool vfio_flags_enabled(uint8_t flags, uint8_t mask)
{
    if (mask == 0) {
        return false;
    }

    /* No mask bit may be missing from flags */
    return (mask & ~flags) == 0;
}
1325
1326 static uint64_t vfio_generic_window_quirk_read(void *opaque,
1327                                                hwaddr addr, unsigned size)
1328 {
1329     VFIOQuirk *quirk = opaque;
1330     VFIODevice *vdev = quirk->vdev;
1331     uint64_t data;
1332
1333     if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
1334         ranges_overlap(addr, size,
1335                        quirk->data.data_offset, quirk->data.data_size)) {
1336         hwaddr offset = addr - quirk->data.data_offset;
1337
1338         if (!vfio_range_contained(addr, size, quirk->data.data_offset,
1339                                   quirk->data.data_size)) {
1340             hw_error("%s: window data read not fully contained: %s\n",
1341                      __func__, memory_region_name(&quirk->mem));
1342         }
1343
1344         data = vfio_pci_read_config(&vdev->pdev,
1345                                     quirk->data.address_val + offset, size);
1346
1347         DPRINTF("%s read(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", %d) = 0x%"
1348                 PRIx64"\n", memory_region_name(&quirk->mem), vdev->host.domain,
1349                 vdev->host.bus, vdev->host.slot, vdev->host.function,
1350                 quirk->data.bar, addr, size, data);
1351     } else {
1352         data = vfio_bar_read(&vdev->bars[quirk->data.bar],
1353                              addr + quirk->data.base_offset, size);
1354     }
1355
1356     return data;
1357 }
1358
1359 static void vfio_generic_window_quirk_write(void *opaque, hwaddr addr,
1360                                             uint64_t data, unsigned size)
1361 {
1362     VFIOQuirk *quirk = opaque;
1363     VFIODevice *vdev = quirk->vdev;
1364
1365     if (ranges_overlap(addr, size,
1366                        quirk->data.address_offset, quirk->data.address_size)) {
1367
1368         if (addr != quirk->data.address_offset) {
1369             hw_error("%s: offset write into address window: %s\n",
1370                      __func__, memory_region_name(&quirk->mem));
1371         }
1372
1373         if ((data & ~quirk->data.address_mask) == quirk->data.address_match) {
1374             quirk->data.flags |= quirk->data.write_flags |
1375                                  quirk->data.read_flags;
1376             quirk->data.address_val = data & quirk->data.address_mask;
1377         } else {
1378             quirk->data.flags &= ~(quirk->data.write_flags |
1379                                    quirk->data.read_flags);
1380         }
1381     }
1382
1383     if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
1384         ranges_overlap(addr, size,
1385                        quirk->data.data_offset, quirk->data.data_size)) {
1386         hwaddr offset = addr - quirk->data.data_offset;
1387
1388         if (!vfio_range_contained(addr, size, quirk->data.data_offset,
1389                                   quirk->data.data_size)) {
1390             hw_error("%s: window data write not fully contained: %s\n",
1391                      __func__, memory_region_name(&quirk->mem));
1392         }
1393
1394         vfio_pci_write_config(&vdev->pdev,
1395                               quirk->data.address_val + offset, data, size);
1396         DPRINTF("%s write(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", 0x%"
1397                 PRIx64", %d)\n", memory_region_name(&quirk->mem),
1398                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
1399                 vdev->host.function, quirk->data.bar, addr, data, size);
1400         return;
1401     }
1402
1403     vfio_bar_write(&vdev->bars[quirk->data.bar],
1404                    addr + quirk->data.base_offset, data, size);
1405 }
1406
1407 static const MemoryRegionOps vfio_generic_window_quirk = {
1408     .read = vfio_generic_window_quirk_read,
1409     .write = vfio_generic_window_quirk_write,
1410     .endianness = DEVICE_LITTLE_ENDIAN,
1411 };
1412
1413 static uint64_t vfio_generic_quirk_read(void *opaque,
1414                                         hwaddr addr, unsigned size)
1415 {
1416     VFIOQuirk *quirk = opaque;
1417     VFIODevice *vdev = quirk->vdev;
1418     hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
1419     hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
1420     uint64_t data;
1421
1422     if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
1423         ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
1424         if (!vfio_range_contained(addr, size, offset,
1425                                   quirk->data.address_mask + 1)) {
1426             hw_error("%s: read not fully contained: %s\n",
1427                      __func__, memory_region_name(&quirk->mem));
1428         }
1429
1430         data = vfio_pci_read_config(&vdev->pdev, addr - offset, size);
1431
1432         DPRINTF("%s read(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", %d) = 0x%"
1433                 PRIx64"\n", memory_region_name(&quirk->mem), vdev->host.domain,
1434                 vdev->host.bus, vdev->host.slot, vdev->host.function,
1435                 quirk->data.bar, addr + base, size, data);
1436     } else {
1437         data = vfio_bar_read(&vdev->bars[quirk->data.bar], addr + base, size);
1438     }
1439
1440     return data;
1441 }
1442
1443 static void vfio_generic_quirk_write(void *opaque, hwaddr addr,
1444                                      uint64_t data, unsigned size)
1445 {
1446     VFIOQuirk *quirk = opaque;
1447     VFIODevice *vdev = quirk->vdev;
1448     hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
1449     hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
1450
1451     if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
1452         ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
1453         if (!vfio_range_contained(addr, size, offset,
1454                                   quirk->data.address_mask + 1)) {
1455             hw_error("%s: write not fully contained: %s\n",
1456                      __func__, memory_region_name(&quirk->mem));
1457         }
1458
1459         vfio_pci_write_config(&vdev->pdev, addr - offset, data, size);
1460
1461         DPRINTF("%s write(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", 0x%"
1462                 PRIx64", %d)\n", memory_region_name(&quirk->mem),
1463                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
1464                 vdev->host.function, quirk->data.bar, addr + base, data, size);
1465     } else {
1466         vfio_bar_write(&vdev->bars[quirk->data.bar], addr + base, data, size);
1467     }
1468 }
1469
1470 static const MemoryRegionOps vfio_generic_quirk = {
1471     .read = vfio_generic_quirk_read,
1472     .write = vfio_generic_quirk_write,
1473     .endianness = DEVICE_LITTLE_ENDIAN,
1474 };
1475
/* Vendor ID the ATI/AMD quirk probes compare against the PCI_VENDOR_ID word */
#define PCI_VENDOR_ID_ATI               0x1002
1477
1478 /*
1479  * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
1480  * through VGA register 0x3c3.  On newer cards, the I/O port BAR is always
1481  * BAR4 (older cards like the X550 used BAR1, but we don't care to support
1482  * those).  Note that on bare metal, a read of 0x3c3 doesn't always return the
1483  * I/O port BAR address.  Originally this was coded to return the virtual BAR
1484  * address only if the physical register read returns the actual BAR address,
1485  * but users have reported greater success if we return the virtual address
1486  * unconditionally.
1487  */
1488 static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
1489                                         hwaddr addr, unsigned size)
1490 {
1491     VFIOQuirk *quirk = opaque;
1492     VFIODevice *vdev = quirk->vdev;
1493     uint64_t data = vfio_pci_read_config(&vdev->pdev,
1494                                          PCI_BASE_ADDRESS_0 + (4 * 4) + 1,
1495                                          size);
1496     DPRINTF("%s(0x3c3, 1) = 0x%"PRIx64"\n", __func__, data);
1497
1498     return data;
1499 }
1500
1501 static const MemoryRegionOps vfio_ati_3c3_quirk = {
1502     .read = vfio_ati_3c3_quirk_read,
1503     .endianness = DEVICE_LITTLE_ENDIAN,
1504 };
1505
1506 static void vfio_vga_probe_ati_3c3_quirk(VFIODevice *vdev)
1507 {
1508     PCIDevice *pdev = &vdev->pdev;
1509     VFIOQuirk *quirk;
1510
1511     if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1512         return;
1513     }
1514
1515     /*
1516      * As long as the BAR is >= 256 bytes it will be aligned such that the
1517      * lower byte is always zero.  Filter out anything else, if it exists.
1518      */
1519     if (!vdev->bars[4].ioport || vdev->bars[4].size < 256) {
1520         return;
1521     }
1522
1523     quirk = g_malloc0(sizeof(*quirk));
1524     quirk->vdev = vdev;
1525
1526     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, quirk,
1527                           "vfio-ati-3c3-quirk", 1);
1528     memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
1529                                 3 /* offset 3 bytes from 0x3c0 */, &quirk->mem);
1530
1531     QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
1532                       quirk, next);
1533
1534     DPRINTF("Enabled ATI/AMD quirk 0x3c3 BAR4for device %04x:%02x:%02x.%x\n",
1535             vdev->host.domain, vdev->host.bus, vdev->host.slot,
1536             vdev->host.function);
1537 }
1538
1539 /*
1540  * Newer ATI/AMD devices, including HD5450 and HD7850, have a window to PCI
1541  * config space through MMIO BAR2 at offset 0x4000.  Nothing seems to access
1542  * the MMIO space directly, but a window to this space is provided through
1543  * I/O port BAR4.  Offset 0x0 is the address register and offset 0x4 is the
1544  * data register.  When the address is programmed to a range of 0x4000-0x4fff
1545  * PCI configuration space is available.  Experimentation seems to indicate
1546  * that only read-only access is provided, but we drop writes when the window
1547  * is enabled to config space nonetheless.
1548  */
1549 static void vfio_probe_ati_bar4_window_quirk(VFIODevice *vdev, int nr)
1550 {
1551     PCIDevice *pdev = &vdev->pdev;
1552     VFIOQuirk *quirk;
1553
1554     if (!vdev->has_vga || nr != 4 ||
1555         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1556         return;
1557     }
1558
1559     quirk = g_malloc0(sizeof(*quirk));
1560     quirk->vdev = vdev;
1561     quirk->data.address_size = 4;
1562     quirk->data.data_offset = 4;
1563     quirk->data.data_size = 4;
1564     quirk->data.address_match = 0x4000;
1565     quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
1566     quirk->data.bar = nr;
1567     quirk->data.read_flags = quirk->data.write_flags = 1;
1568
1569     memory_region_init_io(&quirk->mem, OBJECT(vdev),
1570                           &vfio_generic_window_quirk, quirk,
1571                           "vfio-ati-bar4-window-quirk", 8);
1572     memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
1573                           quirk->data.base_offset, &quirk->mem, 1);
1574
1575     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1576
1577     DPRINTF("Enabled ATI/AMD BAR4 window quirk for device %04x:%02x:%02x.%x\n",
1578             vdev->host.domain, vdev->host.bus, vdev->host.slot,
1579             vdev->host.function);
1580 }
1581
1582 /*
1583  * Trap the BAR2 MMIO window to config space as well.
1584  */
1585 static void vfio_probe_ati_bar2_4000_quirk(VFIODevice *vdev, int nr)
1586 {
1587     PCIDevice *pdev = &vdev->pdev;
1588     VFIOQuirk *quirk;
1589
1590     /* Only enable on newer devices where BAR2 is 64bit */
1591     if (!vdev->has_vga || nr != 2 || !vdev->bars[2].mem64 ||
1592         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1593         return;
1594     }
1595
1596     quirk = g_malloc0(sizeof(*quirk));
1597     quirk->vdev = vdev;
1598     quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
1599     quirk->data.address_match = 0x4000;
1600     quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
1601     quirk->data.bar = nr;
1602
1603     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
1604                           "vfio-ati-bar2-4000-quirk",
1605                           TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
1606     memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
1607                           quirk->data.address_match & TARGET_PAGE_MASK,
1608                           &quirk->mem, 1);
1609
1610     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1611
1612     DPRINTF("Enabled ATI/AMD BAR2 0x4000 quirk for device %04x:%02x:%02x.%x\n",
1613             vdev->host.domain, vdev->host.bus, vdev->host.slot,
1614             vdev->host.function);
1615 }
1616
1617 /*
1618  * Older ATI/AMD cards like the X550 have a similar window to that above.
1619  * I/O port BAR1 provides a window to a mirror of PCI config space located
1620  * in BAR2 at offset 0xf00.  We don't care to support such older cards, but
1621  * note it for future reference.
1622  */
1623
/* Vendor ID the NVIDIA quirk probes compare against the PCI_VENDOR_ID word */
#define PCI_VENDOR_ID_NVIDIA                    0x10de
1625
1626 /*
 * Nvidia has several different methods to get to config space; the
 * nouveau project has several of these documented here:
1629  * https://github.com/pathscale/envytools/tree/master/hwdocs
1630  *
1631  * The first quirk is actually not documented in envytools and is found
1632  * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]).  This is an
1633  * NV46 chipset.  The backdoor uses the legacy VGA I/O ports to access
1634  * the mirror of PCI config space found at BAR0 offset 0x1800.  The access
1635  * sequence first writes 0x338 to I/O port 0x3d4.  The target offset is
1636  * then written to 0x3d0.  Finally 0x538 is written for a read and 0x738
1637  * is written for a write to 0x3d4.  The BAR0 offset is then accessible
1638  * through 0x3d0.  This quirk doesn't seem to be necessary on newer cards
1639  * that use the I/O port BAR5 window but it doesn't hurt to leave it.
1640  */
/* States of the 0x3d4/0x3d0 backdoor access sequence, in the order they occur */
enum {
    NV_3D0_NONE   = 0,  /* idle, no sequence in progress */
    NV_3D0_SELECT = 1,  /* 0x338 was written to the address port */
    NV_3D0_WINDOW = 2,  /* a matching target offset was written */
    NV_3D0_READ   = 3,  /* 0x538 written: next data access is a read */
    NV_3D0_WRITE  = 4,  /* 0x738 written: next data access is a write */
};
1648
1649 static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
1650                                            hwaddr addr, unsigned size)
1651 {
1652     VFIOQuirk *quirk = opaque;
1653     VFIODevice *vdev = quirk->vdev;
1654     PCIDevice *pdev = &vdev->pdev;
1655     uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
1656                                   addr + quirk->data.base_offset, size);
1657
1658     if (quirk->data.flags == NV_3D0_READ && addr == quirk->data.data_offset) {
1659         data = vfio_pci_read_config(pdev, quirk->data.address_val, size);
1660         DPRINTF("%s(0x3d0, %d) = 0x%"PRIx64"\n", __func__, size, data);
1661     }
1662
1663     quirk->data.flags = NV_3D0_NONE;
1664
1665     return data;
1666 }
1667
1668 static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
1669                                         uint64_t data, unsigned size)
1670 {
1671     VFIOQuirk *quirk = opaque;
1672     VFIODevice *vdev = quirk->vdev;
1673     PCIDevice *pdev = &vdev->pdev;
1674
1675     switch (quirk->data.flags) {
1676     case NV_3D0_NONE:
1677         if (addr == quirk->data.address_offset && data == 0x338) {
1678             quirk->data.flags = NV_3D0_SELECT;
1679         }
1680         break;
1681     case NV_3D0_SELECT:
1682         quirk->data.flags = NV_3D0_NONE;
1683         if (addr == quirk->data.data_offset &&
1684             (data & ~quirk->data.address_mask) == quirk->data.address_match) {
1685             quirk->data.flags = NV_3D0_WINDOW;
1686             quirk->data.address_val = data & quirk->data.address_mask;
1687         }
1688         break;
1689     case NV_3D0_WINDOW:
1690         quirk->data.flags = NV_3D0_NONE;
1691         if (addr == quirk->data.address_offset) {
1692             if (data == 0x538) {
1693                 quirk->data.flags = NV_3D0_READ;
1694             } else if (data == 0x738) {
1695                 quirk->data.flags = NV_3D0_WRITE;
1696             }
1697         }
1698         break;
1699     case NV_3D0_WRITE:
1700         quirk->data.flags = NV_3D0_NONE;
1701         if (addr == quirk->data.data_offset) {
1702             vfio_pci_write_config(pdev, quirk->data.address_val, data, size);
1703             DPRINTF("%s(0x3d0, 0x%"PRIx64", %d)\n", __func__, data, size);
1704             return;
1705         }
1706         break;
1707     }
1708
1709     vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
1710                    addr + quirk->data.base_offset, data, size);
1711 }
1712
1713 static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
1714     .read = vfio_nvidia_3d0_quirk_read,
1715     .write = vfio_nvidia_3d0_quirk_write,
1716     .endianness = DEVICE_LITTLE_ENDIAN,
1717 };
1718
1719 static void vfio_vga_probe_nvidia_3d0_quirk(VFIODevice *vdev)
1720 {
1721     PCIDevice *pdev = &vdev->pdev;
1722     VFIOQuirk *quirk;
1723
1724     if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA ||
1725         !vdev->bars[1].size) {
1726         return;
1727     }
1728
1729     quirk = g_malloc0(sizeof(*quirk));
1730     quirk->vdev = vdev;
1731     quirk->data.base_offset = 0x10;
1732     quirk->data.address_offset = 4;
1733     quirk->data.address_size = 2;
1734     quirk->data.address_match = 0x1800;
1735     quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
1736     quirk->data.data_offset = 0;
1737     quirk->data.data_size = 4;
1738
1739     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_3d0_quirk,
1740                           quirk, "vfio-nvidia-3d0-quirk", 6);
1741     memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
1742                                 quirk->data.base_offset, &quirk->mem);
1743
1744     QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
1745                       quirk, next);
1746
1747     DPRINTF("Enabled NVIDIA VGA 0x3d0 quirk for device %04x:%02x:%02x.%x\n",
1748             vdev->host.domain, vdev->host.bus, vdev->host.slot,
1749             vdev->host.function);
1750 }
1751
/*
 * The second quirk is documented in envytools.  The I/O port BAR5 is just
 * a set of address/data ports to the MMIO BARs.  The BAR we care about is
 * again BAR0.  This backdoor is apparently a bit newer than the one above,
 * so we need to trap not only the 256 bytes @0x1800 but also the 4k
 * @0x88000, through which all of PCI config space, including extended
 * space, is available.
 *
 * The flags below track the state of the BAR5 window; NV_BAR5_VALID is the
 * combination of all three.
 */
enum {
    NV_BAR5_ADDRESS = 1 << 0,
    NV_BAR5_ENABLE = 1 << 1,
    NV_BAR5_MASTER = 1 << 2,
    NV_BAR5_VALID = NV_BAR5_ADDRESS | NV_BAR5_ENABLE | NV_BAR5_MASTER,
};
1765
1766 static void vfio_nvidia_bar5_window_quirk_write(void *opaque, hwaddr addr,
1767                                                 uint64_t data, unsigned size)
1768 {
1769     VFIOQuirk *quirk = opaque;
1770
1771     switch (addr) {
1772     case 0x0:
1773         if (data & 0x1) {
1774             quirk->data.flags |= NV_BAR5_MASTER;
1775         } else {
1776             quirk->data.flags &= ~NV_BAR5_MASTER;
1777         }
1778         break;
1779     case 0x4:
1780         if (data & 0x1) {
1781             quirk->data.flags |= NV_BAR5_ENABLE;
1782         } else {
1783             quirk->data.flags &= ~NV_BAR5_ENABLE;
1784         }
1785         break;
1786     case 0x8:
1787         if (quirk->data.flags & NV_BAR5_MASTER) {
1788             if ((data & ~0xfff) == 0x88000) {
1789                 quirk->data.flags |= NV_BAR5_ADDRESS;
1790                 quirk->data.address_val = data & 0xfff;
1791             } else if ((data & ~0xff) == 0x1800) {
1792                 quirk->data.flags |= NV_BAR5_ADDRESS;
1793                 quirk->data.address_val = data & 0xff;
1794             } else {
1795                 quirk->data.flags &= ~NV_BAR5_ADDRESS;
1796             }
1797         }
1798         break;
1799     }
1800
1801     vfio_generic_window_quirk_write(opaque, addr, data, size);
1802 }
1803
1804 static const MemoryRegionOps vfio_nvidia_bar5_window_quirk = {
1805     .read = vfio_generic_window_quirk_read,
1806     .write = vfio_nvidia_bar5_window_quirk_write,
1807     .valid.min_access_size = 4,
1808     .endianness = DEVICE_LITTLE_ENDIAN,
1809 };
1810
1811 static void vfio_probe_nvidia_bar5_window_quirk(VFIODevice *vdev, int nr)
1812 {
1813     PCIDevice *pdev = &vdev->pdev;
1814     VFIOQuirk *quirk;
1815
1816     if (!vdev->has_vga || nr != 5 ||
1817         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
1818         return;
1819     }
1820
1821     quirk = g_malloc0(sizeof(*quirk));
1822     quirk->vdev = vdev;
1823     quirk->data.read_flags = quirk->data.write_flags = NV_BAR5_VALID;
1824     quirk->data.address_offset = 0x8;
1825     quirk->data.address_size = 0; /* actually 4, but avoids generic code */
1826     quirk->data.data_offset = 0xc;
1827     quirk->data.data_size = 4;
1828     quirk->data.bar = nr;
1829
1830     memory_region_init_io(&quirk->mem, OBJECT(vdev),
1831                           &vfio_nvidia_bar5_window_quirk, quirk,
1832                           "vfio-nvidia-bar5-window-quirk", 16);
1833     memory_region_add_subregion_overlap(&vdev->bars[nr].mem, 0, &quirk->mem, 1);
1834
1835     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1836
1837     DPRINTF("Enabled NVIDIA BAR5 window quirk for device %04x:%02x:%02x.%x\n",
1838             vdev->host.domain, vdev->host.bus, vdev->host.slot,
1839             vdev->host.function);
1840 }
1841
1842 static void vfio_nvidia_88000_quirk_write(void *opaque, hwaddr addr,
1843                                           uint64_t data, unsigned size)
1844 {
1845     VFIOQuirk *quirk = opaque;
1846     VFIODevice *vdev = quirk->vdev;
1847     PCIDevice *pdev = &vdev->pdev;
1848     hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
1849
1850     vfio_generic_quirk_write(opaque, addr, data, size);
1851
1852     /*
1853      * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
1854      * MSI capability ID register.  Both the ID and next register are
1855      * read-only, so we allow writes covering either of those to real hw.
1856      * NB - only fixed for the 0x88000 MMIO window.
1857      */
1858     if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
1859         vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
1860         vfio_bar_write(&vdev->bars[quirk->data.bar], addr + base, data, size);
1861     }
1862 }
1863
1864 static const MemoryRegionOps vfio_nvidia_88000_quirk = {
1865     .read = vfio_generic_quirk_read,
1866     .write = vfio_nvidia_88000_quirk_write,
1867     .endianness = DEVICE_LITTLE_ENDIAN,
1868 };
1869
1870 /*
1871  * Finally, BAR0 itself.  We want to redirect any accesses to either
1872  * 0x1800 or 0x88000 through the PCI config space access functions.
1873  *
1874  * NB - quirk at a page granularity or else they don't seem to work when
1875  *      BARs are mmap'd
1876  *
1877  * Here's offset 0x88000...
1878  */
1879 static void vfio_probe_nvidia_bar0_88000_quirk(VFIODevice *vdev, int nr)
1880 {
1881     PCIDevice *pdev = &vdev->pdev;
1882     VFIOQuirk *quirk;
1883
1884     if (!vdev->has_vga || nr != 0 ||
1885         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
1886         return;
1887     }
1888
1889     quirk = g_malloc0(sizeof(*quirk));
1890     quirk->vdev = vdev;
1891     quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
1892     quirk->data.address_match = 0x88000;
1893     quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
1894     quirk->data.bar = nr;
1895
1896     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_88000_quirk,
1897                           quirk, "vfio-nvidia-bar0-88000-quirk",
1898                           TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
1899     memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
1900                           quirk->data.address_match & TARGET_PAGE_MASK,
1901                           &quirk->mem, 1);
1902
1903     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1904
1905     DPRINTF("Enabled NVIDIA BAR0 0x88000 quirk for device %04x:%02x:%02x.%x\n",
1906             vdev->host.domain, vdev->host.bus, vdev->host.slot,
1907             vdev->host.function);
1908 }
1909
1910 /*
1911  * And here's the same for BAR0 offset 0x1800...
1912  */
1913 static void vfio_probe_nvidia_bar0_1800_quirk(VFIODevice *vdev, int nr)
1914 {
1915     PCIDevice *pdev = &vdev->pdev;
1916     VFIOQuirk *quirk;
1917
1918     if (!vdev->has_vga || nr != 0 ||
1919         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
1920         return;
1921     }
1922
1923     /* Log the chipset ID */
1924     DPRINTF("Nvidia NV%02x\n",
1925             (unsigned int)(vfio_bar_read(&vdev->bars[0], 0, 4) >> 20) & 0xff);
1926
1927     quirk = g_malloc0(sizeof(*quirk));
1928     quirk->vdev = vdev;
1929     quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
1930     quirk->data.address_match = 0x1800;
1931     quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
1932     quirk->data.bar = nr;
1933
1934     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
1935                           "vfio-nvidia-bar0-1800-quirk",
1936                           TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
1937     memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
1938                           quirk->data.address_match & TARGET_PAGE_MASK,
1939                           &quirk->mem, 1);
1940
1941     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1942
1943     DPRINTF("Enabled NVIDIA BAR0 0x1800 quirk for device %04x:%02x:%02x.%x\n",
1944             vdev->host.domain, vdev->host.bus, vdev->host.slot,
1945             vdev->host.function);
1946 }
1947
1948 /*
1949  * TODO - Some Nvidia devices provide config access to their companion HDA
1950  * device and even to their parent bridge via these config space mirrors.
1951  * Add quirks for those regions.
1952  */
1953
1954 /*
1955  * Common quirk probe entry points.
1956  */
1957 static void vfio_vga_quirk_setup(VFIODevice *vdev)
1958 {
1959     vfio_vga_probe_ati_3c3_quirk(vdev);
1960     vfio_vga_probe_nvidia_3d0_quirk(vdev);
1961 }
1962
1963 static void vfio_vga_quirk_teardown(VFIODevice *vdev)
1964 {
1965     int i;
1966
1967     for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) {
1968         while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) {
1969             VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks);
1970             memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem);
1971             QLIST_REMOVE(quirk, next);
1972             g_free(quirk);
1973         }
1974     }
1975 }
1976
1977 static void vfio_bar_quirk_setup(VFIODevice *vdev, int nr)
1978 {
1979     vfio_probe_ati_bar4_window_quirk(vdev, nr);
1980     vfio_probe_ati_bar2_4000_quirk(vdev, nr);
1981     vfio_probe_nvidia_bar5_window_quirk(vdev, nr);
1982     vfio_probe_nvidia_bar0_88000_quirk(vdev, nr);
1983     vfio_probe_nvidia_bar0_1800_quirk(vdev, nr);
1984 }
1985
1986 static void vfio_bar_quirk_teardown(VFIODevice *vdev, int nr)
1987 {
1988     VFIOBAR *bar = &vdev->bars[nr];
1989
1990     while (!QLIST_EMPTY(&bar->quirks)) {
1991         VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
1992         memory_region_del_subregion(&bar->mem, &quirk->mem);
1993         QLIST_REMOVE(quirk, next);
1994         g_free(quirk);
1995     }
1996 }
1997
1998 /*
1999  * PCI config space
2000  */
2001 static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
2002 {
2003     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
2004     uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
2005
2006     memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
2007     emu_bits = le32_to_cpu(emu_bits);
2008
2009     if (emu_bits) {
2010         emu_val = pci_default_read_config(pdev, addr, len);
2011     }
2012
2013     if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
2014         ssize_t ret;
2015
2016         ret = pread(vdev->fd, &phys_val, len, vdev->config_offset + addr);
2017         if (ret != len) {
2018             error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m",
2019                          __func__, vdev->host.domain, vdev->host.bus,
2020                          vdev->host.slot, vdev->host.function, addr, len);
2021             return -errno;
2022         }
2023         phys_val = le32_to_cpu(phys_val);
2024     }
2025
2026     val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
2027
2028     DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, len=0x%x) %x\n", __func__,
2029             vdev->host.domain, vdev->host.bus, vdev->host.slot,
2030             vdev->host.function, addr, len, val);
2031
2032     return val;
2033 }
2034
2035 static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
2036                                   uint32_t val, int len)
2037 {
2038     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
2039     uint32_t val_le = cpu_to_le32(val);
2040
2041     DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, 0x%x, len=0x%x)\n", __func__,
2042             vdev->host.domain, vdev->host.bus, vdev->host.slot,
2043             vdev->host.function, addr, val, len);
2044
2045     /* Write everything to VFIO, let it filter out what we can't write */
2046     if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len) {
2047         error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m",
2048                      __func__, vdev->host.domain, vdev->host.bus,
2049                      vdev->host.slot, vdev->host.function, addr, val, len);
2050     }
2051
2052     /* MSI/MSI-X Enabling/Disabling */
2053     if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
2054         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
2055         int is_enabled, was_enabled = msi_enabled(pdev);
2056
2057         pci_default_write_config(pdev, addr, val, len);
2058
2059         is_enabled = msi_enabled(pdev);
2060
2061         if (!was_enabled) {
2062             if (is_enabled) {
2063                 vfio_enable_msi(vdev);
2064             }
2065         } else {
2066             if (!is_enabled) {
2067                 vfio_disable_msi(vdev);
2068             } else {
2069                 vfio_update_msi(vdev);
2070             }
2071         }
2072     } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
2073         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
2074         int is_enabled, was_enabled = msix_enabled(pdev);
2075
2076         pci_default_write_config(pdev, addr, val, len);
2077
2078         is_enabled = msix_enabled(pdev);
2079
2080         if (!was_enabled && is_enabled) {
2081             vfio_enable_msix(vdev);
2082         } else if (was_enabled && !is_enabled) {
2083             vfio_disable_msix(vdev);
2084         }
2085     } else {
2086         /* Write everything to QEMU to keep emulated bits correct */
2087         pci_default_write_config(pdev, addr, val, len);
2088     }
2089 }
2090
2091 /*
2092  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
2093  */
2094 static int vfio_dma_unmap(VFIOContainer *container,
2095                           hwaddr iova, ram_addr_t size)
2096 {
2097     struct vfio_iommu_type1_dma_unmap unmap = {
2098         .argsz = sizeof(unmap),
2099         .flags = 0,
2100         .iova = iova,
2101         .size = size,
2102     };
2103
2104     if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
2105         DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
2106         return -errno;
2107     }
2108
2109     return 0;
2110 }
2111
2112 static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
2113                         ram_addr_t size, void *vaddr, bool readonly)
2114 {
2115     struct vfio_iommu_type1_dma_map map = {
2116         .argsz = sizeof(map),
2117         .flags = VFIO_DMA_MAP_FLAG_READ,
2118         .vaddr = (__u64)(uintptr_t)vaddr,
2119         .iova = iova,
2120         .size = size,
2121     };
2122
2123     if (!readonly) {
2124         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
2125     }
2126
2127     /*
2128      * Try the mapping, if it fails with EBUSY, unmap the region and try
2129      * again.  This shouldn't be necessary, but we sometimes see it in
2130      * the the VGA ROM space.
2131      */
2132     if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
2133         (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
2134          ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
2135         return 0;
2136     }
2137
2138     DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
2139     return -errno;
2140 }
2141
2142 static bool vfio_listener_skipped_section(MemoryRegionSection *section)
2143 {
2144     return !memory_region_is_ram(section->mr);
2145 }
2146
2147 static void vfio_listener_region_add(MemoryListener *listener,
2148                                      MemoryRegionSection *section)
2149 {
2150     VFIOContainer *container = container_of(listener, VFIOContainer,
2151                                             iommu_data.listener);
2152     hwaddr iova, end;
2153     void *vaddr;
2154     int ret;
2155
2156     assert(!memory_region_is_iommu(section->mr));
2157
2158     if (vfio_listener_skipped_section(section)) {
2159         DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
2160                 section->offset_within_address_space,
2161                 section->offset_within_address_space +
2162                 int128_get64(int128_sub(section->size, int128_one())));
2163         return;
2164     }
2165
2166     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
2167                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
2168         error_report("%s received unaligned region", __func__);
2169         return;
2170     }
2171
2172     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
2173     end = (section->offset_within_address_space + int128_get64(section->size)) &
2174           TARGET_PAGE_MASK;
2175
2176     if (iova >= end) {
2177         return;
2178     }
2179
2180     vaddr = memory_region_get_ram_ptr(section->mr) +
2181             section->offset_within_region +
2182             (iova - section->offset_within_address_space);
2183
2184     DPRINTF("region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
2185             iova, end - 1, vaddr);
2186
2187     memory_region_ref(section->mr);
2188     ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
2189     if (ret) {
2190         error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
2191                      "0x%"HWADDR_PRIx", %p) = %d (%m)",
2192                      container, iova, end - iova, vaddr, ret);
2193     }
2194 }
2195
2196 static void vfio_listener_region_del(MemoryListener *listener,
2197                                      MemoryRegionSection *section)
2198 {
2199     VFIOContainer *container = container_of(listener, VFIOContainer,
2200                                             iommu_data.listener);
2201     hwaddr iova, end;
2202     int ret;
2203
2204     if (vfio_listener_skipped_section(section)) {
2205         DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
2206                 section->offset_within_address_space,
2207                 section->offset_within_address_space +
2208                 int128_get64(int128_sub(section->size, int128_one())));
2209         return;
2210     }
2211
2212     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
2213                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
2214         error_report("%s received unaligned region", __func__);
2215         return;
2216     }
2217
2218     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
2219     end = (section->offset_within_address_space + int128_get64(section->size)) &
2220           TARGET_PAGE_MASK;
2221
2222     if (iova >= end) {
2223         return;
2224     }
2225
2226     DPRINTF("region_del %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
2227             iova, end - 1);
2228
2229     ret = vfio_dma_unmap(container, iova, end - iova);
2230     memory_region_unref(section->mr);
2231     if (ret) {
2232         error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
2233                      "0x%"HWADDR_PRIx") = %d (%m)",
2234                      container, iova, end - iova, ret);
2235     }
2236 }
2237
2238 static MemoryListener vfio_memory_listener = {
2239     .region_add = vfio_listener_region_add,
2240     .region_del = vfio_listener_region_del,
2241 };
2242
2243 static void vfio_listener_release(VFIOContainer *container)
2244 {
2245     memory_listener_unregister(&container->iommu_data.listener);
2246 }
2247
2248 /*
2249  * Interrupt setup
2250  */
2251 static void vfio_disable_interrupts(VFIODevice *vdev)
2252 {
2253     switch (vdev->interrupt) {
2254     case VFIO_INT_INTx:
2255         vfio_disable_intx(vdev);
2256         break;
2257     case VFIO_INT_MSI:
2258         vfio_disable_msi(vdev);
2259         break;
2260     case VFIO_INT_MSIX:
2261         vfio_disable_msix(vdev);
2262         break;
2263     }
2264 }
2265
2266 static int vfio_setup_msi(VFIODevice *vdev, int pos)
2267 {
2268     uint16_t ctrl;
2269     bool msi_64bit, msi_maskbit;
2270     int ret, entries;
2271
2272     if (pread(vdev->fd, &ctrl, sizeof(ctrl),
2273               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
2274         return -errno;
2275     }
2276     ctrl = le16_to_cpu(ctrl);
2277
2278     msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
2279     msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
2280     entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
2281
2282     DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.domain,
2283             vdev->host.bus, vdev->host.slot, vdev->host.function, pos);
2284
2285     ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
2286     if (ret < 0) {
2287         if (ret == -ENOTSUP) {
2288             return 0;
2289         }
2290         error_report("vfio: msi_init failed");
2291         return ret;
2292     }
2293     vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
2294
2295     return 0;
2296 }
2297
2298 /*
2299  * We don't have any control over how pci_add_capability() inserts
2300  * capabilities into the chain.  In order to setup MSI-X we need a
2301  * MemoryRegion for the BAR.  In order to setup the BAR and not
2302  * attempt to mmap the MSI-X table area, which VFIO won't allow, we
2303  * need to first look for where the MSI-X table lives.  So we
2304  * unfortunately split MSI-X setup across two functions.
2305  */
2306 static int vfio_early_setup_msix(VFIODevice *vdev)
2307 {
2308     uint8_t pos;
2309     uint16_t ctrl;
2310     uint32_t table, pba;
2311
2312     pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
2313     if (!pos) {
2314         return 0;
2315     }
2316
2317     if (pread(vdev->fd, &ctrl, sizeof(ctrl),
2318               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
2319         return -errno;
2320     }
2321
2322     if (pread(vdev->fd, &table, sizeof(table),
2323               vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
2324         return -errno;
2325     }
2326
2327     if (pread(vdev->fd, &pba, sizeof(pba),
2328               vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
2329         return -errno;
2330     }
2331
2332     ctrl = le16_to_cpu(ctrl);
2333     table = le32_to_cpu(table);
2334     pba = le32_to_cpu(pba);
2335
2336     vdev->msix = g_malloc0(sizeof(*(vdev->msix)));
2337     vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
2338     vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
2339     vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
2340     vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
2341     vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
2342
2343     DPRINTF("%04x:%02x:%02x.%x "
2344             "PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d\n",
2345             vdev->host.domain, vdev->host.bus, vdev->host.slot,
2346             vdev->host.function, pos, vdev->msix->table_bar,
2347             vdev->msix->table_offset, vdev->msix->entries);
2348
2349     return 0;
2350 }
2351
2352 static int vfio_setup_msix(VFIODevice *vdev, int pos)
2353 {
2354     int ret;
2355
2356     ret = msix_init(&vdev->pdev, vdev->msix->entries,
2357                     &vdev->bars[vdev->msix->table_bar].mem,
2358                     vdev->msix->table_bar, vdev->msix->table_offset,
2359                     &vdev->bars[vdev->msix->pba_bar].mem,
2360                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
2361     if (ret < 0) {
2362         if (ret == -ENOTSUP) {
2363             return 0;
2364         }
2365         error_report("vfio: msix_init failed");
2366         return ret;
2367     }
2368
2369     return 0;
2370 }
2371
2372 static void vfio_teardown_msi(VFIODevice *vdev)
2373 {
2374     msi_uninit(&vdev->pdev);
2375
2376     if (vdev->msix) {
2377         msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
2378                     &vdev->bars[vdev->msix->pba_bar].mem);
2379     }
2380 }
2381
2382 /*
2383  * Resource setup
2384  */
2385 static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled)
2386 {
2387     int i;
2388
2389     for (i = 0; i < PCI_ROM_SLOT; i++) {
2390         VFIOBAR *bar = &vdev->bars[i];
2391
2392         if (!bar->size) {
2393             continue;
2394         }
2395
2396         memory_region_set_enabled(&bar->mmap_mem, enabled);
2397         if (vdev->msix && vdev->msix->table_bar == i) {
2398             memory_region_set_enabled(&vdev->msix->mmap_mem, enabled);
2399         }
2400     }
2401 }
2402
2403 static void vfio_unmap_bar(VFIODevice *vdev, int nr)
2404 {
2405     VFIOBAR *bar = &vdev->bars[nr];
2406
2407     if (!bar->size) {
2408         return;
2409     }
2410
2411     vfio_bar_quirk_teardown(vdev, nr);
2412
2413     memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
2414     munmap(bar->mmap, memory_region_size(&bar->mmap_mem));
2415
2416     if (vdev->msix && vdev->msix->table_bar == nr) {
2417         memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
2418         munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
2419     }
2420
2421     memory_region_destroy(&bar->mem);
2422 }
2423
2424 static int vfio_mmap_bar(VFIODevice *vdev, VFIOBAR *bar,
2425                          MemoryRegion *mem, MemoryRegion *submem,
2426                          void **map, size_t size, off_t offset,
2427                          const char *name)
2428 {
2429     int ret = 0;
2430
2431     if (VFIO_ALLOW_MMAP && size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) {
2432         int prot = 0;
2433
2434         if (bar->flags & VFIO_REGION_INFO_FLAG_READ) {
2435             prot |= PROT_READ;
2436         }
2437
2438         if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) {
2439             prot |= PROT_WRITE;
2440         }
2441
2442         *map = mmap(NULL, size, prot, MAP_SHARED,
2443                     bar->fd, bar->fd_offset + offset);
2444         if (*map == MAP_FAILED) {
2445             *map = NULL;
2446             ret = -errno;
2447             goto empty_region;
2448         }
2449
2450         memory_region_init_ram_ptr(submem, OBJECT(vdev), name, size, *map);
2451     } else {
2452 empty_region:
2453         /* Create a zero sized sub-region to make cleanup easy. */
2454         memory_region_init(submem, OBJECT(vdev), name, 0);
2455     }
2456
2457     memory_region_add_subregion(mem, offset, submem);
2458
2459     return ret;
2460 }
2461
2462 static void vfio_map_bar(VFIODevice *vdev, int nr)
2463 {
2464     VFIOBAR *bar = &vdev->bars[nr];
2465     unsigned size = bar->size;
2466     char name[64];
2467     uint32_t pci_bar;
2468     uint8_t type;
2469     int ret;
2470
2471     /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
2472     if (!size) {
2473         return;
2474     }
2475
2476     snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
2477              vdev->host.domain, vdev->host.bus, vdev->host.slot,
2478              vdev->host.function, nr);
2479
2480     /* Determine what type of BAR this is for registration */
2481     ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
2482                 vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
2483     if (ret != sizeof(pci_bar)) {
2484         error_report("vfio: Failed to read BAR %d (%m)", nr);
2485         return;
2486     }
2487
2488     pci_bar = le32_to_cpu(pci_bar);
2489     bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
2490     bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
2491     type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
2492                                     ~PCI_BASE_ADDRESS_MEM_MASK);
2493
2494     /* A "slow" read/write mapping underlies all BARs */
2495     memory_region_init_io(&bar->mem, OBJECT(vdev), &vfio_bar_ops,
2496                           bar, name, size);
2497     pci_register_bar(&vdev->pdev, nr, type, &bar->mem);
2498
2499     /*
2500      * We can't mmap areas overlapping the MSIX vector table, so we
2501      * potentially insert a direct-mapped subregion before and after it.
2502      */
2503     if (vdev->msix && vdev->msix->table_bar == nr) {
2504         size = vdev->msix->table_offset & TARGET_PAGE_MASK;
2505     }
2506
2507     strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
2508     if (vfio_mmap_bar(vdev, bar, &bar->mem,
2509                       &bar->mmap_mem, &bar->mmap, size, 0, name)) {
2510         error_report("%s unsupported. Performance may be slow", name);
2511     }
2512
2513     if (vdev->msix && vdev->msix->table_bar == nr) {
2514         unsigned start;
2515
2516         start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
2517                                   (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
2518
2519         size = start < bar->size ? bar->size - start : 0;
2520         strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
2521         /* VFIOMSIXInfo contains another MemoryRegion for this mapping */
2522         if (vfio_mmap_bar(vdev, bar, &bar->mem, &vdev->msix->mmap_mem,
2523                           &vdev->msix->mmap, size, start, name)) {
2524             error_report("%s unsupported. Performance may be slow", name);
2525         }
2526     }
2527
2528     vfio_bar_quirk_setup(vdev, nr);
2529 }
2530
2531 static void vfio_map_bars(VFIODevice *vdev)
2532 {
2533     int i;
2534
2535     for (i = 0; i < PCI_ROM_SLOT; i++) {
2536         vfio_map_bar(vdev, i);
2537     }
2538
2539     if (vdev->has_vga) {
2540         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
2541                               OBJECT(vdev), &vfio_vga_ops,
2542                               &vdev->vga.region[QEMU_PCI_VGA_MEM],
2543                               "vfio-vga-mmio@0xa0000",
2544                               QEMU_PCI_VGA_MEM_SIZE);
2545         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
2546                               OBJECT(vdev), &vfio_vga_ops,
2547                               &vdev->vga.region[QEMU_PCI_VGA_IO_LO],
2548                               "vfio-vga-io@0x3b0",
2549                               QEMU_PCI_VGA_IO_LO_SIZE);
2550         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
2551                               OBJECT(vdev), &vfio_vga_ops,
2552                               &vdev->vga.region[QEMU_PCI_VGA_IO_HI],
2553                               "vfio-vga-io@0x3c0",
2554                               QEMU_PCI_VGA_IO_HI_SIZE);
2555
2556         pci_register_vga(&vdev->pdev, &vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
2557                          &vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
2558                          &vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);
2559         vfio_vga_quirk_setup(vdev);
2560     }
2561 }
2562
2563 static void vfio_unmap_bars(VFIODevice *vdev)
2564 {
2565     int i;
2566
2567     for (i = 0; i < PCI_ROM_SLOT; i++) {
2568         vfio_unmap_bar(vdev, i);
2569     }
2570
2571     if (vdev->has_vga) {
2572         vfio_vga_quirk_teardown(vdev);
2573         pci_unregister_vga(&vdev->pdev);
2574         memory_region_destroy(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem);
2575         memory_region_destroy(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem);
2576         memory_region_destroy(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);
2577     }
2578 }
2579
2580 /*
2581  * General setup
2582  */
2583 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
2584 {
2585     uint8_t tmp, next = 0xff;
2586
2587     for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
2588          tmp = pdev->config[tmp + 1]) {
2589         if (tmp > pos && tmp < next) {
2590             next = tmp;
2591         }
2592     }
2593
2594     return next - pos;
2595 }
2596
/*
 * Update the 16-bit word at @buf: clear the bits selected by @mask, then
 * OR in @val.
 */
static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
{
    uint16_t word = pci_get_word(buf);

    word &= ~mask;
    word |= val;
    pci_set_word(buf, word);
}
2601
2602 static void vfio_add_emulated_word(VFIODevice *vdev, int pos,
2603                                    uint16_t val, uint16_t mask)
2604 {
2605     vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
2606     vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
2607     vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
2608 }
2609
/*
 * Update the 32-bit long at @buf: clear the bits selected by @mask, then
 * OR in @val.
 */
static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
{
    uint32_t dword = pci_get_long(buf);

    dword &= ~mask;
    dword |= val;
    pci_set_long(buf, dword);
}
2614
2615 static void vfio_add_emulated_long(VFIODevice *vdev, int pos,
2616                                    uint32_t val, uint32_t mask)
2617 {
2618     vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
2619     vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
2620     vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
2621 }
2622
2623 static int vfio_setup_pcie_cap(VFIODevice *vdev, int pos, uint8_t size)
2624 {
2625     uint16_t flags;
2626     uint8_t type;
2627
2628     flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
2629     type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
2630
2631     if (type != PCI_EXP_TYPE_ENDPOINT &&
2632         type != PCI_EXP_TYPE_LEG_END &&
2633         type != PCI_EXP_TYPE_RC_END) {
2634
2635         error_report("vfio: Assignment of PCIe type 0x%x "
2636                      "devices is not currently supported", type);
2637         return -EINVAL;
2638     }
2639
2640     if (!pci_bus_is_express(vdev->pdev.bus)) {
2641         /*
2642          * Use express capability as-is on PCI bus.  It doesn't make much
2643          * sense to even expose, but some drivers (ex. tg3) depend on it
2644          * and guests don't seem to be particular about it.  We'll need
2645          * to revist this or force express devices to express buses if we
2646          * ever expose an IOMMU to the guest.
2647          */
2648     } else if (pci_bus_is_root(vdev->pdev.bus)) {
2649         /*
2650          * On a Root Complex bus Endpoints become Root Complex Integrated
2651          * Endpoints, which changes the type and clears the LNK & LNK2 fields.
2652          */
2653         if (type == PCI_EXP_TYPE_ENDPOINT) {
2654             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
2655                                    PCI_EXP_TYPE_RC_END << 4,
2656                                    PCI_EXP_FLAGS_TYPE);
2657
2658             /* Link Capabilities, Status, and Control goes away */
2659             if (size > PCI_EXP_LNKCTL) {
2660                 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
2661                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
2662                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);
2663
2664 #ifndef PCI_EXP_LNKCAP2
2665 #define PCI_EXP_LNKCAP2 44
2666 #endif
2667 #ifndef PCI_EXP_LNKSTA2
2668 #define PCI_EXP_LNKSTA2 50
2669 #endif
2670                 /* Link 2 Capabilities, Status, and Control goes away */
2671                 if (size > PCI_EXP_LNKCAP2) {
2672                     vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
2673                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
2674                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
2675                 }
2676             }
2677
2678         } else if (type == PCI_EXP_TYPE_LEG_END) {
2679             /*
2680              * Legacy endpoints don't belong on the root complex.  Windows
2681              * seems to be happier with devices if we skip the capability.
2682              */
2683             return 0;
2684         }
2685
2686     } else {
2687         /*
2688          * Convert Root Complex Integrated Endpoints to regular endpoints.
2689          * These devices don't support LNK/LNK2 capabilities, so make them up.
2690          */
2691         if (type == PCI_EXP_TYPE_RC_END) {
2692             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
2693                                    PCI_EXP_TYPE_ENDPOINT << 4,
2694                                    PCI_EXP_FLAGS_TYPE);
2695             vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
2696                                    PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25, ~0);
2697             vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
2698         }
2699
2700         /* Mark the Link Status bits as emulated to allow virtual negotiation */
2701         vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA,
2702                                pci_get_word(vdev->pdev.config + pos +
2703                                             PCI_EXP_LNKSTA),
2704                                PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
2705     }
2706
2707     pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size);
2708     if (pos >= 0) {
2709         vdev->pdev.exp.exp_cap = pos;
2710     }
2711
2712     return pos;
2713 }
2714
2715 static void vfio_check_pcie_flr(VFIODevice *vdev, uint8_t pos)
2716 {
2717     uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
2718
2719     if (cap & PCI_EXP_DEVCAP_FLR) {
2720         DPRINTF("%04x:%02x:%02x.%x Supports FLR via PCIe cap\n",
2721                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
2722                 vdev->host.function);
2723         vdev->has_flr = true;
2724     }
2725 }
2726
2727 static void vfio_check_pm_reset(VFIODevice *vdev, uint8_t pos)
2728 {
2729     uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
2730
2731     if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
2732         DPRINTF("%04x:%02x:%02x.%x Supports PM reset\n",
2733                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
2734                 vdev->host.function);
2735         vdev->has_pm_reset = true;
2736     }
2737 }
2738
2739 static void vfio_check_af_flr(VFIODevice *vdev, uint8_t pos)
2740 {
2741     uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
2742
2743     if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
2744         DPRINTF("%04x:%02x:%02x.%x Supports FLR via AF cap\n",
2745                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
2746                 vdev->host.function);
2747         vdev->has_flr = true;
2748     }
2749 }
2750
2751 static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos)
2752 {
2753     PCIDevice *pdev = &vdev->pdev;
2754     uint8_t cap_id, next, size;
2755     int ret;
2756
2757     cap_id = pdev->config[pos];
2758     next = pdev->config[pos + 1];
2759
2760     /*
2761      * If it becomes important to configure capabilities to their actual
2762      * size, use this as the default when it's something we don't recognize.
2763      * Since QEMU doesn't actually handle many of the config accesses,
2764      * exact size doesn't seem worthwhile.
2765      */
2766     size = vfio_std_cap_max_size(pdev, pos);
2767
2768     /*
2769      * pci_add_capability always inserts the new capability at the head
2770      * of the chain.  Therefore to end up with a chain that matches the
2771      * physical device, we insert from the end by making this recursive.
2772      * This is also why we pre-caclulate size above as cached config space
2773      * will be changed as we unwind the stack.
2774      */
2775     if (next) {
2776         ret = vfio_add_std_cap(vdev, next);
2777         if (ret) {
2778             return ret;
2779         }
2780     } else {
2781         /* Begin the rebuild, use QEMU emulated list bits */
2782         pdev->config[PCI_CAPABILITY_LIST] = 0;
2783         vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
2784         vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
2785     }
2786
2787     /* Use emulated next pointer to allow dropping caps */
2788     pci_set_byte(vdev->emulated_config_bits + pos + 1, 0xff);
2789
2790     switch (cap_id) {
2791     case PCI_CAP_ID_MSI:
2792         ret = vfio_setup_msi(vdev, pos);
2793         break;
2794     case PCI_CAP_ID_EXP:
2795         vfio_check_pcie_flr(vdev, pos);
2796         ret = vfio_setup_pcie_cap(vdev, pos, size);
2797         break;
2798     case PCI_CAP_ID_MSIX:
2799         ret = vfio_setup_msix(vdev, pos);
2800         break;
2801     case PCI_CAP_ID_PM:
2802         vfio_check_pm_reset(vdev, pos);
2803         vdev->pm_cap = pos;
2804         ret = pci_add_capability(pdev, cap_id, pos, size);
2805         break;
2806     case PCI_CAP_ID_AF:
2807         vfio_check_af_flr(vdev, pos);
2808         ret = pci_add_capability(pdev, cap_id, pos, size);
2809         break;
2810     default:
2811         ret = pci_add_capability(pdev, cap_id, pos, size);
2812         break;
2813     }
2814
2815     if (ret < 0) {
2816         error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
2817                      "0x%x[0x%x]@0x%x: %d", vdev->host.domain,
2818                      vdev->host.bus, vdev->host.slot, vdev->host.function,
2819                      cap_id, size, pos, ret);
2820         return ret;
2821     }
2822
2823     return 0;
2824 }
2825
2826 static int vfio_add_capabilities(VFIODevice *vdev)
2827 {
2828     PCIDevice *pdev = &vdev->pdev;
2829
2830     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
2831         !pdev->config[PCI_CAPABILITY_LIST]) {
2832         return 0; /* Nothing to add */
2833     }
2834
2835     return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
2836 }
2837
2838 static void vfio_pci_pre_reset(VFIODevice *vdev)
2839 {
2840     PCIDevice *pdev = &vdev->pdev;
2841     uint16_t cmd;
2842
2843     vfio_disable_interrupts(vdev);
2844
2845     /* Make sure the device is in D0 */
2846     if (vdev->pm_cap) {
2847         uint16_t pmcsr;
2848         uint8_t state;
2849
2850         pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2851         state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2852         if (state) {
2853             pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
2854             vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
2855             /* vfio handles the necessary delay here */
2856             pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2857             state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2858             if (state) {
2859                 error_report("vfio: Unable to power on device, stuck in D%d\n",
2860                              state);
2861             }
2862         }
2863     }
2864
2865     /*
2866      * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master.
2867      * Also put INTx Disable in known state.
2868      */
2869     cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
2870     cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
2871              PCI_COMMAND_INTX_DISABLE);
2872     vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
2873 }
2874
/*
 * Counterpart of vfio_pci_pre_reset(), run once a reset attempt is over:
 * re-enables INTx on the device.
 */
static void vfio_pci_post_reset(VFIODevice *vdev)
{
    vfio_enable_intx(vdev);
}
2879
2880 static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
2881                                 PCIHostDeviceAddress *host2)
2882 {
2883     return (host1->domain == host2->domain && host1->bus == host2->bus &&
2884             host1->slot == host2->slot && host1->function == host2->function);
2885 }
2886
/*
 * Attempt a hot reset of @vdev through VFIO_DEVICE_PCI_HOT_RESET.
 *
 * @single: when true, give up with -EINVAL as soon as another in-use
 *          VFIODevice is found among the dependent devices; when false,
 *          give up with -EINVAL unless at least one such device was found.
 *
 * @vdev, and every other in-use device that gets prepared, has
 * vfio_pci_pre_reset() applied and needs_reset cleared before the reset,
 * and vfio_pci_post_reset() applied afterwards.
 *
 * Returns the result of the VFIO_DEVICE_PCI_HOT_RESET ioctl when it is
 * reached, -errno when the dependent device info cannot be retrieved,
 * -EPERM when a dependent device belongs to a group that is not in
 * group_list, or -EINVAL as described for @single.
 */
static int vfio_pci_hot_reset(VFIODevice *vdev, bool single)
{
    VFIOGroup *group;
    struct vfio_pci_hot_reset_info *info;
    struct vfio_pci_dependent_device *devices;
    struct vfio_pci_hot_reset *reset;
    int32_t *fds;
    int ret, i, count;
    bool multi = false;

    DPRINTF("%s(%04x:%02x:%02x.%x) %s\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function,
            single ? "one" : "multi");

    vfio_pci_pre_reset(vdev);
    vdev->needs_reset = false;

    /*
     * First query with a header-only buffer to learn info->count.  ENOSPC
     * is not treated as a failure here (presumably how the kernel reports
     * that the devices array did not fit -- confirm against the VFIO API).
     */
    info = g_malloc0(sizeof(*info));
    info->argsz = sizeof(*info);

    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
    if (ret && errno != ENOSPC) {
        ret = -errno;
        if (!vdev->has_pm_reset) {
            error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
                         "no available reset mechanism.", vdev->host.domain,
                         vdev->host.bus, vdev->host.slot, vdev->host.function);
        }
        goto out_single;
    }

    /* Second query, now with room for every dependent device */
    count = info->count;
    info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
    info->argsz = sizeof(*info) + (count * sizeof(*devices));
    devices = &info->devices[0];

    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
    if (ret) {
        ret = -errno;
        error_report("vfio: hot reset info failed: %m");
        goto out_single;
    }

    DPRINTF("%04x:%02x:%02x.%x: hot reset dependent devices:\n",
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function);

    /* Verify that we have all the groups required */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIODevice *tmp;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        DPRINTF("\t%04x:%02x:%02x.%x group %d\n", host.domain,
                host.bus, host.slot, host.function, devices[i].group_id);

        /* The device being reset was already prepared above */
        if (vfio_pci_host_match(&host, &vdev->host)) {
            continue;
        }

        /* group is left NULL when no entry of group_list matches */
        QLIST_FOREACH(group, &group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        if (!group) {
            if (!vdev->has_pm_reset) {
                error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
                             "depends on group %d which is not owned.",
                             vdev->host.domain, vdev->host.bus, vdev->host.slot,
                             vdev->host.function, devices[i].group_id);
            }
            ret = -EPERM;
            goto out;
        }

        /* Prep dependent devices for reset and clear our marker. */
        QLIST_FOREACH(tmp, &group->device_list, next) {
            if (vfio_pci_host_match(&host, &tmp->host)) {
                if (single) {
                    DPRINTF("vfio: found another in-use device "
                            "%04x:%02x:%02x.%x\n", host.domain, host.bus,
                            host.slot, host.function);
                    ret = -EINVAL;
                    goto out_single;
                }
                vfio_pci_pre_reset(tmp);
                tmp->needs_reset = false;
                multi = true;
                break;
            }
        }
    }

    if (!single && !multi) {
        DPRINTF("vfio: No other in-use devices for multi hot reset\n");
        ret = -EINVAL;
        goto out_single;
    }

    /* Determine how many group fds need to be passed */
    count = 0;
    QLIST_FOREACH(group, &group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                count++;
                break;
            }
        }
    }

    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
    fds = &reset->group_fds[0];

    /* Fill in group fds */
    QLIST_FOREACH(group, &group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                fds[reset->count++] = group->fd;
                break;
            }
        }
    }

    /* Bus reset! */
    ret = ioctl(vdev->fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
    g_free(reset);

    DPRINTF("%04x:%02x:%02x.%x hot reset: %s\n", vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function,
            ret ? "%m" : "Success");

out:
    /* Re-enable INTx on affected devices */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIODevice *tmp;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        if (vfio_pci_host_match(&host, &vdev->host)) {
            continue;
        }

        QLIST_FOREACH(group, &group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        /*
         * Stop at the first dependent device whose group is not in
         * group_list, mirroring where the verification loop above bails out.
         */
        if (!group) {
            break;
        }

        QLIST_FOREACH(tmp, &group->device_list, next) {
            if (vfio_pci_host_match(&host, &tmp->host)) {
                vfio_pci_post_reset(tmp);
                break;
            }
        }
    }
out_single:
    /* Reached on every path: restore @vdev itself and drop the info buffer */
    vfio_pci_post_reset(vdev);
    g_free(info);

    return ret;
}
3063
/*
 * We want to differentiate hot reset of multiple in-use devices vs hot reset
 * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
 * of doing hot resets when there is only a single device per bus.  The in-use
 * here refers to how many VFIODevices are affected.  A hot reset that affects
 * multiple devices, but only a single in-use device, means that we can call
 * it from our bus ->reset() callback since the extent is effectively a single
 * device.  This allows us to make use of it in the hotplug path.  When there
 * are multiple in-use devices, we can only trigger the hot reset during a
 * system reset and thus from our reset handler.  We separate _one vs _multi
 * here so that we don't overlap and do a double reset on the system reset
 * path where both our reset handler and ->reset() callback are used.  Calling
 * _one() will only do a hot reset for the one in-use devices case, calling
 * _multi() will do nothing if a _one() would have been sufficient.
 */

/* Hot reset @vdev only if no other in-use device would be affected. */
static int vfio_pci_hot_reset_one(VFIODevice *vdev)
{
    return vfio_pci_hot_reset(vdev, true);
}
3083
/*
 * Hot reset @vdev together with the other in-use devices it affects; calls
 * vfio_pci_hot_reset() with single == false, which fails with -EINVAL when
 * no other in-use device is affected.
 */
static int vfio_pci_hot_reset_multi(VFIODevice *vdev)
{
    return vfio_pci_hot_reset(vdev, false);
}
3088
3089 static void vfio_pci_reset_handler(void *opaque)
3090 {
3091     VFIOGroup *group;
3092     VFIODevice *vdev;
3093
3094     QLIST_FOREACH(group, &group_list, next) {
3095         QLIST_FOREACH(vdev, &group->device_list, next) {
3096             if (!vdev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
3097                 vdev->needs_reset = true;
3098             }
3099         }
3100     }
3101
3102     QLIST_FOREACH(group, &group_list, next) {
3103         QLIST_FOREACH(vdev, &group->device_list, next) {
3104             if (vdev->needs_reset) {
3105                 vfio_pci_hot_reset_multi(vdev);
3106             }
3107         }
3108     }
3109 }
3110
3111 static void vfio_kvm_device_add_group(VFIOGroup *group)
3112 {
3113 #ifdef CONFIG_KVM
3114     struct kvm_device_attr attr = {
3115         .group = KVM_DEV_VFIO_GROUP,
3116         .attr = KVM_DEV_VFIO_GROUP_ADD,
3117         .addr = (uint64_t)(unsigned long)&group->fd,
3118     };
3119
3120     if (!kvm_enabled()) {
3121         return;
3122     }
3123
3124     if (vfio_kvm_device_fd < 0) {
3125         struct kvm_create_device cd = {
3126             .type = KVM_DEV_TYPE_VFIO,
3127         };
3128
3129         if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
3130             DPRINTF("KVM_CREATE_DEVICE: %m\n");
3131             return;
3132         }
3133
3134         vfio_kvm_device_fd = cd.fd;
3135     }
3136
3137     if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
3138         error_report("Failed to add group %d to KVM VFIO device: %m",
3139                      group->groupid);
3140     }
3141 #endif
3142 }
3143
3144 static void vfio_kvm_device_del_group(VFIOGroup *group)
3145 {
3146 #ifdef CONFIG_KVM
3147     struct kvm_device_attr attr = {
3148         .group = KVM_DEV_VFIO_GROUP,
3149         .attr = KVM_DEV_VFIO_GROUP_DEL,
3150         .addr = (uint64_t)(unsigned long)&group->fd,
3151     };
3152
3153     if (vfio_kvm_device_fd < 0) {
3154         return;
3155     }
3156
3157     if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
3158         error_report("Failed to remove group %d to KVM VFIO device: %m",
3159                      group->groupid);
3160     }
3161 #endif
3162 }
3163
3164 static int vfio_connect_container(VFIOGroup *group)
3165 {
3166     VFIOContainer *container;
3167     int ret, fd;
3168
3169     if (group->container) {
3170         return 0;
3171     }
3172
3173     QLIST_FOREACH(container, &container_list, next) {
3174         if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
3175             group->container = container;
3176             QLIST_INSERT_HEAD(&container->group_list, group, container_next);
3177             return 0;
3178         }
3179     }
3180
3181     fd = qemu_open("/dev/vfio/vfio", O_RDWR);
3182     if (fd < 0) {
3183         error_report("vfio: failed to open /dev/vfio/vfio: %m");
3184         return -errno;
3185     }
3186
3187     ret = ioctl(fd, VFIO_GET_API_VERSION);
3188     if (ret != VFIO_API_VERSION) {
3189         error_report("vfio: supported vfio version: %d, "
3190                      "reported version: %d", VFIO_API_VERSION, ret);
3191         close(fd);
3192         return -EINVAL;
3193     }
3194
3195     container = g_malloc0(sizeof(*container));
3196     container->fd = fd;
3197
3198     if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
3199         ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
3200         if (ret) {
3201             error_report("vfio: failed to set group container: %m");
3202             g_free(container);
3203             close(fd);
3204             return -errno;
3205         }
3206
3207         ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
3208         if (ret) {
3209             error_report("vfio: failed to set iommu for container: %m");
3210             g_free(container);
3211             close(fd);
3212             return -errno;
3213         }
3214
3215         container->iommu_data.listener = vfio_memory_listener;
3216         container->iommu_data.release = vfio_listener_release;
3217
3218         memory_listener_register(&container->iommu_data.listener, &address_space_memory);
3219     } else {
3220         error_report("vfio: No available IOMMU models");
3221         g_free(container);
3222         close(fd);
3223         return -EINVAL;
3224     }
3225
3226     QLIST_INIT(&container->group_list);
3227     QLIST_INSERT_HEAD(&container_list, container, next);
3228
3229     group->container = container;
3230     QLIST_INSERT_HEAD(&container->group_list, group, container_next);
3231
3232     return 0;
3233 }
3234
3235 static void vfio_disconnect_container(VFIOGroup *group)
3236 {
3237     VFIOContainer *container = group->container;
3238
3239     if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
3240         error_report("vfio: error disconnecting group %d from container",
3241                      group->groupid);
3242     }
3243
3244     QLIST_REMOVE(group, container_next);
3245     group->container = NULL;
3246
3247     if (QLIST_EMPTY(&container->group_list)) {
3248         if (container->iommu_data.release) {
3249             container->iommu_data.release(container);
3250         }
3251         QLIST_REMOVE(container, next);
3252         DPRINTF("vfio_disconnect_container: close container->fd\n");
3253         close(container->fd);
3254         g_free(container);
3255     }
3256 }
3257
3258 static VFIOGroup *vfio_get_group(int groupid)
3259 {
3260     VFIOGroup *group;
3261     char path[32];
3262     struct vfio_group_status status = { .argsz = sizeof(status) };
3263
3264     QLIST_FOREACH(group, &group_list, next) {
3265         if (group->groupid == groupid) {
3266             return group;
3267         }
3268     }
3269
3270     group = g_malloc0(sizeof(*group));
3271
3272     snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
3273     group->fd = qemu_open(path, O_RDWR);
3274     if (group->fd < 0) {
3275         error_report("vfio: error opening %s: %m", path);
3276         g_free(group);
3277         return NULL;
3278     }
3279
3280     if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
3281         error_report("vfio: error getting group status: %m");
3282         close(group->fd);
3283         g_free(group);
3284         return NULL;
3285     }
3286
3287     if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
3288         error_report("vfio: error, group %d is not viable, please ensure "
3289                      "all devices within the iommu_group are bound to their "
3290                      "vfio bus driver.", groupid);
3291         close(group->fd);
3292         g_free(group);
3293         return NULL;
3294     }
3295
3296     group->groupid = groupid;
3297     QLIST_INIT(&group->device_list);
3298
3299     if (vfio_connect_container(group)) {
3300         error_report("vfio: failed to setup container for group %d", groupid);
3301         close(group->fd);
3302         g_free(group);
3303         return NULL;
3304     }
3305
3306     if (QLIST_EMPTY(&group_list)) {
3307         qemu_register_reset(vfio_pci_reset_handler, NULL);
3308     }
3309
3310     QLIST_INSERT_HEAD(&group_list, group, next);
3311
3312     vfio_kvm_device_add_group(group);
3313
3314     return group;
3315 }
3316
3317 static void vfio_put_group(VFIOGroup *group)
3318 {
3319     if (!QLIST_EMPTY(&group->device_list)) {
3320         return;
3321     }
3322
3323     vfio_kvm_device_del_group(group);
3324     vfio_disconnect_container(group);
3325     QLIST_REMOVE(group, next);
3326     DPRINTF("vfio_put_group: close group->fd\n");
3327     close(group->fd);
3328     g_free(group);
3329
3330     if (QLIST_EMPTY(&group_list)) {
3331         qemu_unregister_reset(vfio_pci_reset_handler, NULL);
3332     }
3333 }
3334
3335 static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
3336 {
3337     struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
3338     struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
3339     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
3340     int ret, i;
3341
3342     ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
3343     if (ret < 0) {
3344         error_report("vfio: error getting device %s from group %d: %m",
3345                      name, group->groupid);
3346         error_printf("Verify all devices in group %d are bound to vfio-pci "
3347                      "or pci-stub and not already in use\n", group->groupid);
3348         return ret;
3349     }
3350
3351     vdev->fd = ret;
3352     vdev->group = group;
3353     QLIST_INSERT_HEAD(&group->device_list, vdev, next);
3354
3355     /* Sanity check device */
3356     ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
3357     if (ret) {
3358         error_report("vfio: error getting device info: %m");
3359         goto error;
3360     }
3361
3362     DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
3363             dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
3364
3365     if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
3366         error_report("vfio: Um, this isn't a PCI device");
3367         goto error;
3368     }
3369
3370     vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
3371
3372     if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
3373         error_report("vfio: unexpected number of io regions %u",
3374                      dev_info.num_regions);
3375         goto error;
3376     }
3377
3378     if (dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
3379         error_report("vfio: unexpected number of irqs %u", dev_info.num_irqs);
3380         goto error;
3381     }
3382
3383     for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
3384         reg_info.index = i;
3385
3386         ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
3387         if (ret) {
3388             error_report("vfio: Error getting region %d info: %m", i);
3389             goto error;
3390         }
3391
3392         DPRINTF("Device %s region %d:\n", name, i);
3393         DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
3394                 (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
3395                 (unsigned long)reg_info.flags);
3396
3397         vdev->bars[i].flags = reg_info.flags;
3398         vdev->bars[i].size = reg_info.size;
3399         vdev->bars[i].fd_offset = reg_info.offset;
3400         vdev->bars[i].fd = vdev->fd;
3401         vdev->bars[i].nr = i;
3402         QLIST_INIT(&vdev->bars[i].quirks);
3403     }
3404
3405     reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
3406
3407     ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
3408     if (ret) {
3409         error_report("vfio: Error getting config info: %m");
3410         goto error;
3411     }
3412
3413     DPRINTF("Device %s config:\n", name);
3414     DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
3415             (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
3416             (unsigned long)reg_info.flags);
3417
3418     vdev->config_size = reg_info.size;
3419     if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
3420         vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
3421     }
3422     vdev->config_offset = reg_info.offset;
3423
3424     if ((vdev->features & VFIO_FEATURE_ENABLE_VGA) &&
3425         dev_info.num_regions > VFIO_PCI_VGA_REGION_INDEX) {
3426         struct vfio_region_info vga_info = {
3427             .argsz = sizeof(vga_info),
3428             .index = VFIO_PCI_VGA_REGION_INDEX,
3429          };
3430
3431         ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &vga_info);
3432         if (ret) {
3433             error_report(
3434                 "vfio: Device does not support requested feature x-vga");
3435             goto error;
3436         }
3437
3438         if (!(vga_info.flags & VFIO_REGION_INFO_FLAG_READ) ||
3439             !(vga_info.flags & VFIO_REGION_INFO_FLAG_WRITE) ||
3440             vga_info.size < 0xbffff + 1) {
3441             error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx",
3442                          (unsigned long)vga_info.flags,
3443                          (unsigned long)vga_info.size);
3444             goto error;
3445         }
3446
3447         vdev->vga.fd_offset = vga_info.offset;
3448         vdev->vga.fd = vdev->fd;
3449
3450         vdev->vga.region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
3451         vdev->vga.region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
3452         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_MEM].quirks);
3453
3454         vdev->vga.region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
3455         vdev->vga.region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
3456         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].quirks);
3457
3458         vdev->vga.region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
3459         vdev->vga.region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
3460         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks);
3461
3462         vdev->has_vga = true;
3463     }
3464     irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
3465
3466     ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
3467     if (ret) {
3468         /* This can fail for an old kernel or legacy PCI dev */
3469         DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure: %m\n");
3470         ret = 0;
3471     } else if (irq_info.count == 1) {
3472         vdev->pci_aer = true;
3473     } else {
3474         error_report("vfio: %04x:%02x:%02x.%x "
3475                      "Could not enable error recovery for the device",
3476                      vdev->host.domain, vdev->host.bus, vdev->host.slot,
3477                      vdev->host.function);
3478     }
3479
3480 error:
3481     if (ret) {
3482         QLIST_REMOVE(vdev, next);
3483         vdev->group = NULL;
3484         close(vdev->fd);
3485     }
3486     return ret;
3487 }
3488
3489 static void vfio_put_device(VFIODevice *vdev)
3490 {
3491     QLIST_REMOVE(vdev, next);
3492     vdev->group = NULL;
3493     DPRINTF("vfio_put_device: close vdev->fd\n");
3494     close(vdev->fd);
3495     if (vdev->msix) {
3496         g_free(vdev->msix);
3497         vdev->msix = NULL;
3498     }
3499 }
3500
3501 static void vfio_err_notifier_handler(void *opaque)
3502 {
3503     VFIODevice *vdev = opaque;
3504
3505     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
3506         return;
3507     }
3508
3509     /*
3510      * TBD. Retrieve the error details and decide what action
3511      * needs to be taken. One of the actions could be to pass
3512      * the error to the guest and have the guest driver recover
3513      * from the error. This requires that PCIe capabilities be
3514      * exposed to the guest. For now, we just terminate the
3515      * guest to contain the error.
3516      */
3517
3518     error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
3519                  "Please collect any data possible and then kill the guest",
3520                  __func__, vdev->host.domain, vdev->host.bus,
3521                  vdev->host.slot, vdev->host.function);
3522
3523     vm_stop(RUN_STATE_IO_ERROR);
3524 }
3525
3526 /*
3527  * Registers error notifier for devices supporting error recovery.
3528  * If we encounter a failure in this function, we report an error
3529  * and continue after disabling error recovery support for the
3530  * device.
3531  */
3532 static void vfio_register_err_notifier(VFIODevice *vdev)
3533 {
3534     int ret;
3535     int argsz;
3536     struct vfio_irq_set *irq_set;
3537     int32_t *pfd;
3538
3539     if (!vdev->pci_aer) {
3540         return;
3541     }
3542
3543     if (event_notifier_init(&vdev->err_notifier, 0)) {
3544         error_report("vfio: Unable to init event notifier for error detection");
3545         vdev->pci_aer = false;
3546         return;
3547     }
3548
3549     argsz = sizeof(*irq_set) + sizeof(*pfd);
3550
3551     irq_set = g_malloc0(argsz);
3552     irq_set->argsz = argsz;
3553     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3554                      VFIO_IRQ_SET_ACTION_TRIGGER;
3555     irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
3556     irq_set->start = 0;
3557     irq_set->count = 1;
3558     pfd = (int32_t *)&irq_set->data;
3559
3560     *pfd = event_notifier_get_fd(&vdev->err_notifier);
3561     qemu_set_fd_handler(*pfd, vfio_err_notifier_handler, NULL, vdev);
3562
3563     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
3564     if (ret) {
3565         error_report("vfio: Failed to set up error notification");
3566         qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
3567         event_notifier_cleanup(&vdev->err_notifier);
3568         vdev->pci_aer = false;
3569     }
3570     g_free(irq_set);
3571 }
3572
3573 static void vfio_unregister_err_notifier(VFIODevice *vdev)
3574 {
3575     int argsz;
3576     struct vfio_irq_set *irq_set;
3577     int32_t *pfd;
3578     int ret;
3579
3580     if (!vdev->pci_aer) {
3581         return;
3582     }
3583
3584     argsz = sizeof(*irq_set) + sizeof(*pfd);
3585
3586     irq_set = g_malloc0(argsz);
3587     irq_set->argsz = argsz;
3588     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3589                      VFIO_IRQ_SET_ACTION_TRIGGER;
3590     irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
3591     irq_set->start = 0;
3592     irq_set->count = 1;
3593     pfd = (int32_t *)&irq_set->data;
3594     *pfd = -1;
3595
3596     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
3597     if (ret) {
3598         error_report("vfio: Failed to de-assign error fd: %m");
3599     }
3600     g_free(irq_set);
3601     qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
3602                         NULL, NULL, vdev);
3603     event_notifier_cleanup(&vdev->err_notifier);
3604 }
3605
3606 static int vfio_initfn(PCIDevice *pdev)
3607 {
3608     VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
3609     VFIOGroup *group;
3610     char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
3611     ssize_t len;
3612     struct stat st;
3613     int groupid;
3614     int ret;
3615
3616     /* Check that the host device exists */
3617     snprintf(path, sizeof(path),
3618              "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
3619              vdev->host.domain, vdev->host.bus, vdev->host.slot,
3620              vdev->host.function);
3621     if (stat(path, &st) < 0) {
3622         error_report("vfio: error: no such host device: %s", path);
3623         return -errno;
3624     }
3625
3626     strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
3627
3628     len = readlink(path, iommu_group_path, PATH_MAX);
3629     if (len <= 0) {
3630         error_report("vfio: error no iommu_group for device");
3631         return -errno;
3632     }
3633
3634     iommu_group_path[len] = 0;
3635     group_name = basename(iommu_group_path);
3636
3637     if (sscanf(group_name, "%d", &groupid) != 1) {
3638         error_report("vfio: error reading %s: %m", path);
3639         return -errno;
3640     }
3641
3642     DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
3643             vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
3644
3645     group = vfio_get_group(groupid);
3646     if (!group) {
3647         error_report("vfio: failed to get group %d", groupid);
3648         return -ENOENT;
3649     }
3650
3651     snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
3652             vdev->host.domain, vdev->host.bus, vdev->host.slot,
3653             vdev->host.function);
3654
3655     QLIST_FOREACH(pvdev, &group->device_list, next) {
3656         if (pvdev->host.domain == vdev->host.domain &&
3657             pvdev->host.bus == vdev->host.bus &&
3658             pvdev->host.slot == vdev->host.slot &&
3659             pvdev->host.function == vdev->host.function) {
3660
3661             error_report("vfio: error: device %s is already attached", path);
3662             vfio_put_group(group);
3663             return -EBUSY;
3664         }
3665     }
3666
3667     ret = vfio_get_device(group, path, vdev);
3668     if (ret) {
3669         error_report("vfio: failed to get device %s", path);
3670         vfio_put_group(group);
3671         return ret;
3672     }
3673
3674     /* Get a copy of config space */
3675     ret = pread(vdev->fd, vdev->pdev.config,
3676                 MIN(pci_config_size(&vdev->pdev), vdev->config_size),
3677                 vdev->config_offset);
3678     if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
3679         ret = ret < 0 ? -errno : -EFAULT;
3680         error_report("vfio: Failed to read device config space");
3681         goto out_put;
3682     }
3683
3684     /* vfio emulates a lot for us, but some bits need extra love */
3685     vdev->emulated_config_bits = g_malloc0(vdev->config_size);
3686
3687     /* QEMU can choose to expose the ROM or not */
3688     memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
3689
3690     /* QEMU can change multi-function devices to single function, or reverse */
3691     vdev->emulated_config_bits[PCI_HEADER_TYPE] =
3692                                               PCI_HEADER_TYPE_MULTI_FUNCTION;
3693
3694     /* Restore or clear multifunction, this is always controlled by QEMU */
3695     if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
3696         vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
3697     } else {
3698         vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
3699     }
3700
3701     /*
3702      * Clear host resource mapping info.  If we choose not to register a
3703      * BAR, such as might be the case with the option ROM, we can get
3704      * confusing, unwritable, residual addresses from the host here.
3705      */
3706     memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
3707     memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
3708
3709     vfio_pci_size_rom(vdev);
3710
3711     ret = vfio_early_setup_msix(vdev);
3712     if (ret) {
3713         goto out_put;
3714     }
3715
3716     vfio_map_bars(vdev);
3717
3718     ret = vfio_add_capabilities(vdev);
3719     if (ret) {
3720         goto out_teardown;
3721     }
3722
3723     /* QEMU emulates all of MSI & MSIX */
3724     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
3725         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
3726                MSIX_CAP_LENGTH);
3727     }
3728
3729     if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
3730         memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
3731                vdev->msi_cap_size);
3732     }
3733
3734     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
3735         vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
3736                                                   vfio_intx_mmap_enable, vdev);
3737         pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
3738         ret = vfio_enable_intx(vdev);
3739         if (ret) {
3740             goto out_teardown;
3741         }
3742     }
3743
3744     add_boot_device_path(vdev->bootindex, &pdev->qdev, NULL);
3745     vfio_register_err_notifier(vdev);
3746
3747     return 0;
3748
3749 out_teardown:
3750     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3751     vfio_teardown_msi(vdev);
3752     vfio_unmap_bars(vdev);
3753 out_put:
3754     g_free(vdev->emulated_config_bits);
3755     vfio_put_device(vdev);
3756     vfio_put_group(group);
3757     return ret;
3758 }
3759
3760 static void vfio_exitfn(PCIDevice *pdev)
3761 {
3762     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
3763     VFIOGroup *group = vdev->group;
3764
3765     vfio_unregister_err_notifier(vdev);
3766     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3767     vfio_disable_interrupts(vdev);
3768     if (vdev->intx.mmap_timer) {
3769         timer_free(vdev->intx.mmap_timer);
3770     }
3771     vfio_teardown_msi(vdev);
3772     vfio_unmap_bars(vdev);
3773     g_free(vdev->emulated_config_bits);
3774     g_free(vdev->rom);
3775     vfio_put_device(vdev);
3776     vfio_put_group(group);
3777 }
3778
3779 static void vfio_pci_reset(DeviceState *dev)
3780 {
3781     PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
3782     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
3783
3784     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
3785             vdev->host.bus, vdev->host.slot, vdev->host.function);
3786
3787     vfio_pci_pre_reset(vdev);
3788
3789     if (vdev->reset_works && (vdev->has_flr || !vdev->has_pm_reset) &&
3790         !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
3791         DPRINTF("%04x:%02x:%02x.%x FLR/VFIO_DEVICE_RESET\n", vdev->host.domain,
3792             vdev->host.bus, vdev->host.slot, vdev->host.function);
3793         goto post_reset;
3794     }
3795
3796     /* See if we can do our own bus reset */
3797     if (!vfio_pci_hot_reset_one(vdev)) {
3798         goto post_reset;
3799     }
3800
3801     /* If nothing else works and the device supports PM reset, use it */
3802     if (vdev->reset_works && vdev->has_pm_reset &&
3803         !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
3804         DPRINTF("%04x:%02x:%02x.%x PCI PM Reset\n", vdev->host.domain,
3805             vdev->host.bus, vdev->host.slot, vdev->host.function);
3806         goto post_reset;
3807     }
3808
3809 post_reset:
3810     vfio_pci_post_reset(vdev);
3811 }
3812
3813 static Property vfio_pci_dev_properties[] = {
3814     DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
3815     DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIODevice,
3816                        intx.mmap_timeout, 1100),
3817     DEFINE_PROP_BIT("x-vga", VFIODevice, features,
3818                     VFIO_FEATURE_ENABLE_VGA_BIT, false),
3819     DEFINE_PROP_INT32("bootindex", VFIODevice, bootindex, -1),
3820     /*
3821      * TODO - support passed fds... is this necessary?
3822      * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
3823      * DEFINE_PROP_STRING("vfiogroupfd, VFIODevice, vfiogroupfd_name),
3824      */
3825     DEFINE_PROP_END_OF_LIST(),
3826 };
3827
3828 static const VMStateDescription vfio_pci_vmstate = {
3829     .name = "vfio-pci",
3830     .unmigratable = 1,
3831 };
3832
3833 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
3834 {
3835     DeviceClass *dc = DEVICE_CLASS(klass);
3836     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3837
3838     dc->reset = vfio_pci_reset;
3839     dc->props = vfio_pci_dev_properties;
3840     dc->vmsd = &vfio_pci_vmstate;
3841     dc->desc = "VFIO-based PCI device assignment";
3842     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3843     pdc->init = vfio_initfn;
3844     pdc->exit = vfio_exitfn;
3845     pdc->config_read = vfio_pci_read_config;
3846     pdc->config_write = vfio_pci_write_config;
3847     pdc->is_express = 1; /* We might be */
3848 }
3849
3850 static const TypeInfo vfio_pci_dev_info = {
3851     .name = "vfio-pci",
3852     .parent = TYPE_PCI_DEVICE,
3853     .instance_size = sizeof(VFIODevice),
3854     .class_init = vfio_pci_dev_class_init,
3855 };
3856
3857 static void register_vfio_pci_dev_type(void)
3858 {
3859     type_register_static(&vfio_pci_dev_info);
3860 }
3861
3862 type_init(register_vfio_pci_dev_type)