spapr: Drop CAS reboot flag
[qemu/kevin.git] / hw / vfio / pci-quirks.c
blob2d348f8237fa6a8e7aac10a4a920d4b10842a8ea
1 /*
2 * device quirks for PCI devices
4 * Copyright Red Hat, Inc. 2012-2015
6 * Authors:
7 * Alex Williamson <alex.williamson@redhat.com>
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
13 #include "qemu/osdep.h"
14 #include "exec/memop.h"
15 #include "qemu/units.h"
16 #include "qemu/error-report.h"
17 #include "qemu/main-loop.h"
18 #include "qemu/module.h"
19 #include "qemu/range.h"
20 #include "qapi/error.h"
21 #include "qapi/visitor.h"
22 #include <sys/ioctl.h>
23 #include "hw/hw.h"
24 #include "hw/nvram/fw_cfg.h"
25 #include "hw/qdev-properties.h"
26 #include "pci.h"
27 #include "trace.h"
30 * List of device ids/vendor ids for which to disable
31 * option rom loading. This avoids the guest hangs during rom
32 * execution as noticed with the BCM 57810 card for lack of a
33 * better way to handle such issues.
34 * The user can still override by specifying a romfile or
35 * rombar=1.
36 * Please see https://bugs.launchpad.net/qemu/+bug/1284874
37 * for an analysis of the 57810 card hang. When adding
38 * a new vendor id/device id combination below, please also add
39 * your card/environment details and information that could
40 * help in debugging to the bug tracking this issue
/* Vendor/device ID pairs consulted by vfio_blacklist_opt_rom(). */
static const struct {
    uint32_t vendor;
    uint32_t device;
} romblacklist[] = {
    { .vendor = 0x14e4, .device = 0x168e }, /* Broadcom BCM 57810 */
};
49 bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev)
51 int i;
53 for (i = 0 ; i < ARRAY_SIZE(romblacklist); i++) {
54 if (vfio_pci_is(vdev, romblacklist[i].vendor, romblacklist[i].device)) {
55 trace_vfio_quirk_rom_blacklisted(vdev->vbasedev.name,
56 romblacklist[i].vendor,
57 romblacklist[i].device);
58 return true;
61 return false;
65 * Device specific region quirks (mostly backdoors to PCI config space)
69 * The generic window quirks operate on an address and data register,
70 * vfio_generic_window_address_quirk handles the address register and
71 * vfio_generic_window_data_quirk handles the data register. These ops
72 * pass reads and writes through to hardware until a value matching the
73 * stored address match/mask is written. When this occurs, the data
74 * register accesses emulated PCI config space for the device rather than
75 * passing through accesses. This enables devices where PCI config space
76 * is accessible behind a window register to maintain the virtualization
77 * provided through vfio.
/*
 * One pattern tested against values written to a window address register:
 * the window is enabled when (value & ~mask) == match, and (value & mask)
 * is then used as the config space address.
 */
typedef struct VFIOConfigWindowMatch {
    uint32_t match; /* expected value of the bits outside @mask */
    uint32_t mask;  /* bits kept as the config space address */
} VFIOConfigWindowMatch;
84 typedef struct VFIOConfigWindowQuirk {
85 struct VFIOPCIDevice *vdev;
87 uint32_t address_val;
89 uint32_t address_offset;
90 uint32_t data_offset;
92 bool window_enabled;
93 uint8_t bar;
95 MemoryRegion *addr_mem;
96 MemoryRegion *data_mem;
98 uint32_t nr_matches;
99 VFIOConfigWindowMatch matches[];
100 } VFIOConfigWindowQuirk;
102 static uint64_t vfio_generic_window_quirk_address_read(void *opaque,
103 hwaddr addr,
104 unsigned size)
106 VFIOConfigWindowQuirk *window = opaque;
107 VFIOPCIDevice *vdev = window->vdev;
109 return vfio_region_read(&vdev->bars[window->bar].region,
110 addr + window->address_offset, size);
113 static void vfio_generic_window_quirk_address_write(void *opaque, hwaddr addr,
114 uint64_t data,
115 unsigned size)
117 VFIOConfigWindowQuirk *window = opaque;
118 VFIOPCIDevice *vdev = window->vdev;
119 int i;
121 window->window_enabled = false;
123 vfio_region_write(&vdev->bars[window->bar].region,
124 addr + window->address_offset, data, size);
126 for (i = 0; i < window->nr_matches; i++) {
127 if ((data & ~window->matches[i].mask) == window->matches[i].match) {
128 window->window_enabled = true;
129 window->address_val = data & window->matches[i].mask;
130 trace_vfio_quirk_generic_window_address_write(vdev->vbasedev.name,
131 memory_region_name(window->addr_mem), data);
132 break;
137 static const MemoryRegionOps vfio_generic_window_address_quirk = {
138 .read = vfio_generic_window_quirk_address_read,
139 .write = vfio_generic_window_quirk_address_write,
140 .endianness = DEVICE_LITTLE_ENDIAN,
143 static uint64_t vfio_generic_window_quirk_data_read(void *opaque,
144 hwaddr addr, unsigned size)
146 VFIOConfigWindowQuirk *window = opaque;
147 VFIOPCIDevice *vdev = window->vdev;
148 uint64_t data;
150 /* Always read data reg, discard if window enabled */
151 data = vfio_region_read(&vdev->bars[window->bar].region,
152 addr + window->data_offset, size);
154 if (window->window_enabled) {
155 data = vfio_pci_read_config(&vdev->pdev, window->address_val, size);
156 trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name,
157 memory_region_name(window->data_mem), data);
160 return data;
163 static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr,
164 uint64_t data, unsigned size)
166 VFIOConfigWindowQuirk *window = opaque;
167 VFIOPCIDevice *vdev = window->vdev;
169 if (window->window_enabled) {
170 vfio_pci_write_config(&vdev->pdev, window->address_val, data, size);
171 trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name,
172 memory_region_name(window->data_mem), data);
173 return;
176 vfio_region_write(&vdev->bars[window->bar].region,
177 addr + window->data_offset, data, size);
180 static const MemoryRegionOps vfio_generic_window_data_quirk = {
181 .read = vfio_generic_window_quirk_data_read,
182 .write = vfio_generic_window_quirk_data_write,
183 .endianness = DEVICE_LITTLE_ENDIAN,
187 * The generic mirror quirk handles devices which expose PCI config space
188 * through a region within a BAR. When enabled, reads and writes are
189 * redirected through to emulated PCI config space. XXX if PCI config space
190 * used memory regions, this could just be an alias.
192 typedef struct VFIOConfigMirrorQuirk {
193 struct VFIOPCIDevice *vdev;
194 uint32_t offset;
195 uint8_t bar;
196 MemoryRegion *mem;
197 uint8_t data[];
198 } VFIOConfigMirrorQuirk;
200 static uint64_t vfio_generic_quirk_mirror_read(void *opaque,
201 hwaddr addr, unsigned size)
203 VFIOConfigMirrorQuirk *mirror = opaque;
204 VFIOPCIDevice *vdev = mirror->vdev;
205 uint64_t data;
207 /* Read and discard in case the hardware cares */
208 (void)vfio_region_read(&vdev->bars[mirror->bar].region,
209 addr + mirror->offset, size);
211 data = vfio_pci_read_config(&vdev->pdev, addr, size);
212 trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name,
213 memory_region_name(mirror->mem),
214 addr, data);
215 return data;
218 static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr,
219 uint64_t data, unsigned size)
221 VFIOConfigMirrorQuirk *mirror = opaque;
222 VFIOPCIDevice *vdev = mirror->vdev;
224 vfio_pci_write_config(&vdev->pdev, addr, data, size);
225 trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name,
226 memory_region_name(mirror->mem),
227 addr, data);
230 static const MemoryRegionOps vfio_generic_mirror_quirk = {
231 .read = vfio_generic_quirk_mirror_read,
232 .write = vfio_generic_quirk_mirror_write,
233 .endianness = DEVICE_LITTLE_ENDIAN,
/*
 * Is range1 fully contained within range2?
 *
 * Both ranges are given as a first byte and a length, i.e. they cover
 * [first, first + len).  The comparison is written without computing
 * first + len, so it cannot wrap for values near UINT64_MAX.
 *
 * Returns true when every byte of range1 lies inside range2.
 */
static bool vfio_range_contained(uint64_t first1, uint64_t len1,
                                 uint64_t first2, uint64_t len2)
{
    /*
     * With first1 >= first2 and len1 <= len2 established, both
     * subtractions below are wrap-free and the test is equivalent to
     * first1 + len1 <= first2 + len2.
     */
    return first1 >= first2 && len1 <= len2 &&
           first1 - first2 <= len2 - len1;
}
/* PCI vendor ID used by the ATI quirks in this file. */
#define PCI_VENDOR_ID_ATI               0x1002
245 * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
246 * through VGA register 0x3c3. On newer cards, the I/O port BAR is always
247 * BAR4 (older cards like the X550 used BAR1, but we don't care to support
248 * those). Note that on bare metal, a read of 0x3c3 doesn't always return the
249 * I/O port BAR address. Originally this was coded to return the virtual BAR
250 * address only if the physical register read returns the actual BAR address,
251 * but users have reported greater success if we return the virtual address
252 * unconditionally.
254 static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
255 hwaddr addr, unsigned size)
257 VFIOPCIDevice *vdev = opaque;
258 uint64_t data = vfio_pci_read_config(&vdev->pdev,
259 PCI_BASE_ADDRESS_4 + 1, size);
261 trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data);
263 return data;
266 static const MemoryRegionOps vfio_ati_3c3_quirk = {
267 .read = vfio_ati_3c3_quirk_read,
268 .endianness = DEVICE_LITTLE_ENDIAN,
271 VFIOQuirk *vfio_quirk_alloc(int nr_mem)
273 VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
274 QLIST_INIT(&quirk->ioeventfds);
275 quirk->mem = g_new0(MemoryRegion, nr_mem);
276 quirk->nr_mem = nr_mem;
278 return quirk;
281 static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
283 QLIST_REMOVE(ioeventfd, next);
284 memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
285 true, ioeventfd->data, &ioeventfd->e);
287 if (ioeventfd->vfio) {
288 struct vfio_device_ioeventfd vfio_ioeventfd;
290 vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
291 vfio_ioeventfd.flags = ioeventfd->size;
292 vfio_ioeventfd.data = ioeventfd->data;
293 vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
294 ioeventfd->region_addr;
295 vfio_ioeventfd.fd = -1;
297 if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd)) {
298 error_report("Failed to remove vfio ioeventfd for %s+0x%"
299 HWADDR_PRIx"[%d]:0x%"PRIx64" (%m)",
300 memory_region_name(ioeventfd->mr), ioeventfd->addr,
301 ioeventfd->size, ioeventfd->data);
303 } else {
304 qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
305 NULL, NULL, NULL);
308 event_notifier_cleanup(&ioeventfd->e);
309 trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
310 (uint64_t)ioeventfd->addr, ioeventfd->size,
311 ioeventfd->data);
312 g_free(ioeventfd);
315 static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
317 VFIOIOEventFD *ioeventfd, *tmp;
319 QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
320 if (ioeventfd->dynamic) {
321 vfio_ioeventfd_exit(vdev, ioeventfd);
326 static void vfio_ioeventfd_handler(void *opaque)
328 VFIOIOEventFD *ioeventfd = opaque;
330 if (event_notifier_test_and_clear(&ioeventfd->e)) {
331 vfio_region_write(ioeventfd->region, ioeventfd->region_addr,
332 ioeventfd->data, ioeventfd->size);
333 trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr),
334 (uint64_t)ioeventfd->addr, ioeventfd->size,
335 ioeventfd->data);
339 static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
340 MemoryRegion *mr, hwaddr addr,
341 unsigned size, uint64_t data,
342 VFIORegion *region,
343 hwaddr region_addr, bool dynamic)
345 VFIOIOEventFD *ioeventfd;
347 if (vdev->no_kvm_ioeventfd) {
348 return NULL;
351 ioeventfd = g_malloc0(sizeof(*ioeventfd));
353 if (event_notifier_init(&ioeventfd->e, 0)) {
354 g_free(ioeventfd);
355 return NULL;
359 * MemoryRegion and relative offset, plus additional ioeventfd setup
360 * parameters for configuring and later tearing down KVM ioeventfd.
362 ioeventfd->mr = mr;
363 ioeventfd->addr = addr;
364 ioeventfd->size = size;
365 ioeventfd->data = data;
366 ioeventfd->dynamic = dynamic;
368 * VFIORegion and relative offset for implementing the userspace
369 * handler. data & size fields shared for both uses.
371 ioeventfd->region = region;
372 ioeventfd->region_addr = region_addr;
374 if (!vdev->no_vfio_ioeventfd) {
375 struct vfio_device_ioeventfd vfio_ioeventfd;
377 vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
378 vfio_ioeventfd.flags = ioeventfd->size;
379 vfio_ioeventfd.data = ioeventfd->data;
380 vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
381 ioeventfd->region_addr;
382 vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e);
384 ioeventfd->vfio = !ioctl(vdev->vbasedev.fd,
385 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
388 if (!ioeventfd->vfio) {
389 qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
390 vfio_ioeventfd_handler, NULL, ioeventfd);
393 memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
394 true, ioeventfd->data, &ioeventfd->e);
395 trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
396 size, data, ioeventfd->vfio);
398 return ioeventfd;
401 static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
403 VFIOQuirk *quirk;
406 * As long as the BAR is >= 256 bytes it will be aligned such that the
407 * lower byte is always zero. Filter out anything else, if it exists.
409 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
410 !vdev->bars[4].ioport || vdev->bars[4].region.size < 256) {
411 return;
414 quirk = vfio_quirk_alloc(1);
416 memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev,
417 "vfio-ati-3c3-quirk", 1);
418 memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
419 3 /* offset 3 bytes from 0x3c0 */, quirk->mem);
421 QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
422 quirk, next);
424 trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name);
428 * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI
429 * config space through MMIO BAR2 at offset 0x4000. Nothing seems to access
430 * the MMIO space directly, but a window to this space is provided through
431 * I/O port BAR4. Offset 0x0 is the address register and offset 0x4 is the
432 * data register. When the address is programmed to a range of 0x4000-0x4fff
433 * PCI configuration space is available. Experimentation seems to indicate
434 * that read-only may be provided by hardware.
436 static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr)
438 VFIOQuirk *quirk;
439 VFIOConfigWindowQuirk *window;
441 /* This windows doesn't seem to be used except by legacy VGA code */
442 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
443 !vdev->vga || nr != 4) {
444 return;
447 quirk = vfio_quirk_alloc(2);
448 window = quirk->data = g_malloc0(sizeof(*window) +
449 sizeof(VFIOConfigWindowMatch));
450 window->vdev = vdev;
451 window->address_offset = 0;
452 window->data_offset = 4;
453 window->nr_matches = 1;
454 window->matches[0].match = 0x4000;
455 window->matches[0].mask = vdev->config_size - 1;
456 window->bar = nr;
457 window->addr_mem = &quirk->mem[0];
458 window->data_mem = &quirk->mem[1];
460 memory_region_init_io(window->addr_mem, OBJECT(vdev),
461 &vfio_generic_window_address_quirk, window,
462 "vfio-ati-bar4-window-address-quirk", 4);
463 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
464 window->address_offset,
465 window->addr_mem, 1);
467 memory_region_init_io(window->data_mem, OBJECT(vdev),
468 &vfio_generic_window_data_quirk, window,
469 "vfio-ati-bar4-window-data-quirk", 4);
470 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
471 window->data_offset,
472 window->data_mem, 1);
474 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
476 trace_vfio_quirk_ati_bar4_probe(vdev->vbasedev.name);
480 * Trap the BAR2 MMIO mirror to config space as well.
482 static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr)
484 VFIOQuirk *quirk;
485 VFIOConfigMirrorQuirk *mirror;
487 /* Only enable on newer devices where BAR2 is 64bit */
488 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
489 !vdev->vga || nr != 2 || !vdev->bars[2].mem64) {
490 return;
493 quirk = vfio_quirk_alloc(1);
494 mirror = quirk->data = g_malloc0(sizeof(*mirror));
495 mirror->mem = quirk->mem;
496 mirror->vdev = vdev;
497 mirror->offset = 0x4000;
498 mirror->bar = nr;
500 memory_region_init_io(mirror->mem, OBJECT(vdev),
501 &vfio_generic_mirror_quirk, mirror,
502 "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE);
503 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
504 mirror->offset, mirror->mem, 1);
506 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
508 trace_vfio_quirk_ati_bar2_probe(vdev->vbasedev.name);
512 * Older ATI/AMD cards like the X550 have a similar window to that above.
513 * I/O port BAR1 provides a window to a mirror of PCI config space located
514 * in BAR2 at offset 0xf00. We don't care to support such older cards, but
515 * note it for future reference.
519 * Nvidia has several different methods to get to config space, the
520 * nouveau project has several of these documented here:
521 * https://github.com/pathscale/envytools/tree/master/hwdocs
523 * The first quirk is actually not documented in envytools and is found
524 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]). This is an
525 * NV46 chipset. The backdoor uses the legacy VGA I/O ports to access
526 * the mirror of PCI config space found at BAR0 offset 0x1800. The access
527 * sequence first writes 0x338 to I/O port 0x3d4. The target offset is
528 * then written to 0x3d0. Finally 0x538 is written for a read and 0x738
529 * is written for a write to 0x3d4. The BAR0 offset is then accessible
530 * through 0x3d0. This quirk doesn't seem to be necessary on newer cards
531 * that use the I/O port BAR5 window but it doesn't hurt to leave it.
/* States of the 0x3d4/0x3d0 backdoor access sequence. */
typedef enum {
    NONE = 0,
    SELECT,
    WINDOW,
    READ,
    WRITE
} VFIONvidia3d0State;

/* State names for trace output, indexed by VFIONvidia3d0State. */
static const char *nv3d0_states[] = {
    [NONE] = "NONE",
    [SELECT] = "SELECT",
    [WINDOW] = "WINDOW",
    [READ] = "READ",
    [WRITE] = "WRITE",
};
537 typedef struct VFIONvidia3d0Quirk {
538 VFIOPCIDevice *vdev;
539 VFIONvidia3d0State state;
540 uint32_t offset;
541 } VFIONvidia3d0Quirk;
543 static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque,
544 hwaddr addr, unsigned size)
546 VFIONvidia3d0Quirk *quirk = opaque;
547 VFIOPCIDevice *vdev = quirk->vdev;
549 quirk->state = NONE;
551 return vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
552 addr + 0x14, size);
555 static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr,
556 uint64_t data, unsigned size)
558 VFIONvidia3d0Quirk *quirk = opaque;
559 VFIOPCIDevice *vdev = quirk->vdev;
560 VFIONvidia3d0State old_state = quirk->state;
562 quirk->state = NONE;
564 switch (data) {
565 case 0x338:
566 if (old_state == NONE) {
567 quirk->state = SELECT;
568 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
569 nv3d0_states[quirk->state]);
571 break;
572 case 0x538:
573 if (old_state == WINDOW) {
574 quirk->state = READ;
575 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
576 nv3d0_states[quirk->state]);
578 break;
579 case 0x738:
580 if (old_state == WINDOW) {
581 quirk->state = WRITE;
582 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
583 nv3d0_states[quirk->state]);
585 break;
588 vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
589 addr + 0x14, data, size);
592 static const MemoryRegionOps vfio_nvidia_3d4_quirk = {
593 .read = vfio_nvidia_3d4_quirk_read,
594 .write = vfio_nvidia_3d4_quirk_write,
595 .endianness = DEVICE_LITTLE_ENDIAN,
598 static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
599 hwaddr addr, unsigned size)
601 VFIONvidia3d0Quirk *quirk = opaque;
602 VFIOPCIDevice *vdev = quirk->vdev;
603 VFIONvidia3d0State old_state = quirk->state;
604 uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
605 addr + 0x10, size);
607 quirk->state = NONE;
609 if (old_state == READ &&
610 (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
611 uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
613 data = vfio_pci_read_config(&vdev->pdev, offset, size);
614 trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name,
615 offset, size, data);
618 return data;
621 static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
622 uint64_t data, unsigned size)
624 VFIONvidia3d0Quirk *quirk = opaque;
625 VFIOPCIDevice *vdev = quirk->vdev;
626 VFIONvidia3d0State old_state = quirk->state;
628 quirk->state = NONE;
630 if (old_state == SELECT) {
631 quirk->offset = (uint32_t)data;
632 quirk->state = WINDOW;
633 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
634 nv3d0_states[quirk->state]);
635 } else if (old_state == WRITE) {
636 if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
637 uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
639 vfio_pci_write_config(&vdev->pdev, offset, data, size);
640 trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name,
641 offset, data, size);
642 return;
646 vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
647 addr + 0x10, data, size);
650 static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
651 .read = vfio_nvidia_3d0_quirk_read,
652 .write = vfio_nvidia_3d0_quirk_write,
653 .endianness = DEVICE_LITTLE_ENDIAN,
656 static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
658 VFIOQuirk *quirk;
659 VFIONvidia3d0Quirk *data;
661 if (vdev->no_geforce_quirks ||
662 !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
663 !vdev->bars[1].region.size) {
664 return;
667 quirk = vfio_quirk_alloc(2);
668 quirk->data = data = g_malloc0(sizeof(*data));
669 data->vdev = vdev;
671 memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk,
672 data, "vfio-nvidia-3d4-quirk", 2);
673 memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
674 0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]);
676 memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk,
677 data, "vfio-nvidia-3d0-quirk", 2);
678 memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
679 0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]);
681 QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
682 quirk, next);
684 trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name);
688 * The second quirk is documented in envytools. The I/O port BAR5 is just
689 * a set of address/data ports to the MMIO BARs. The BAR we care about is
690 * again BAR0. This backdoor is apparently a bit newer than the one above
691 * so we need to not only trap 256 bytes @0x1800, but all of PCI config
692 * space, including extended space is available at the 4k @0x88000.
694 typedef struct VFIONvidiaBAR5Quirk {
695 uint32_t master;
696 uint32_t enable;
697 MemoryRegion *addr_mem;
698 MemoryRegion *data_mem;
699 bool enabled;
700 VFIOConfigWindowQuirk window; /* last for match data */
701 } VFIONvidiaBAR5Quirk;
703 static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk *bar5)
705 VFIOPCIDevice *vdev = bar5->window.vdev;
707 if (((bar5->master & bar5->enable) & 0x1) == bar5->enabled) {
708 return;
711 bar5->enabled = !bar5->enabled;
712 trace_vfio_quirk_nvidia_bar5_state(vdev->vbasedev.name,
713 bar5->enabled ? "Enable" : "Disable");
714 memory_region_set_enabled(bar5->addr_mem, bar5->enabled);
715 memory_region_set_enabled(bar5->data_mem, bar5->enabled);
718 static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque,
719 hwaddr addr, unsigned size)
721 VFIONvidiaBAR5Quirk *bar5 = opaque;
722 VFIOPCIDevice *vdev = bar5->window.vdev;
724 return vfio_region_read(&vdev->bars[5].region, addr, size);
727 static void vfio_nvidia_bar5_quirk_master_write(void *opaque, hwaddr addr,
728 uint64_t data, unsigned size)
730 VFIONvidiaBAR5Quirk *bar5 = opaque;
731 VFIOPCIDevice *vdev = bar5->window.vdev;
733 vfio_region_write(&vdev->bars[5].region, addr, data, size);
735 bar5->master = data;
736 vfio_nvidia_bar5_enable(bar5);
739 static const MemoryRegionOps vfio_nvidia_bar5_quirk_master = {
740 .read = vfio_nvidia_bar5_quirk_master_read,
741 .write = vfio_nvidia_bar5_quirk_master_write,
742 .endianness = DEVICE_LITTLE_ENDIAN,
745 static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque,
746 hwaddr addr, unsigned size)
748 VFIONvidiaBAR5Quirk *bar5 = opaque;
749 VFIOPCIDevice *vdev = bar5->window.vdev;
751 return vfio_region_read(&vdev->bars[5].region, addr + 4, size);
754 static void vfio_nvidia_bar5_quirk_enable_write(void *opaque, hwaddr addr,
755 uint64_t data, unsigned size)
757 VFIONvidiaBAR5Quirk *bar5 = opaque;
758 VFIOPCIDevice *vdev = bar5->window.vdev;
760 vfio_region_write(&vdev->bars[5].region, addr + 4, data, size);
762 bar5->enable = data;
763 vfio_nvidia_bar5_enable(bar5);
766 static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable = {
767 .read = vfio_nvidia_bar5_quirk_enable_read,
768 .write = vfio_nvidia_bar5_quirk_enable_write,
769 .endianness = DEVICE_LITTLE_ENDIAN,
772 static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr)
774 VFIOQuirk *quirk;
775 VFIONvidiaBAR5Quirk *bar5;
776 VFIOConfigWindowQuirk *window;
778 if (vdev->no_geforce_quirks ||
779 !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
780 !vdev->vga || nr != 5 || !vdev->bars[5].ioport) {
781 return;
784 quirk = vfio_quirk_alloc(4);
785 bar5 = quirk->data = g_malloc0(sizeof(*bar5) +
786 (sizeof(VFIOConfigWindowMatch) * 2));
787 window = &bar5->window;
789 window->vdev = vdev;
790 window->address_offset = 0x8;
791 window->data_offset = 0xc;
792 window->nr_matches = 2;
793 window->matches[0].match = 0x1800;
794 window->matches[0].mask = PCI_CONFIG_SPACE_SIZE - 1;
795 window->matches[1].match = 0x88000;
796 window->matches[1].mask = vdev->config_size - 1;
797 window->bar = nr;
798 window->addr_mem = bar5->addr_mem = &quirk->mem[0];
799 window->data_mem = bar5->data_mem = &quirk->mem[1];
801 memory_region_init_io(window->addr_mem, OBJECT(vdev),
802 &vfio_generic_window_address_quirk, window,
803 "vfio-nvidia-bar5-window-address-quirk", 4);
804 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
805 window->address_offset,
806 window->addr_mem, 1);
807 memory_region_set_enabled(window->addr_mem, false);
809 memory_region_init_io(window->data_mem, OBJECT(vdev),
810 &vfio_generic_window_data_quirk, window,
811 "vfio-nvidia-bar5-window-data-quirk", 4);
812 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
813 window->data_offset,
814 window->data_mem, 1);
815 memory_region_set_enabled(window->data_mem, false);
817 memory_region_init_io(&quirk->mem[2], OBJECT(vdev),
818 &vfio_nvidia_bar5_quirk_master, bar5,
819 "vfio-nvidia-bar5-master-quirk", 4);
820 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
821 0, &quirk->mem[2], 1);
823 memory_region_init_io(&quirk->mem[3], OBJECT(vdev),
824 &vfio_nvidia_bar5_quirk_enable, bar5,
825 "vfio-nvidia-bar5-enable-quirk", 4);
826 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
827 4, &quirk->mem[3], 1);
829 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
831 trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name);
834 typedef struct LastDataSet {
835 VFIOQuirk *quirk;
836 hwaddr addr;
837 uint64_t data;
838 unsigned size;
839 int hits;
840 int added;
841 } LastDataSet;
/* Upper bound on dynamically added ioeventfds per mirror quirk. */
#define MAX_DYN_IOEVENTFD       10
/* Consecutive identical writes required before adding an ioeventfd. */
#define HITS_FOR_IOEVENTFD      10
847 * Finally, BAR0 itself. We want to redirect any accesses to either
848 * 0x1800 or 0x88000 through the PCI config space access functions.
850 static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
851 uint64_t data, unsigned size)
853 VFIOConfigMirrorQuirk *mirror = opaque;
854 VFIOPCIDevice *vdev = mirror->vdev;
855 PCIDevice *pdev = &vdev->pdev;
856 LastDataSet *last = (LastDataSet *)&mirror->data;
858 vfio_generic_quirk_mirror_write(opaque, addr, data, size);
861 * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
862 * MSI capability ID register. Both the ID and next register are
863 * read-only, so we allow writes covering either of those to real hw.
865 if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
866 vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
867 vfio_region_write(&vdev->bars[mirror->bar].region,
868 addr + mirror->offset, data, size);
869 trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name);
873 * Automatically add an ioeventfd to handle any repeated write with the
874 * same data and size above the standard PCI config space header. This is
875 * primarily expected to accelerate the MSI-ACK behavior, such as noted
876 * above. Current hardware/drivers should trigger an ioeventfd at config
877 * offset 0x704 (region offset 0x88704), with data 0x0, size 4.
879 * The criteria of 10 successive hits is arbitrary but reliably adds the
880 * MSI-ACK region. Note that as some writes are bypassed via the ioeventfd,
881 * the remaining ones have a greater chance of being seen successively.
882 * To avoid the pathological case of burning up all of QEMU's open file
883 * handles, arbitrarily limit this algorithm from adding no more than 10
884 * ioeventfds, print an error if we would have added an 11th, and then
885 * stop counting.
887 if (!vdev->no_kvm_ioeventfd &&
888 addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) {
889 if (addr != last->addr || data != last->data || size != last->size) {
890 last->addr = addr;
891 last->data = data;
892 last->size = size;
893 last->hits = 1;
894 } else if (++last->hits >= HITS_FOR_IOEVENTFD) {
895 if (last->added < MAX_DYN_IOEVENTFD) {
896 VFIOIOEventFD *ioeventfd;
897 ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size,
898 data, &vdev->bars[mirror->bar].region,
899 mirror->offset + addr, true);
900 if (ioeventfd) {
901 VFIOQuirk *quirk = last->quirk;
903 QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next);
904 last->added++;
906 } else {
907 last->added++;
908 warn_report("NVIDIA ioeventfd queue full for %s, unable to "
909 "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
910 "size %u", vdev->vbasedev.name, addr, data, size);
916 static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
917 .read = vfio_generic_quirk_mirror_read,
918 .write = vfio_nvidia_quirk_mirror_write,
919 .endianness = DEVICE_LITTLE_ENDIAN,
922 static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
924 VFIOConfigMirrorQuirk *mirror = quirk->data;
925 LastDataSet *last = (LastDataSet *)&mirror->data;
927 last->addr = last->data = last->size = last->hits = last->added = 0;
929 vfio_drop_dynamic_eventfds(vdev, quirk);
932 static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
934 VFIOQuirk *quirk;
935 VFIOConfigMirrorQuirk *mirror;
936 LastDataSet *last;
938 if (vdev->no_geforce_quirks ||
939 !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
940 !vfio_is_vga(vdev) || nr != 0) {
941 return;
944 quirk = vfio_quirk_alloc(1);
945 quirk->reset = vfio_nvidia_bar0_quirk_reset;
946 mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
947 mirror->mem = quirk->mem;
948 mirror->vdev = vdev;
949 mirror->offset = 0x88000;
950 mirror->bar = nr;
951 last = (LastDataSet *)&mirror->data;
952 last->quirk = quirk;
954 memory_region_init_io(mirror->mem, OBJECT(vdev),
955 &vfio_nvidia_mirror_quirk, mirror,
956 "vfio-nvidia-bar0-88000-mirror-quirk",
957 vdev->config_size);
958 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
959 mirror->offset, mirror->mem, 1);
961 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
963 /* The 0x1800 offset mirror only seems to get used by legacy VGA */
964 if (vdev->vga) {
965 quirk = vfio_quirk_alloc(1);
966 quirk->reset = vfio_nvidia_bar0_quirk_reset;
967 mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
968 mirror->mem = quirk->mem;
969 mirror->vdev = vdev;
970 mirror->offset = 0x1800;
971 mirror->bar = nr;
972 last = (LastDataSet *)&mirror->data;
973 last->quirk = quirk;
975 memory_region_init_io(mirror->mem, OBJECT(vdev),
976 &vfio_nvidia_mirror_quirk, mirror,
977 "vfio-nvidia-bar0-1800-mirror-quirk",
978 PCI_CONFIG_SPACE_SIZE);
979 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
980 mirror->offset, mirror->mem, 1);
982 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
985 trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name);
989 * TODO - Some Nvidia devices provide config access to their companion HDA
990 * device and even to their parent bridge via these config space mirrors.
991 * Add quirks for those regions.
/* PCI vendor ID used by the Realtek quirks in this file. */
#define PCI_VENDOR_ID_REALTEK           0x10ec
997 * RTL8168 devices have a backdoor that can access the MSI-X table. At BAR2
998 * offset 0x70 there is a dword data register, offset 0x74 is a dword address
999 * register. According to the Linux r8169 driver, the MSI-X table is addressed
1000 * when the "type" portion of the address register is set to 0x1. This appears
1001 * to be bits 16:30. Bit 31 is both a write indicator and some sort of
1002 * "address latched" indicator. Bits 12:15 are a mask field, which we can
1003 * ignore because the MSI-X table should always be accessed as a dword (full
1004 * mask). Bits 0:11 is offset within the type.
1006 * Example trace:
1008 * Read from MSI-X table offset 0
1009 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
1010 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
1011 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
1013 * Write 0xfee00000 to MSI-X table offset 0
1014 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
1015 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
1016 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
1018 typedef struct VFIOrtl8168Quirk {
1019 VFIOPCIDevice *vdev;
1020 uint32_t addr;
1021 uint32_t data;
1022 bool enabled;
1023 } VFIOrtl8168Quirk;
1025 static uint64_t vfio_rtl8168_quirk_address_read(void *opaque,
1026 hwaddr addr, unsigned size)
1028 VFIOrtl8168Quirk *rtl = opaque;
1029 VFIOPCIDevice *vdev = rtl->vdev;
1030 uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size);
1032 if (rtl->enabled) {
1033 data = rtl->addr ^ 0x80000000U; /* latch/complete */
1034 trace_vfio_quirk_rtl8168_fake_latch(vdev->vbasedev.name, data);
1037 return data;
1040 static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr,
1041 uint64_t data, unsigned size)
1043 VFIOrtl8168Quirk *rtl = opaque;
1044 VFIOPCIDevice *vdev = rtl->vdev;
1046 rtl->enabled = false;
1048 if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */
1049 rtl->enabled = true;
1050 rtl->addr = (uint32_t)data;
1052 if (data & 0x80000000U) { /* Do write */
1053 if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
1054 hwaddr offset = data & 0xfff;
1055 uint64_t val = rtl->data;
1057 trace_vfio_quirk_rtl8168_msix_write(vdev->vbasedev.name,
1058 (uint16_t)offset, val);
1060 /* Write to the proper guest MSI-X table instead */
1061 memory_region_dispatch_write(&vdev->pdev.msix_table_mmio,
1062 offset, val,
1063 size_memop(size) | MO_LE,
1064 MEMTXATTRS_UNSPECIFIED);
1066 return; /* Do not write guest MSI-X data to hardware */
1070 vfio_region_write(&vdev->bars[2].region, addr + 0x74, data, size);
1073 static const MemoryRegionOps vfio_rtl_address_quirk = {
1074 .read = vfio_rtl8168_quirk_address_read,
1075 .write = vfio_rtl8168_quirk_address_write,
1076 .valid = {
1077 .min_access_size = 4,
1078 .max_access_size = 4,
1079 .unaligned = false,
1081 .endianness = DEVICE_LITTLE_ENDIAN,
1084 static uint64_t vfio_rtl8168_quirk_data_read(void *opaque,
1085 hwaddr addr, unsigned size)
1087 VFIOrtl8168Quirk *rtl = opaque;
1088 VFIOPCIDevice *vdev = rtl->vdev;
1089 uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size);
1091 if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
1092 hwaddr offset = rtl->addr & 0xfff;
1093 memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset,
1094 &data, size_memop(size) | MO_LE,
1095 MEMTXATTRS_UNSPECIFIED);
1096 trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data);
1099 return data;
1102 static void vfio_rtl8168_quirk_data_write(void *opaque, hwaddr addr,
1103 uint64_t data, unsigned size)
1105 VFIOrtl8168Quirk *rtl = opaque;
1106 VFIOPCIDevice *vdev = rtl->vdev;
1108 rtl->data = (uint32_t)data;
1110 vfio_region_write(&vdev->bars[2].region, addr + 0x70, data, size);
1113 static const MemoryRegionOps vfio_rtl_data_quirk = {
1114 .read = vfio_rtl8168_quirk_data_read,
1115 .write = vfio_rtl8168_quirk_data_write,
1116 .valid = {
1117 .min_access_size = 4,
1118 .max_access_size = 4,
1119 .unaligned = false,
1121 .endianness = DEVICE_LITTLE_ENDIAN,
1124 static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr)
1126 VFIOQuirk *quirk;
1127 VFIOrtl8168Quirk *rtl;
1129 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_REALTEK, 0x8168) || nr != 2) {
1130 return;
1133 quirk = vfio_quirk_alloc(2);
1134 quirk->data = rtl = g_malloc0(sizeof(*rtl));
1135 rtl->vdev = vdev;
1137 memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
1138 &vfio_rtl_address_quirk, rtl,
1139 "vfio-rtl8168-window-address-quirk", 4);
1140 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1141 0x74, &quirk->mem[0], 1);
1143 memory_region_init_io(&quirk->mem[1], OBJECT(vdev),
1144 &vfio_rtl_data_quirk, rtl,
1145 "vfio-rtl8168-window-data-quirk", 4);
1146 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1147 0x70, &quirk->mem[1], 1);
1149 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1151 trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name);
1154 #define IGD_ASLS 0xfc /* ASL Storage Register */
1157 * The OpRegion includes the Video BIOS Table, which seems important for
1158 * telling the driver what sort of outputs it has. Without this, the device
1159 * may work in the guest, but we may not get output. This also requires BIOS
1160 * support to reserve and populate a section of guest memory sufficient for
1161 * the table and to write the base address of that memory to the ASLS register
1162 * of the IGD device.
1164 int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
1165 struct vfio_region_info *info, Error **errp)
1167 int ret;
1169 vdev->igd_opregion = g_malloc0(info->size);
1170 ret = pread(vdev->vbasedev.fd, vdev->igd_opregion,
1171 info->size, info->offset);
1172 if (ret != info->size) {
1173 error_setg(errp, "failed to read IGD OpRegion");
1174 g_free(vdev->igd_opregion);
1175 vdev->igd_opregion = NULL;
1176 return -EINVAL;
1180 * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to
1181 * allocate 32bit reserved memory for, copy these contents into, and write
1182 * the reserved memory base address to the device ASLS register at 0xFC.
1183 * Alignment of this reserved region seems flexible, but using a 4k page
1184 * alignment seems to work well. This interface assumes a single IGD
1185 * device, which may be at VM address 00:02.0 in legacy mode or another
1186 * address in UPT mode.
1188 * NB, there may be future use cases discovered where the VM should have
1189 * direct interaction with the host OpRegion, in which case the write to
1190 * the ASLS register would trigger MemoryRegion setup to enable that.
1192 fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion",
1193 vdev->igd_opregion, info->size);
1195 trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name);
1197 pci_set_long(vdev->pdev.config + IGD_ASLS, 0);
1198 pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0);
1199 pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0);
1201 return 0;
1205 * Common quirk probe entry points.
1207 void vfio_vga_quirk_setup(VFIOPCIDevice *vdev)
1209 vfio_vga_probe_ati_3c3_quirk(vdev);
1210 vfio_vga_probe_nvidia_3d0_quirk(vdev);
1213 void vfio_vga_quirk_exit(VFIOPCIDevice *vdev)
1215 VFIOQuirk *quirk;
1216 int i, j;
1218 for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1219 QLIST_FOREACH(quirk, &vdev->vga->region[i].quirks, next) {
1220 for (j = 0; j < quirk->nr_mem; j++) {
1221 memory_region_del_subregion(&vdev->vga->region[i].mem,
1222 &quirk->mem[j]);
1228 void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev)
1230 int i, j;
1232 for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1233 while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) {
1234 VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks);
1235 QLIST_REMOVE(quirk, next);
1236 for (j = 0; j < quirk->nr_mem; j++) {
1237 object_unparent(OBJECT(&quirk->mem[j]));
1239 g_free(quirk->mem);
1240 g_free(quirk->data);
1241 g_free(quirk);
1246 void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
1248 vfio_probe_ati_bar4_quirk(vdev, nr);
1249 vfio_probe_ati_bar2_quirk(vdev, nr);
1250 vfio_probe_nvidia_bar5_quirk(vdev, nr);
1251 vfio_probe_nvidia_bar0_quirk(vdev, nr);
1252 vfio_probe_rtl8168_bar2_quirk(vdev, nr);
1253 #ifdef CONFIG_VFIO_IGD
1254 vfio_probe_igd_bar4_quirk(vdev, nr);
1255 #endif
1258 void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
1260 VFIOBAR *bar = &vdev->bars[nr];
1261 VFIOQuirk *quirk;
1262 int i;
1264 QLIST_FOREACH(quirk, &bar->quirks, next) {
1265 while (!QLIST_EMPTY(&quirk->ioeventfds)) {
1266 vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds));
1269 for (i = 0; i < quirk->nr_mem; i++) {
1270 memory_region_del_subregion(bar->region.mem, &quirk->mem[i]);
1275 void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr)
1277 VFIOBAR *bar = &vdev->bars[nr];
1278 int i;
1280 while (!QLIST_EMPTY(&bar->quirks)) {
1281 VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
1282 QLIST_REMOVE(quirk, next);
1283 for (i = 0; i < quirk->nr_mem; i++) {
1284 object_unparent(OBJECT(&quirk->mem[i]));
1286 g_free(quirk->mem);
1287 g_free(quirk->data);
1288 g_free(quirk);
1293 * Reset quirks
1295 void vfio_quirk_reset(VFIOPCIDevice *vdev)
1297 int i;
1299 for (i = 0; i < PCI_ROM_SLOT; i++) {
1300 VFIOQuirk *quirk;
1301 VFIOBAR *bar = &vdev->bars[i];
1303 QLIST_FOREACH(quirk, &bar->quirks, next) {
1304 if (quirk->reset) {
1305 quirk->reset(vdev, quirk);
1312 * AMD Radeon PCI config reset, based on Linux:
1313 * drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running()
1314 * drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset
1315 * drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc()
1316 * drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock()
1317 * IDs: include/drm/drm_pciids.h
1318 * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0
1320 * Bonaire and Hawaii GPUs do not respond to a bus reset. This is a bug in the
1321 * hardware that should be fixed on future ASICs. The symptom of this is that
1322 * once the accelerated driver loads, Windows guests will bsod on subsequent
1323 * attempts to load the driver, such as after VM reset or shutdown/restart. To
1324 * work around this, we do an AMD specific PCI config reset, followed by an SMC
1325 * reset. The PCI config reset only works if SMC firmware is running, so we
1326 * have a dependency on the state of the device as to whether this reset will
1327 * be effective. There are still cases where we won't be able to kick the
1328 * device into working, but this greatly improves the usability overall. The
1329 * config reset magic is relatively common on AMD GPUs, but the setup and SMC
1330 * poking is largely ASIC specific.
1332 static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev)
1334 uint32_t clk, pc_c;
1337 * Registers 200h and 204h are index and data registers for accessing
1338 * indirect configuration registers within the device.
1340 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
1341 clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1342 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4);
1343 pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1345 return (!(clk & 1) && (0x20100 <= pc_c));
1349 * The scope of a config reset is controlled by a mode bit in the misc register
1350 * and a fuse, exposed as a bit in another register. The fuse is the default
1351 * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the forumula
1352 * scope = !(misc ^ fuse), where the resulting scope is defined the same as
1353 * the fuse. A truth table therefore tells us that if misc == fuse, we need
1354 * to flip the value of the bit in the misc register.
1356 static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev)
1358 uint32_t misc, fuse;
1359 bool a, b;
1361 vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4);
1362 fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1363 b = fuse & 64;
1365 vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4);
1366 misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1367 a = misc & 2;
1369 if (a == b) {
1370 vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4);
1371 vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */
1375 static int vfio_radeon_reset(VFIOPCIDevice *vdev)
1377 PCIDevice *pdev = &vdev->pdev;
1378 int i, ret = 0;
1379 uint32_t data;
1381 /* Defer to a kernel implemented reset */
1382 if (vdev->vbasedev.reset_works) {
1383 trace_vfio_quirk_ati_bonaire_reset_skipped(vdev->vbasedev.name);
1384 return -ENODEV;
1387 /* Enable only memory BAR access */
1388 vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2);
1390 /* Reset only works if SMC firmware is loaded and running */
1391 if (!vfio_radeon_smc_is_running(vdev)) {
1392 ret = -EINVAL;
1393 trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev->vbasedev.name);
1394 goto out;
1397 /* Make sure only the GFX function is reset */
1398 vfio_radeon_set_gfx_only_reset(vdev);
1400 /* AMD PCI config reset */
1401 vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4);
1402 usleep(100);
1404 /* Read back the memory size to make sure we're out of reset */
1405 for (i = 0; i < 100000; i++) {
1406 if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) {
1407 goto reset_smc;
1409 usleep(1);
1412 trace_vfio_quirk_ati_bonaire_reset_timeout(vdev->vbasedev.name);
1414 reset_smc:
1415 /* Reset SMC */
1416 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4);
1417 data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1418 data |= 1;
1419 vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
1421 /* Disable SMC clock */
1422 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
1423 data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1424 data |= 1;
1425 vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
1427 trace_vfio_quirk_ati_bonaire_reset_done(vdev->vbasedev.name);
1429 out:
1430 /* Restore PCI command register */
1431 vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2);
1433 return ret;
1436 void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev)
1438 switch (vdev->vendor_id) {
1439 case 0x1002:
1440 switch (vdev->device_id) {
1441 /* Bonaire */
1442 case 0x6649: /* Bonaire [FirePro W5100] */
1443 case 0x6650:
1444 case 0x6651:
1445 case 0x6658: /* Bonaire XTX [Radeon R7 260X] */
1446 case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */
1447 case 0x665d: /* Bonaire [Radeon R7 200 Series] */
1448 /* Hawaii */
1449 case 0x67A0: /* Hawaii XT GL [FirePro W9100] */
1450 case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */
1451 case 0x67A2:
1452 case 0x67A8:
1453 case 0x67A9:
1454 case 0x67AA:
1455 case 0x67B0: /* Hawaii XT [Radeon R9 290X] */
1456 case 0x67B1: /* Hawaii PRO [Radeon R9 290] */
1457 case 0x67B8:
1458 case 0x67B9:
1459 case 0x67BA:
1460 case 0x67BE:
1461 vdev->resetfn = vfio_radeon_reset;
1462 trace_vfio_quirk_ati_bonaire_reset(vdev->vbasedev.name);
1463 break;
1465 break;
1470 * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify
1471 * devices as a member of a clique. Devices within the same clique ID
1472 * are capable of direct P2P. It's the user's responsibility that this
1473 * is correct. The spec says that this may reside at any unused config
1474 * offset, but reserves and recommends hypervisors place this at C8h.
1475 * The spec also states that the hypervisor should place this capability
1476 * at the end of the capability list, thus next is defined as 0h.
1478 * +----------------+----------------+----------------+----------------+
1479 * | sig 7:0 ('P') | vndr len (8h) | next (0h) | cap id (9h) |
1480 * +----------------+----------------+----------------+----------------+
1481 * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)| sig 23:8 ('P2') |
1482 * +---------------------------------+---------------------------------+
1484 * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
1486 static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v,
1487 const char *name, void *opaque,
1488 Error **errp)
1490 DeviceState *dev = DEVICE(obj);
1491 Property *prop = opaque;
1492 uint8_t *ptr = qdev_get_prop_ptr(dev, prop);
1494 visit_type_uint8(v, name, ptr, errp);
1497 static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v,
1498 const char *name, void *opaque,
1499 Error **errp)
1501 DeviceState *dev = DEVICE(obj);
1502 Property *prop = opaque;
1503 uint8_t value, *ptr = qdev_get_prop_ptr(dev, prop);
1504 Error *local_err = NULL;
1506 if (dev->realized) {
1507 qdev_prop_set_after_realize(dev, name, errp);
1508 return;
1511 visit_type_uint8(v, name, &value, &local_err);
1512 if (local_err) {
1513 error_propagate(errp, local_err);
1514 return;
1517 if (value & ~0xF) {
1518 error_setg(errp, "Property %s: valid range 0-15", name);
1519 return;
1522 *ptr = value;
1525 const PropertyInfo qdev_prop_nv_gpudirect_clique = {
1526 .name = "uint4",
1527 .description = "NVIDIA GPUDirect Clique ID (0 - 15)",
1528 .get = get_nv_gpudirect_clique_id,
1529 .set = set_nv_gpudirect_clique_id,
1532 static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
1534 PCIDevice *pdev = &vdev->pdev;
1535 int ret, pos = 0xC8;
1537 if (vdev->nv_gpudirect_clique == 0xFF) {
1538 return 0;
1541 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) {
1542 error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor");
1543 return -EINVAL;
1546 if (pci_get_byte(pdev->config + PCI_CLASS_DEVICE + 1) !=
1547 PCI_BASE_CLASS_DISPLAY) {
1548 error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class");
1549 return -EINVAL;
1552 ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
1553 if (ret < 0) {
1554 error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");
1555 return ret;
1558 memset(vdev->emulated_config_bits + pos, 0xFF, 8);
1559 pos += PCI_CAP_FLAGS;
1560 pci_set_byte(pdev->config + pos++, 8);
1561 pci_set_byte(pdev->config + pos++, 'P');
1562 pci_set_byte(pdev->config + pos++, '2');
1563 pci_set_byte(pdev->config + pos++, 'P');
1564 pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3);
1565 pci_set_byte(pdev->config + pos, 0);
1567 return 0;
1570 int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp)
1572 int ret;
1574 ret = vfio_add_nv_gpudirect_cap(vdev, errp);
1575 if (ret) {
1576 return ret;
1579 return 0;
1582 static void vfio_pci_nvlink2_get_tgt(Object *obj, Visitor *v,
1583 const char *name,
1584 void *opaque, Error **errp)
1586 uint64_t tgt = (uintptr_t) opaque;
1587 visit_type_uint64(v, name, &tgt, errp);
1590 static void vfio_pci_nvlink2_get_link_speed(Object *obj, Visitor *v,
1591 const char *name,
1592 void *opaque, Error **errp)
1594 uint32_t link_speed = (uint32_t)(uintptr_t) opaque;
1595 visit_type_uint32(v, name, &link_speed, errp);
1598 int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp)
1600 int ret;
1601 void *p;
1602 struct vfio_region_info *nv2reg = NULL;
1603 struct vfio_info_cap_header *hdr;
1604 struct vfio_region_info_cap_nvlink2_ssatgt *cap;
1605 VFIOQuirk *quirk;
1607 ret = vfio_get_dev_region_info(&vdev->vbasedev,
1608 VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
1609 PCI_VENDOR_ID_NVIDIA,
1610 VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
1611 &nv2reg);
1612 if (ret) {
1613 return ret;
1616 hdr = vfio_get_region_info_cap(nv2reg, VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
1617 if (!hdr) {
1618 ret = -ENODEV;
1619 goto free_exit;
1621 cap = (void *) hdr;
1623 p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE | PROT_EXEC,
1624 MAP_SHARED, vdev->vbasedev.fd, nv2reg->offset);
1625 if (p == MAP_FAILED) {
1626 ret = -errno;
1627 goto free_exit;
1630 quirk = vfio_quirk_alloc(1);
1631 memory_region_init_ram_ptr(&quirk->mem[0], OBJECT(vdev), "nvlink2-mr",
1632 nv2reg->size, p);
1633 QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
1635 object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
1636 vfio_pci_nvlink2_get_tgt, NULL, NULL,
1637 (void *) (uintptr_t) cap->tgt, NULL);
1638 trace_vfio_pci_nvidia_gpu_setup_quirk(vdev->vbasedev.name, cap->tgt,
1639 nv2reg->size);
1640 free_exit:
1641 g_free(nv2reg);
1643 return ret;
1646 int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp)
1648 int ret;
1649 void *p;
1650 struct vfio_region_info *atsdreg = NULL;
1651 struct vfio_info_cap_header *hdr;
1652 struct vfio_region_info_cap_nvlink2_ssatgt *captgt;
1653 struct vfio_region_info_cap_nvlink2_lnkspd *capspeed;
1654 VFIOQuirk *quirk;
1656 ret = vfio_get_dev_region_info(&vdev->vbasedev,
1657 VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
1658 PCI_VENDOR_ID_IBM,
1659 VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
1660 &atsdreg);
1661 if (ret) {
1662 return ret;
1665 hdr = vfio_get_region_info_cap(atsdreg,
1666 VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
1667 if (!hdr) {
1668 ret = -ENODEV;
1669 goto free_exit;
1671 captgt = (void *) hdr;
1673 hdr = vfio_get_region_info_cap(atsdreg,
1674 VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD);
1675 if (!hdr) {
1676 ret = -ENODEV;
1677 goto free_exit;
1679 capspeed = (void *) hdr;
1681 /* Some NVLink bridges may not have assigned ATSD */
1682 if (atsdreg->size) {
1683 p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE | PROT_EXEC,
1684 MAP_SHARED, vdev->vbasedev.fd, atsdreg->offset);
1685 if (p == MAP_FAILED) {
1686 ret = -errno;
1687 goto free_exit;
1690 quirk = vfio_quirk_alloc(1);
1691 memory_region_init_ram_device_ptr(&quirk->mem[0], OBJECT(vdev),
1692 "nvlink2-atsd-mr", atsdreg->size, p);
1693 QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
1696 object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
1697 vfio_pci_nvlink2_get_tgt, NULL, NULL,
1698 (void *) (uintptr_t) captgt->tgt, NULL);
1699 trace_vfio_pci_nvlink2_setup_quirk_ssatgt(vdev->vbasedev.name, captgt->tgt,
1700 atsdreg->size);
1702 object_property_add(OBJECT(vdev), "nvlink2-link-speed", "uint32",
1703 vfio_pci_nvlink2_get_link_speed, NULL, NULL,
1704 (void *) (uintptr_t) capspeed->link_speed, NULL);
1705 trace_vfio_pci_nvlink2_setup_quirk_lnkspd(vdev->vbasedev.name,
1706 capspeed->link_speed);
1707 free_exit:
1708 g_free(atsdreg);
1710 return ret;