2 * device quirks for PCI devices
4 * Copyright Red Hat, Inc. 2012-2015
7 * Alex Williamson <alex.williamson@redhat.com>
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
13 #include "qemu/osdep.h"
14 #include "exec/memop.h"
15 #include "qemu/units.h"
16 #include "qemu/error-report.h"
17 #include "qemu/main-loop.h"
18 #include "qemu/module.h"
19 #include "qemu/range.h"
20 #include "qapi/error.h"
21 #include "qapi/visitor.h"
22 #include <sys/ioctl.h>
24 #include "hw/nvram/fw_cfg.h"
25 #include "hw/qdev-properties.h"
30 * List of device ids/vendor ids for which to disable
31 * option rom loading. This avoids the guest hangs during rom
32 * execution as noticed with the BCM 57810 card for lack of a
33 * better way to handle such issues.
34 * The user can still override by specifying a romfile or
36 * Please see https://bugs.launchpad.net/qemu/+bug/1284874
37 * for an analysis of the 57810 card hang. When adding
38 * a new vendor id/device id combination below, please also add
39 * your card/environment details and information that could
40 * help in debugging to the bug tracking this issue
46 { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */
49 bool vfio_blacklist_opt_rom(VFIOPCIDevice
*vdev
)
53 for (i
= 0 ; i
< ARRAY_SIZE(romblacklist
); i
++) {
54 if (vfio_pci_is(vdev
, romblacklist
[i
].vendor
, romblacklist
[i
].device
)) {
55 trace_vfio_quirk_rom_blacklisted(vdev
->vbasedev
.name
,
56 romblacklist
[i
].vendor
,
57 romblacklist
[i
].device
);
65 * Device specific region quirks (mostly backdoors to PCI config space)
69 * The generic window quirks operate on an address and data register,
70 * vfio_generic_window_address_quirk handles the address register and
71 * vfio_generic_window_data_quirk handles the data register. These ops
72 * pass reads and writes through to hardware until a value matching the
73 * stored address match/mask is written. When this occurs, the data
74 * register accesses emulated PCI config space for the device rather than
75 * passing through accesses. This enables devices where PCI config space
76 * is accessible behind a window register to maintain the virtualization
77 * provided through vfio.
79 typedef struct VFIOConfigWindowMatch
{
82 } VFIOConfigWindowMatch
;
84 typedef struct VFIOConfigWindowQuirk
{
85 struct VFIOPCIDevice
*vdev
;
89 uint32_t address_offset
;
95 MemoryRegion
*addr_mem
;
96 MemoryRegion
*data_mem
;
99 VFIOConfigWindowMatch matches
[];
100 } VFIOConfigWindowQuirk
;
102 static uint64_t vfio_generic_window_quirk_address_read(void *opaque
,
106 VFIOConfigWindowQuirk
*window
= opaque
;
107 VFIOPCIDevice
*vdev
= window
->vdev
;
109 return vfio_region_read(&vdev
->bars
[window
->bar
].region
,
110 addr
+ window
->address_offset
, size
);
113 static void vfio_generic_window_quirk_address_write(void *opaque
, hwaddr addr
,
117 VFIOConfigWindowQuirk
*window
= opaque
;
118 VFIOPCIDevice
*vdev
= window
->vdev
;
121 window
->window_enabled
= false;
123 vfio_region_write(&vdev
->bars
[window
->bar
].region
,
124 addr
+ window
->address_offset
, data
, size
);
126 for (i
= 0; i
< window
->nr_matches
; i
++) {
127 if ((data
& ~window
->matches
[i
].mask
) == window
->matches
[i
].match
) {
128 window
->window_enabled
= true;
129 window
->address_val
= data
& window
->matches
[i
].mask
;
130 trace_vfio_quirk_generic_window_address_write(vdev
->vbasedev
.name
,
131 memory_region_name(window
->addr_mem
), data
);
137 static const MemoryRegionOps vfio_generic_window_address_quirk
= {
138 .read
= vfio_generic_window_quirk_address_read
,
139 .write
= vfio_generic_window_quirk_address_write
,
140 .endianness
= DEVICE_LITTLE_ENDIAN
,
143 static uint64_t vfio_generic_window_quirk_data_read(void *opaque
,
144 hwaddr addr
, unsigned size
)
146 VFIOConfigWindowQuirk
*window
= opaque
;
147 VFIOPCIDevice
*vdev
= window
->vdev
;
150 /* Always read data reg, discard if window enabled */
151 data
= vfio_region_read(&vdev
->bars
[window
->bar
].region
,
152 addr
+ window
->data_offset
, size
);
154 if (window
->window_enabled
) {
155 data
= vfio_pci_read_config(&vdev
->pdev
, window
->address_val
, size
);
156 trace_vfio_quirk_generic_window_data_read(vdev
->vbasedev
.name
,
157 memory_region_name(window
->data_mem
), data
);
163 static void vfio_generic_window_quirk_data_write(void *opaque
, hwaddr addr
,
164 uint64_t data
, unsigned size
)
166 VFIOConfigWindowQuirk
*window
= opaque
;
167 VFIOPCIDevice
*vdev
= window
->vdev
;
169 if (window
->window_enabled
) {
170 vfio_pci_write_config(&vdev
->pdev
, window
->address_val
, data
, size
);
171 trace_vfio_quirk_generic_window_data_write(vdev
->vbasedev
.name
,
172 memory_region_name(window
->data_mem
), data
);
176 vfio_region_write(&vdev
->bars
[window
->bar
].region
,
177 addr
+ window
->data_offset
, data
, size
);
180 static const MemoryRegionOps vfio_generic_window_data_quirk
= {
181 .read
= vfio_generic_window_quirk_data_read
,
182 .write
= vfio_generic_window_quirk_data_write
,
183 .endianness
= DEVICE_LITTLE_ENDIAN
,
187 * The generic mirror quirk handles devices which expose PCI config space
188 * through a region within a BAR. When enabled, reads and writes are
189 * redirected through to emulated PCI config space. XXX if PCI config space
190 * used memory regions, this could just be an alias.
192 typedef struct VFIOConfigMirrorQuirk
{
193 struct VFIOPCIDevice
*vdev
;
198 } VFIOConfigMirrorQuirk
;
200 static uint64_t vfio_generic_quirk_mirror_read(void *opaque
,
201 hwaddr addr
, unsigned size
)
203 VFIOConfigMirrorQuirk
*mirror
= opaque
;
204 VFIOPCIDevice
*vdev
= mirror
->vdev
;
207 /* Read and discard in case the hardware cares */
208 (void)vfio_region_read(&vdev
->bars
[mirror
->bar
].region
,
209 addr
+ mirror
->offset
, size
);
211 data
= vfio_pci_read_config(&vdev
->pdev
, addr
, size
);
212 trace_vfio_quirk_generic_mirror_read(vdev
->vbasedev
.name
,
213 memory_region_name(mirror
->mem
),
218 static void vfio_generic_quirk_mirror_write(void *opaque
, hwaddr addr
,
219 uint64_t data
, unsigned size
)
221 VFIOConfigMirrorQuirk
*mirror
= opaque
;
222 VFIOPCIDevice
*vdev
= mirror
->vdev
;
224 vfio_pci_write_config(&vdev
->pdev
, addr
, data
, size
);
225 trace_vfio_quirk_generic_mirror_write(vdev
->vbasedev
.name
,
226 memory_region_name(mirror
->mem
),
230 static const MemoryRegionOps vfio_generic_mirror_quirk
= {
231 .read
= vfio_generic_quirk_mirror_read
,
232 .write
= vfio_generic_quirk_mirror_write
,
233 .endianness
= DEVICE_LITTLE_ENDIAN
,
/*
 * Is range1 fully contained within range2?
 *
 * Each range is described by its first address and its length, i.e. it
 * covers [first, first + len).  Returns true when range1 starts at or after
 * the start of range2 and ends at or before the end of range2.
 *
 * The test is done on offsets and lengths rather than on end addresses so
 * that "first + len" is never computed: that sum is evaluated in uint64_t
 * and would silently wrap for ranges ending past 2^64, making an
 * out-of-range access look contained.  For inputs that do not wrap the
 * result is identical to comparing the end addresses.
 */
static bool vfio_range_contained(uint64_t first1, uint64_t len1,
                                 uint64_t first2, uint64_t len2) {
    /* range1 must not start before range2 nor be longer than it */
    if (first1 < first2 || len1 > len2) {
        return false;
    }

    /* Both subtractions are now non-negative, so neither can wrap */
    return first1 - first2 <= len2 - len1;
}
242 #define PCI_VENDOR_ID_ATI 0x1002
245 * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
246 * through VGA register 0x3c3. On newer cards, the I/O port BAR is always
247 * BAR4 (older cards like the X550 used BAR1, but we don't care to support
248 * those). Note that on bare metal, a read of 0x3c3 doesn't always return the
249 * I/O port BAR address. Originally this was coded to return the virtual BAR
250 * address only if the physical register read returns the actual BAR address,
251 * but users have reported greater success if we return the virtual address
254 static uint64_t vfio_ati_3c3_quirk_read(void *opaque
,
255 hwaddr addr
, unsigned size
)
257 VFIOPCIDevice
*vdev
= opaque
;
258 uint64_t data
= vfio_pci_read_config(&vdev
->pdev
,
259 PCI_BASE_ADDRESS_4
+ 1, size
);
261 trace_vfio_quirk_ati_3c3_read(vdev
->vbasedev
.name
, data
);
266 static const MemoryRegionOps vfio_ati_3c3_quirk
= {
267 .read
= vfio_ati_3c3_quirk_read
,
268 .endianness
= DEVICE_LITTLE_ENDIAN
,
271 VFIOQuirk
*vfio_quirk_alloc(int nr_mem
)
273 VFIOQuirk
*quirk
= g_new0(VFIOQuirk
, 1);
274 QLIST_INIT(&quirk
->ioeventfds
);
275 quirk
->mem
= g_new0(MemoryRegion
, nr_mem
);
276 quirk
->nr_mem
= nr_mem
;
281 static void vfio_ioeventfd_exit(VFIOPCIDevice
*vdev
, VFIOIOEventFD
*ioeventfd
)
283 QLIST_REMOVE(ioeventfd
, next
);
284 memory_region_del_eventfd(ioeventfd
->mr
, ioeventfd
->addr
, ioeventfd
->size
,
285 true, ioeventfd
->data
, &ioeventfd
->e
);
287 if (ioeventfd
->vfio
) {
288 struct vfio_device_ioeventfd vfio_ioeventfd
;
290 vfio_ioeventfd
.argsz
= sizeof(vfio_ioeventfd
);
291 vfio_ioeventfd
.flags
= ioeventfd
->size
;
292 vfio_ioeventfd
.data
= ioeventfd
->data
;
293 vfio_ioeventfd
.offset
= ioeventfd
->region
->fd_offset
+
294 ioeventfd
->region_addr
;
295 vfio_ioeventfd
.fd
= -1;
297 if (ioctl(vdev
->vbasedev
.fd
, VFIO_DEVICE_IOEVENTFD
, &vfio_ioeventfd
)) {
298 error_report("Failed to remove vfio ioeventfd for %s+0x%"
299 HWADDR_PRIx
"[%d]:0x%"PRIx64
" (%m)",
300 memory_region_name(ioeventfd
->mr
), ioeventfd
->addr
,
301 ioeventfd
->size
, ioeventfd
->data
);
304 qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd
->e
),
308 event_notifier_cleanup(&ioeventfd
->e
);
309 trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd
->mr
),
310 (uint64_t)ioeventfd
->addr
, ioeventfd
->size
,
315 static void vfio_drop_dynamic_eventfds(VFIOPCIDevice
*vdev
, VFIOQuirk
*quirk
)
317 VFIOIOEventFD
*ioeventfd
, *tmp
;
319 QLIST_FOREACH_SAFE(ioeventfd
, &quirk
->ioeventfds
, next
, tmp
) {
320 if (ioeventfd
->dynamic
) {
321 vfio_ioeventfd_exit(vdev
, ioeventfd
);
326 static void vfio_ioeventfd_handler(void *opaque
)
328 VFIOIOEventFD
*ioeventfd
= opaque
;
330 if (event_notifier_test_and_clear(&ioeventfd
->e
)) {
331 vfio_region_write(ioeventfd
->region
, ioeventfd
->region_addr
,
332 ioeventfd
->data
, ioeventfd
->size
);
333 trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd
->mr
),
334 (uint64_t)ioeventfd
->addr
, ioeventfd
->size
,
339 static VFIOIOEventFD
*vfio_ioeventfd_init(VFIOPCIDevice
*vdev
,
340 MemoryRegion
*mr
, hwaddr addr
,
341 unsigned size
, uint64_t data
,
343 hwaddr region_addr
, bool dynamic
)
345 VFIOIOEventFD
*ioeventfd
;
347 if (vdev
->no_kvm_ioeventfd
) {
351 ioeventfd
= g_malloc0(sizeof(*ioeventfd
));
353 if (event_notifier_init(&ioeventfd
->e
, 0)) {
359 * MemoryRegion and relative offset, plus additional ioeventfd setup
360 * parameters for configuring and later tearing down KVM ioeventfd.
363 ioeventfd
->addr
= addr
;
364 ioeventfd
->size
= size
;
365 ioeventfd
->data
= data
;
366 ioeventfd
->dynamic
= dynamic
;
368 * VFIORegion and relative offset for implementing the userspace
369 * handler. data & size fields shared for both uses.
371 ioeventfd
->region
= region
;
372 ioeventfd
->region_addr
= region_addr
;
374 if (!vdev
->no_vfio_ioeventfd
) {
375 struct vfio_device_ioeventfd vfio_ioeventfd
;
377 vfio_ioeventfd
.argsz
= sizeof(vfio_ioeventfd
);
378 vfio_ioeventfd
.flags
= ioeventfd
->size
;
379 vfio_ioeventfd
.data
= ioeventfd
->data
;
380 vfio_ioeventfd
.offset
= ioeventfd
->region
->fd_offset
+
381 ioeventfd
->region_addr
;
382 vfio_ioeventfd
.fd
= event_notifier_get_fd(&ioeventfd
->e
);
384 ioeventfd
->vfio
= !ioctl(vdev
->vbasedev
.fd
,
385 VFIO_DEVICE_IOEVENTFD
, &vfio_ioeventfd
);
388 if (!ioeventfd
->vfio
) {
389 qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd
->e
),
390 vfio_ioeventfd_handler
, NULL
, ioeventfd
);
393 memory_region_add_eventfd(ioeventfd
->mr
, ioeventfd
->addr
, ioeventfd
->size
,
394 true, ioeventfd
->data
, &ioeventfd
->e
);
395 trace_vfio_ioeventfd_init(memory_region_name(mr
), (uint64_t)addr
,
396 size
, data
, ioeventfd
->vfio
);
401 static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice
*vdev
)
406 * As long as the BAR is >= 256 bytes it will be aligned such that the
407 * lower byte is always zero. Filter out anything else, if it exists.
409 if (!vfio_pci_is(vdev
, PCI_VENDOR_ID_ATI
, PCI_ANY_ID
) ||
410 !vdev
->bars
[4].ioport
|| vdev
->bars
[4].region
.size
< 256) {
414 quirk
= vfio_quirk_alloc(1);
416 memory_region_init_io(quirk
->mem
, OBJECT(vdev
), &vfio_ati_3c3_quirk
, vdev
,
417 "vfio-ati-3c3-quirk", 1);
418 memory_region_add_subregion(&vdev
->vga
->region
[QEMU_PCI_VGA_IO_HI
].mem
,
419 3 /* offset 3 bytes from 0x3c0 */, quirk
->mem
);
421 QLIST_INSERT_HEAD(&vdev
->vga
->region
[QEMU_PCI_VGA_IO_HI
].quirks
,
424 trace_vfio_quirk_ati_3c3_probe(vdev
->vbasedev
.name
);
428 * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI
429 * config space through MMIO BAR2 at offset 0x4000. Nothing seems to access
430 * the MMIO space directly, but a window to this space is provided through
431 * I/O port BAR4. Offset 0x0 is the address register and offset 0x4 is the
432 * data register. When the address is programmed to a range of 0x4000-0x4fff
433 * PCI configuration space is available. Experimentation seems to indicate
434 * that read-only may be provided by hardware.
436 static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice
*vdev
, int nr
)
439 VFIOConfigWindowQuirk
*window
;
441 /* This window doesn't seem to be used except by legacy VGA code */
442 if (!vfio_pci_is(vdev
, PCI_VENDOR_ID_ATI
, PCI_ANY_ID
) ||
443 !vdev
->vga
|| nr
!= 4) {
447 quirk
= vfio_quirk_alloc(2);
448 window
= quirk
->data
= g_malloc0(sizeof(*window
) +
449 sizeof(VFIOConfigWindowMatch
));
451 window
->address_offset
= 0;
452 window
->data_offset
= 4;
453 window
->nr_matches
= 1;
454 window
->matches
[0].match
= 0x4000;
455 window
->matches
[0].mask
= vdev
->config_size
- 1;
457 window
->addr_mem
= &quirk
->mem
[0];
458 window
->data_mem
= &quirk
->mem
[1];
460 memory_region_init_io(window
->addr_mem
, OBJECT(vdev
),
461 &vfio_generic_window_address_quirk
, window
,
462 "vfio-ati-bar4-window-address-quirk", 4);
463 memory_region_add_subregion_overlap(vdev
->bars
[nr
].region
.mem
,
464 window
->address_offset
,
465 window
->addr_mem
, 1);
467 memory_region_init_io(window
->data_mem
, OBJECT(vdev
),
468 &vfio_generic_window_data_quirk
, window
,
469 "vfio-ati-bar4-window-data-quirk", 4);
470 memory_region_add_subregion_overlap(vdev
->bars
[nr
].region
.mem
,
472 window
->data_mem
, 1);
474 QLIST_INSERT_HEAD(&vdev
->bars
[nr
].quirks
, quirk
, next
);
476 trace_vfio_quirk_ati_bar4_probe(vdev
->vbasedev
.name
);
480 * Trap the BAR2 MMIO mirror to config space as well.
482 static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice
*vdev
, int nr
)
485 VFIOConfigMirrorQuirk
*mirror
;
487 /* Only enable on newer devices where BAR2 is 64bit */
488 if (!vfio_pci_is(vdev
, PCI_VENDOR_ID_ATI
, PCI_ANY_ID
) ||
489 !vdev
->vga
|| nr
!= 2 || !vdev
->bars
[2].mem64
) {
493 quirk
= vfio_quirk_alloc(1);
494 mirror
= quirk
->data
= g_malloc0(sizeof(*mirror
));
495 mirror
->mem
= quirk
->mem
;
497 mirror
->offset
= 0x4000;
500 memory_region_init_io(mirror
->mem
, OBJECT(vdev
),
501 &vfio_generic_mirror_quirk
, mirror
,
502 "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE
);
503 memory_region_add_subregion_overlap(vdev
->bars
[nr
].region
.mem
,
504 mirror
->offset
, mirror
->mem
, 1);
506 QLIST_INSERT_HEAD(&vdev
->bars
[nr
].quirks
, quirk
, next
);
508 trace_vfio_quirk_ati_bar2_probe(vdev
->vbasedev
.name
);
512 * Older ATI/AMD cards like the X550 have a similar window to that above.
513 * I/O port BAR1 provides a window to a mirror of PCI config space located
514 * in BAR2 at offset 0xf00. We don't care to support such older cards, but
515 * note it for future reference.
519 * Nvidia has several different methods to get to config space, the
520 * nouveau project has several of these documented here:
521 * https://github.com/pathscale/envytools/tree/master/hwdocs
523 * The first quirk is actually not documented in envytools and is found
524 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]). This is an
525 * NV46 chipset. The backdoor uses the legacy VGA I/O ports to access
526 * the mirror of PCI config space found at BAR0 offset 0x1800. The access
527 * sequence first writes 0x338 to I/O port 0x3d4. The target offset is
528 * then written to 0x3d0. Finally 0x538 is written for a read and 0x738
529 * is written for a write to 0x3d4. The BAR0 offset is then accessible
530 * through 0x3d0. This quirk doesn't seem to be necessary on newer cards
531 * that use the I/O port BAR5 window but it doesn't hurt to leave it.
533 typedef enum {NONE
= 0, SELECT
, WINDOW
, READ
, WRITE
} VFIONvidia3d0State
;
534 static const char *nv3d0_states
[] = { "NONE", "SELECT",
535 "WINDOW", "READ", "WRITE" };
537 typedef struct VFIONvidia3d0Quirk
{
539 VFIONvidia3d0State state
;
541 } VFIONvidia3d0Quirk
;
543 static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque
,
544 hwaddr addr
, unsigned size
)
546 VFIONvidia3d0Quirk
*quirk
= opaque
;
547 VFIOPCIDevice
*vdev
= quirk
->vdev
;
551 return vfio_vga_read(&vdev
->vga
->region
[QEMU_PCI_VGA_IO_HI
],
555 static void vfio_nvidia_3d4_quirk_write(void *opaque
, hwaddr addr
,
556 uint64_t data
, unsigned size
)
558 VFIONvidia3d0Quirk
*quirk
= opaque
;
559 VFIOPCIDevice
*vdev
= quirk
->vdev
;
560 VFIONvidia3d0State old_state
= quirk
->state
;
566 if (old_state
== NONE
) {
567 quirk
->state
= SELECT
;
568 trace_vfio_quirk_nvidia_3d0_state(vdev
->vbasedev
.name
,
569 nv3d0_states
[quirk
->state
]);
573 if (old_state
== WINDOW
) {
575 trace_vfio_quirk_nvidia_3d0_state(vdev
->vbasedev
.name
,
576 nv3d0_states
[quirk
->state
]);
580 if (old_state
== WINDOW
) {
581 quirk
->state
= WRITE
;
582 trace_vfio_quirk_nvidia_3d0_state(vdev
->vbasedev
.name
,
583 nv3d0_states
[quirk
->state
]);
588 vfio_vga_write(&vdev
->vga
->region
[QEMU_PCI_VGA_IO_HI
],
589 addr
+ 0x14, data
, size
);
592 static const MemoryRegionOps vfio_nvidia_3d4_quirk
= {
593 .read
= vfio_nvidia_3d4_quirk_read
,
594 .write
= vfio_nvidia_3d4_quirk_write
,
595 .endianness
= DEVICE_LITTLE_ENDIAN
,
598 static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque
,
599 hwaddr addr
, unsigned size
)
601 VFIONvidia3d0Quirk
*quirk
= opaque
;
602 VFIOPCIDevice
*vdev
= quirk
->vdev
;
603 VFIONvidia3d0State old_state
= quirk
->state
;
604 uint64_t data
= vfio_vga_read(&vdev
->vga
->region
[QEMU_PCI_VGA_IO_HI
],
609 if (old_state
== READ
&&
610 (quirk
->offset
& ~(PCI_CONFIG_SPACE_SIZE
- 1)) == 0x1800) {
611 uint8_t offset
= quirk
->offset
& (PCI_CONFIG_SPACE_SIZE
- 1);
613 data
= vfio_pci_read_config(&vdev
->pdev
, offset
, size
);
614 trace_vfio_quirk_nvidia_3d0_read(vdev
->vbasedev
.name
,
621 static void vfio_nvidia_3d0_quirk_write(void *opaque
, hwaddr addr
,
622 uint64_t data
, unsigned size
)
624 VFIONvidia3d0Quirk
*quirk
= opaque
;
625 VFIOPCIDevice
*vdev
= quirk
->vdev
;
626 VFIONvidia3d0State old_state
= quirk
->state
;
630 if (old_state
== SELECT
) {
631 quirk
->offset
= (uint32_t)data
;
632 quirk
->state
= WINDOW
;
633 trace_vfio_quirk_nvidia_3d0_state(vdev
->vbasedev
.name
,
634 nv3d0_states
[quirk
->state
]);
635 } else if (old_state
== WRITE
) {
636 if ((quirk
->offset
& ~(PCI_CONFIG_SPACE_SIZE
- 1)) == 0x1800) {
637 uint8_t offset
= quirk
->offset
& (PCI_CONFIG_SPACE_SIZE
- 1);
639 vfio_pci_write_config(&vdev
->pdev
, offset
, data
, size
);
640 trace_vfio_quirk_nvidia_3d0_write(vdev
->vbasedev
.name
,
646 vfio_vga_write(&vdev
->vga
->region
[QEMU_PCI_VGA_IO_HI
],
647 addr
+ 0x10, data
, size
);
650 static const MemoryRegionOps vfio_nvidia_3d0_quirk
= {
651 .read
= vfio_nvidia_3d0_quirk_read
,
652 .write
= vfio_nvidia_3d0_quirk_write
,
653 .endianness
= DEVICE_LITTLE_ENDIAN
,
656 static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice
*vdev
)
659 VFIONvidia3d0Quirk
*data
;
661 if (vdev
->no_geforce_quirks
||
662 !vfio_pci_is(vdev
, PCI_VENDOR_ID_NVIDIA
, PCI_ANY_ID
) ||
663 !vdev
->bars
[1].region
.size
) {
667 quirk
= vfio_quirk_alloc(2);
668 quirk
->data
= data
= g_malloc0(sizeof(*data
));
671 memory_region_init_io(&quirk
->mem
[0], OBJECT(vdev
), &vfio_nvidia_3d4_quirk
,
672 data
, "vfio-nvidia-3d4-quirk", 2);
673 memory_region_add_subregion(&vdev
->vga
->region
[QEMU_PCI_VGA_IO_HI
].mem
,
674 0x14 /* 0x3c0 + 0x14 */, &quirk
->mem
[0]);
676 memory_region_init_io(&quirk
->mem
[1], OBJECT(vdev
), &vfio_nvidia_3d0_quirk
,
677 data
, "vfio-nvidia-3d0-quirk", 2);
678 memory_region_add_subregion(&vdev
->vga
->region
[QEMU_PCI_VGA_IO_HI
].mem
,
679 0x10 /* 0x3c0 + 0x10 */, &quirk
->mem
[1]);
681 QLIST_INSERT_HEAD(&vdev
->vga
->region
[QEMU_PCI_VGA_IO_HI
].quirks
,
684 trace_vfio_quirk_nvidia_3d0_probe(vdev
->vbasedev
.name
);
688 * The second quirk is documented in envytools. The I/O port BAR5 is just
689 * a set of address/data ports to the MMIO BARs. The BAR we care about is
690 * again BAR0. This backdoor is apparently a bit newer than the one above,
691 * so we need to trap not only the 256 bytes @0x1800 but also the 4k window
692 * @0x88000, where all of PCI config space (including extended) is available.
694 typedef struct VFIONvidiaBAR5Quirk
{
697 MemoryRegion
*addr_mem
;
698 MemoryRegion
*data_mem
;
700 VFIOConfigWindowQuirk window
; /* last for match data */
701 } VFIONvidiaBAR5Quirk
;
703 static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk
*bar5
)
705 VFIOPCIDevice
*vdev
= bar5
->window
.vdev
;
707 if (((bar5
->master
& bar5
->enable
) & 0x1) == bar5
->enabled
) {
711 bar5
->enabled
= !bar5
->enabled
;
712 trace_vfio_quirk_nvidia_bar5_state(vdev
->vbasedev
.name
,
713 bar5
->enabled
? "Enable" : "Disable");
714 memory_region_set_enabled(bar5
->addr_mem
, bar5
->enabled
);
715 memory_region_set_enabled(bar5
->data_mem
, bar5
->enabled
);
718 static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque
,
719 hwaddr addr
, unsigned size
)
721 VFIONvidiaBAR5Quirk
*bar5
= opaque
;
722 VFIOPCIDevice
*vdev
= bar5
->window
.vdev
;
724 return vfio_region_read(&vdev
->bars
[5].region
, addr
, size
);
727 static void vfio_nvidia_bar5_quirk_master_write(void *opaque
, hwaddr addr
,
728 uint64_t data
, unsigned size
)
730 VFIONvidiaBAR5Quirk
*bar5
= opaque
;
731 VFIOPCIDevice
*vdev
= bar5
->window
.vdev
;
733 vfio_region_write(&vdev
->bars
[5].region
, addr
, data
, size
);
736 vfio_nvidia_bar5_enable(bar5
);
739 static const MemoryRegionOps vfio_nvidia_bar5_quirk_master
= {
740 .read
= vfio_nvidia_bar5_quirk_master_read
,
741 .write
= vfio_nvidia_bar5_quirk_master_write
,
742 .endianness
= DEVICE_LITTLE_ENDIAN
,
745 static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque
,
746 hwaddr addr
, unsigned size
)
748 VFIONvidiaBAR5Quirk
*bar5
= opaque
;
749 VFIOPCIDevice
*vdev
= bar5
->window
.vdev
;
751 return vfio_region_read(&vdev
->bars
[5].region
, addr
+ 4, size
);
754 static void vfio_nvidia_bar5_quirk_enable_write(void *opaque
, hwaddr addr
,
755 uint64_t data
, unsigned size
)
757 VFIONvidiaBAR5Quirk
*bar5
= opaque
;
758 VFIOPCIDevice
*vdev
= bar5
->window
.vdev
;
760 vfio_region_write(&vdev
->bars
[5].region
, addr
+ 4, data
, size
);
763 vfio_nvidia_bar5_enable(bar5
);
766 static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable
= {
767 .read
= vfio_nvidia_bar5_quirk_enable_read
,
768 .write
= vfio_nvidia_bar5_quirk_enable_write
,
769 .endianness
= DEVICE_LITTLE_ENDIAN
,
772 static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice
*vdev
, int nr
)
775 VFIONvidiaBAR5Quirk
*bar5
;
776 VFIOConfigWindowQuirk
*window
;
778 if (vdev
->no_geforce_quirks
||
779 !vfio_pci_is(vdev
, PCI_VENDOR_ID_NVIDIA
, PCI_ANY_ID
) ||
780 !vdev
->vga
|| nr
!= 5 || !vdev
->bars
[5].ioport
) {
784 quirk
= vfio_quirk_alloc(4);
785 bar5
= quirk
->data
= g_malloc0(sizeof(*bar5
) +
786 (sizeof(VFIOConfigWindowMatch
) * 2));
787 window
= &bar5
->window
;
790 window
->address_offset
= 0x8;
791 window
->data_offset
= 0xc;
792 window
->nr_matches
= 2;
793 window
->matches
[0].match
= 0x1800;
794 window
->matches
[0].mask
= PCI_CONFIG_SPACE_SIZE
- 1;
795 window
->matches
[1].match
= 0x88000;
796 window
->matches
[1].mask
= vdev
->config_size
- 1;
798 window
->addr_mem
= bar5
->addr_mem
= &quirk
->mem
[0];
799 window
->data_mem
= bar5
->data_mem
= &quirk
->mem
[1];
801 memory_region_init_io(window
->addr_mem
, OBJECT(vdev
),
802 &vfio_generic_window_address_quirk
, window
,
803 "vfio-nvidia-bar5-window-address-quirk", 4);
804 memory_region_add_subregion_overlap(vdev
->bars
[nr
].region
.mem
,
805 window
->address_offset
,
806 window
->addr_mem
, 1);
807 memory_region_set_enabled(window
->addr_mem
, false);
809 memory_region_init_io(window
->data_mem
, OBJECT(vdev
),
810 &vfio_generic_window_data_quirk
, window
,
811 "vfio-nvidia-bar5-window-data-quirk", 4);
812 memory_region_add_subregion_overlap(vdev
->bars
[nr
].region
.mem
,
814 window
->data_mem
, 1);
815 memory_region_set_enabled(window
->data_mem
, false);
817 memory_region_init_io(&quirk
->mem
[2], OBJECT(vdev
),
818 &vfio_nvidia_bar5_quirk_master
, bar5
,
819 "vfio-nvidia-bar5-master-quirk", 4);
820 memory_region_add_subregion_overlap(vdev
->bars
[nr
].region
.mem
,
821 0, &quirk
->mem
[2], 1);
823 memory_region_init_io(&quirk
->mem
[3], OBJECT(vdev
),
824 &vfio_nvidia_bar5_quirk_enable
, bar5
,
825 "vfio-nvidia-bar5-enable-quirk", 4);
826 memory_region_add_subregion_overlap(vdev
->bars
[nr
].region
.mem
,
827 4, &quirk
->mem
[3], 1);
829 QLIST_INSERT_HEAD(&vdev
->bars
[nr
].quirks
, quirk
, next
);
831 trace_vfio_quirk_nvidia_bar5_probe(vdev
->vbasedev
.name
);
834 typedef struct LastDataSet
{
843 #define MAX_DYN_IOEVENTFD 10
844 #define HITS_FOR_IOEVENTFD 10
847 * Finally, BAR0 itself. We want to redirect any accesses to either
848 * 0x1800 or 0x88000 through the PCI config space access functions.
850 static void vfio_nvidia_quirk_mirror_write(void *opaque
, hwaddr addr
,
851 uint64_t data
, unsigned size
)
853 VFIOConfigMirrorQuirk
*mirror
= opaque
;
854 VFIOPCIDevice
*vdev
= mirror
->vdev
;
855 PCIDevice
*pdev
= &vdev
->pdev
;
856 LastDataSet
*last
= (LastDataSet
*)&mirror
->data
;
858 vfio_generic_quirk_mirror_write(opaque
, addr
, data
, size
);
861 * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
862 * MSI capability ID register. Both the ID and next register are
863 * read-only, so we allow writes covering either of those to real hw.
865 if ((pdev
->cap_present
& QEMU_PCI_CAP_MSI
) &&
866 vfio_range_contained(addr
, size
, pdev
->msi_cap
, PCI_MSI_FLAGS
)) {
867 vfio_region_write(&vdev
->bars
[mirror
->bar
].region
,
868 addr
+ mirror
->offset
, data
, size
);
869 trace_vfio_quirk_nvidia_bar0_msi_ack(vdev
->vbasedev
.name
);
873 * Automatically add an ioeventfd to handle any repeated write with the
874 * same data and size above the standard PCI config space header. This is
875 * primarily expected to accelerate the MSI-ACK behavior, such as noted
876 * above. Current hardware/drivers should trigger an ioeventfd at config
877 * offset 0x704 (region offset 0x88704), with data 0x0, size 4.
879 * The criteria of 10 successive hits is arbitrary but reliably adds the
880 * MSI-ACK region. Note that as some writes are bypassed via the ioeventfd,
881 * the remaining ones have a greater chance of being seen successively.
882 * To avoid the pathological case of burning up all of QEMU's open file
883 * handles, arbitrarily limit this algorithm to adding no more than 10
884 * ioeventfds, print an error if we would have added an 11th, and then
887 if (!vdev
->no_kvm_ioeventfd
&&
888 addr
>= PCI_STD_HEADER_SIZEOF
&& last
->added
<= MAX_DYN_IOEVENTFD
) {
889 if (addr
!= last
->addr
|| data
!= last
->data
|| size
!= last
->size
) {
894 } else if (++last
->hits
>= HITS_FOR_IOEVENTFD
) {
895 if (last
->added
< MAX_DYN_IOEVENTFD
) {
896 VFIOIOEventFD
*ioeventfd
;
897 ioeventfd
= vfio_ioeventfd_init(vdev
, mirror
->mem
, addr
, size
,
898 data
, &vdev
->bars
[mirror
->bar
].region
,
899 mirror
->offset
+ addr
, true);
901 VFIOQuirk
*quirk
= last
->quirk
;
903 QLIST_INSERT_HEAD(&quirk
->ioeventfds
, ioeventfd
, next
);
908 warn_report("NVIDIA ioeventfd queue full for %s, unable to "
909 "accelerate 0x%"HWADDR_PRIx
", data 0x%"PRIx64
", "
910 "size %u", vdev
->vbasedev
.name
, addr
, data
, size
);
916 static const MemoryRegionOps vfio_nvidia_mirror_quirk
= {
917 .read
= vfio_generic_quirk_mirror_read
,
918 .write
= vfio_nvidia_quirk_mirror_write
,
919 .endianness
= DEVICE_LITTLE_ENDIAN
,
922 static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice
*vdev
, VFIOQuirk
*quirk
)
924 VFIOConfigMirrorQuirk
*mirror
= quirk
->data
;
925 LastDataSet
*last
= (LastDataSet
*)&mirror
->data
;
927 last
->addr
= last
->data
= last
->size
= last
->hits
= last
->added
= 0;
929 vfio_drop_dynamic_eventfds(vdev
, quirk
);
932 static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice
*vdev
, int nr
)
935 VFIOConfigMirrorQuirk
*mirror
;
938 if (vdev
->no_geforce_quirks
||
939 !vfio_pci_is(vdev
, PCI_VENDOR_ID_NVIDIA
, PCI_ANY_ID
) ||
940 !vfio_is_vga(vdev
) || nr
!= 0) {
944 quirk
= vfio_quirk_alloc(1);
945 quirk
->reset
= vfio_nvidia_bar0_quirk_reset
;
946 mirror
= quirk
->data
= g_malloc0(sizeof(*mirror
) + sizeof(LastDataSet
));
947 mirror
->mem
= quirk
->mem
;
949 mirror
->offset
= 0x88000;
951 last
= (LastDataSet
*)&mirror
->data
;
954 memory_region_init_io(mirror
->mem
, OBJECT(vdev
),
955 &vfio_nvidia_mirror_quirk
, mirror
,
956 "vfio-nvidia-bar0-88000-mirror-quirk",
958 memory_region_add_subregion_overlap(vdev
->bars
[nr
].region
.mem
,
959 mirror
->offset
, mirror
->mem
, 1);
961 QLIST_INSERT_HEAD(&vdev
->bars
[nr
].quirks
, quirk
, next
);
963 /* The 0x1800 offset mirror only seems to get used by legacy VGA */
965 quirk
= vfio_quirk_alloc(1);
966 quirk
->reset
= vfio_nvidia_bar0_quirk_reset
;
967 mirror
= quirk
->data
= g_malloc0(sizeof(*mirror
) + sizeof(LastDataSet
));
968 mirror
->mem
= quirk
->mem
;
970 mirror
->offset
= 0x1800;
972 last
= (LastDataSet
*)&mirror
->data
;
975 memory_region_init_io(mirror
->mem
, OBJECT(vdev
),
976 &vfio_nvidia_mirror_quirk
, mirror
,
977 "vfio-nvidia-bar0-1800-mirror-quirk",
978 PCI_CONFIG_SPACE_SIZE
);
979 memory_region_add_subregion_overlap(vdev
->bars
[nr
].region
.mem
,
980 mirror
->offset
, mirror
->mem
, 1);
982 QLIST_INSERT_HEAD(&vdev
->bars
[nr
].quirks
, quirk
, next
);
985 trace_vfio_quirk_nvidia_bar0_probe(vdev
->vbasedev
.name
);
989 * TODO - Some Nvidia devices provide config access to their companion HDA
990 * device and even to their parent bridge via these config space mirrors.
991 * Add quirks for those regions.
994 #define PCI_VENDOR_ID_REALTEK 0x10ec
997 * RTL8168 devices have a backdoor that can access the MSI-X table. At BAR2
998 * offset 0x70 there is a dword data register, offset 0x74 is a dword address
999 * register. According to the Linux r8169 driver, the MSI-X table is addressed
1000 * when the "type" portion of the address register is set to 0x1. This appears
1001 * to be bits 16:30. Bit 31 is both a write indicator and some sort of
1002 * "address latched" indicator. Bits 12:15 are a mask field, which we can
1003 * ignore because the MSI-X table should always be accessed as a dword (full
1004 * mask). Bits 0:11 are the offset within the type.
1008 * Read from MSI-X table offset 0
1009 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
1010 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
1011 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
1013 * Write 0xfee00000 to MSI-X table offset 0
1014 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
1015 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
1016 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
1018 typedef struct VFIOrtl8168Quirk
{
1019 VFIOPCIDevice
*vdev
;
1025 static uint64_t vfio_rtl8168_quirk_address_read(void *opaque
,
1026 hwaddr addr
, unsigned size
)
1028 VFIOrtl8168Quirk
*rtl
= opaque
;
1029 VFIOPCIDevice
*vdev
= rtl
->vdev
;
1030 uint64_t data
= vfio_region_read(&vdev
->bars
[2].region
, addr
+ 0x74, size
);
1033 data
= rtl
->addr
^ 0x80000000U
; /* latch/complete */
1034 trace_vfio_quirk_rtl8168_fake_latch(vdev
->vbasedev
.name
, data
);
1040 static void vfio_rtl8168_quirk_address_write(void *opaque
, hwaddr addr
,
1041 uint64_t data
, unsigned size
)
1043 VFIOrtl8168Quirk
*rtl
= opaque
;
1044 VFIOPCIDevice
*vdev
= rtl
->vdev
;
1046 rtl
->enabled
= false;
1048 if ((data
& 0x7fff0000) == 0x10000) { /* MSI-X table */
1049 rtl
->enabled
= true;
1050 rtl
->addr
= (uint32_t)data
;
1052 if (data
& 0x80000000U
) { /* Do write */
1053 if (vdev
->pdev
.cap_present
& QEMU_PCI_CAP_MSIX
) {
1054 hwaddr offset
= data
& 0xfff;
1055 uint64_t val
= rtl
->data
;
1057 trace_vfio_quirk_rtl8168_msix_write(vdev
->vbasedev
.name
,
1058 (uint16_t)offset
, val
);
1060 /* Write to the proper guest MSI-X table instead */
1061 memory_region_dispatch_write(&vdev
->pdev
.msix_table_mmio
,
1063 size_memop(size
) | MO_LE
,
1064 MEMTXATTRS_UNSPECIFIED
);
1066 return; /* Do not write guest MSI-X data to hardware */
1070 vfio_region_write(&vdev
->bars
[2].region
, addr
+ 0x74, data
, size
);
1073 static const MemoryRegionOps vfio_rtl_address_quirk
= {
1074 .read
= vfio_rtl8168_quirk_address_read
,
1075 .write
= vfio_rtl8168_quirk_address_write
,
1077 .min_access_size
= 4,
1078 .max_access_size
= 4,
1081 .endianness
= DEVICE_LITTLE_ENDIAN
,
1084 static uint64_t vfio_rtl8168_quirk_data_read(void *opaque
,
1085 hwaddr addr
, unsigned size
)
1087 VFIOrtl8168Quirk
*rtl
= opaque
;
1088 VFIOPCIDevice
*vdev
= rtl
->vdev
;
1089 uint64_t data
= vfio_region_read(&vdev
->bars
[2].region
, addr
+ 0x70, size
);
1091 if (rtl
->enabled
&& (vdev
->pdev
.cap_present
& QEMU_PCI_CAP_MSIX
)) {
1092 hwaddr offset
= rtl
->addr
& 0xfff;
1093 memory_region_dispatch_read(&vdev
->pdev
.msix_table_mmio
, offset
,
1094 &data
, size_memop(size
) | MO_LE
,
1095 MEMTXATTRS_UNSPECIFIED
);
1096 trace_vfio_quirk_rtl8168_msix_read(vdev
->vbasedev
.name
, offset
, data
);
1102 static void vfio_rtl8168_quirk_data_write(void *opaque
, hwaddr addr
,
1103 uint64_t data
, unsigned size
)
1105 VFIOrtl8168Quirk
*rtl
= opaque
;
1106 VFIOPCIDevice
*vdev
= rtl
->vdev
;
1108 rtl
->data
= (uint32_t)data
;
1110 vfio_region_write(&vdev
->bars
[2].region
, addr
+ 0x70, data
, size
);
1113 static const MemoryRegionOps vfio_rtl_data_quirk
= {
1114 .read
= vfio_rtl8168_quirk_data_read
,
1115 .write
= vfio_rtl8168_quirk_data_write
,
1117 .min_access_size
= 4,
1118 .max_access_size
= 4,
1121 .endianness
= DEVICE_LITTLE_ENDIAN
,
1124 static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice
*vdev
, int nr
)
1127 VFIOrtl8168Quirk
*rtl
;
1129 if (!vfio_pci_is(vdev
, PCI_VENDOR_ID_REALTEK
, 0x8168) || nr
!= 2) {
1133 quirk
= vfio_quirk_alloc(2);
1134 quirk
->data
= rtl
= g_malloc0(sizeof(*rtl
));
1137 memory_region_init_io(&quirk
->mem
[0], OBJECT(vdev
),
1138 &vfio_rtl_address_quirk
, rtl
,
1139 "vfio-rtl8168-window-address-quirk", 4);
1140 memory_region_add_subregion_overlap(vdev
->bars
[nr
].region
.mem
,
1141 0x74, &quirk
->mem
[0], 1);
1143 memory_region_init_io(&quirk
->mem
[1], OBJECT(vdev
),
1144 &vfio_rtl_data_quirk
, rtl
,
1145 "vfio-rtl8168-window-data-quirk", 4);
1146 memory_region_add_subregion_overlap(vdev
->bars
[nr
].region
.mem
,
1147 0x70, &quirk
->mem
[1], 1);
1149 QLIST_INSERT_HEAD(&vdev
->bars
[nr
].quirks
, quirk
, next
);
1151 trace_vfio_quirk_rtl8168_probe(vdev
->vbasedev
.name
);
1154 #define IGD_ASLS 0xfc /* ASL Storage Register */
1157 * The OpRegion includes the Video BIOS Table, which seems important for
1158 * telling the driver what sort of outputs it has. Without this, the device
1159 * may work in the guest, but we may not get output. This also requires BIOS
1160 * support to reserve and populate a section of guest memory sufficient for
1161 * the table and to write the base address of that memory to the ASLS register
1162 * of the IGD device.
1164 int vfio_pci_igd_opregion_init(VFIOPCIDevice
*vdev
,
1165 struct vfio_region_info
*info
, Error
**errp
)
1169 vdev
->igd_opregion
= g_malloc0(info
->size
);
1170 ret
= pread(vdev
->vbasedev
.fd
, vdev
->igd_opregion
,
1171 info
->size
, info
->offset
);
1172 if (ret
!= info
->size
) {
1173 error_setg(errp
, "failed to read IGD OpRegion");
1174 g_free(vdev
->igd_opregion
);
1175 vdev
->igd_opregion
= NULL
;
1180 * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to
1181 * allocate 32bit reserved memory for, copy these contents into, and write
1182 * the reserved memory base address to the device ASLS register at 0xFC.
1183 * Alignment of this reserved region seems flexible, but using a 4k page
1184 * alignment seems to work well. This interface assumes a single IGD
1185 * device, which may be at VM address 00:02.0 in legacy mode or another
1186 * address in UPT mode.
1188 * NB, there may be future use cases discovered where the VM should have
1189 * direct interaction with the host OpRegion, in which case the write to
1190 * the ASLS register would trigger MemoryRegion setup to enable that.
1192 fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion",
1193 vdev
->igd_opregion
, info
->size
);
1195 trace_vfio_pci_igd_opregion_enabled(vdev
->vbasedev
.name
);
1197 pci_set_long(vdev
->pdev
.config
+ IGD_ASLS
, 0);
1198 pci_set_long(vdev
->pdev
.wmask
+ IGD_ASLS
, ~0);
1199 pci_set_long(vdev
->emulated_config_bits
+ IGD_ASLS
, ~0);
1205 * Common quirk probe entry points.
1207 void vfio_vga_quirk_setup(VFIOPCIDevice
*vdev
)
1209 vfio_vga_probe_ati_3c3_quirk(vdev
);
1210 vfio_vga_probe_nvidia_3d0_quirk(vdev
);
1213 void vfio_vga_quirk_exit(VFIOPCIDevice
*vdev
)
1218 for (i
= 0; i
< ARRAY_SIZE(vdev
->vga
->region
); i
++) {
1219 QLIST_FOREACH(quirk
, &vdev
->vga
->region
[i
].quirks
, next
) {
1220 for (j
= 0; j
< quirk
->nr_mem
; j
++) {
1221 memory_region_del_subregion(&vdev
->vga
->region
[i
].mem
,
1228 void vfio_vga_quirk_finalize(VFIOPCIDevice
*vdev
)
1232 for (i
= 0; i
< ARRAY_SIZE(vdev
->vga
->region
); i
++) {
1233 while (!QLIST_EMPTY(&vdev
->vga
->region
[i
].quirks
)) {
1234 VFIOQuirk
*quirk
= QLIST_FIRST(&vdev
->vga
->region
[i
].quirks
);
1235 QLIST_REMOVE(quirk
, next
);
1236 for (j
= 0; j
< quirk
->nr_mem
; j
++) {
1237 object_unparent(OBJECT(&quirk
->mem
[j
]));
1240 g_free(quirk
->data
);
1246 void vfio_bar_quirk_setup(VFIOPCIDevice
*vdev
, int nr
)
1248 vfio_probe_ati_bar4_quirk(vdev
, nr
);
1249 vfio_probe_ati_bar2_quirk(vdev
, nr
);
1250 vfio_probe_nvidia_bar5_quirk(vdev
, nr
);
1251 vfio_probe_nvidia_bar0_quirk(vdev
, nr
);
1252 vfio_probe_rtl8168_bar2_quirk(vdev
, nr
);
1253 #ifdef CONFIG_VFIO_IGD
1254 vfio_probe_igd_bar4_quirk(vdev
, nr
);
1258 void vfio_bar_quirk_exit(VFIOPCIDevice
*vdev
, int nr
)
1260 VFIOBAR
*bar
= &vdev
->bars
[nr
];
1264 QLIST_FOREACH(quirk
, &bar
->quirks
, next
) {
1265 while (!QLIST_EMPTY(&quirk
->ioeventfds
)) {
1266 vfio_ioeventfd_exit(vdev
, QLIST_FIRST(&quirk
->ioeventfds
));
1269 for (i
= 0; i
< quirk
->nr_mem
; i
++) {
1270 memory_region_del_subregion(bar
->region
.mem
, &quirk
->mem
[i
]);
1275 void vfio_bar_quirk_finalize(VFIOPCIDevice
*vdev
, int nr
)
1277 VFIOBAR
*bar
= &vdev
->bars
[nr
];
1280 while (!QLIST_EMPTY(&bar
->quirks
)) {
1281 VFIOQuirk
*quirk
= QLIST_FIRST(&bar
->quirks
);
1282 QLIST_REMOVE(quirk
, next
);
1283 for (i
= 0; i
< quirk
->nr_mem
; i
++) {
1284 object_unparent(OBJECT(&quirk
->mem
[i
]));
1287 g_free(quirk
->data
);
1295 void vfio_quirk_reset(VFIOPCIDevice
*vdev
)
1299 for (i
= 0; i
< PCI_ROM_SLOT
; i
++) {
1301 VFIOBAR
*bar
= &vdev
->bars
[i
];
1303 QLIST_FOREACH(quirk
, &bar
->quirks
, next
) {
1305 quirk
->reset(vdev
, quirk
);
1312 * AMD Radeon PCI config reset, based on Linux:
1313 * drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running()
1314 * drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset
1315 * drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc()
1316 * drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock()
1317 * IDs: include/drm/drm_pciids.h
1318 * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0
1320 * Bonaire and Hawaii GPUs do not respond to a bus reset. This is a bug in the
1321 * hardware that should be fixed on future ASICs. The symptom of this is that
1322 * once the accelerated driver loads, Windows guests will bsod on subsequent
1323 * attempts to load the driver, such as after VM reset or shutdown/restart. To
1324 * work around this, we do an AMD specific PCI config reset, followed by an SMC
1325 * reset. The PCI config reset only works if SMC firmware is running, so we
1326 * have a dependency on the state of the device as to whether this reset will
1327 * be effective. There are still cases where we won't be able to kick the
1328 * device into working, but this greatly improves the usability overall. The
1329 * config reset magic is relatively common on AMD GPUs, but the setup and SMC
1330 * poking is largely ASIC specific.
1332 static bool vfio_radeon_smc_is_running(VFIOPCIDevice
*vdev
)
1337 * Registers 200h and 204h are index and data registers for accessing
1338 * indirect configuration registers within the device.
1340 vfio_region_write(&vdev
->bars
[5].region
, 0x200, 0x80000004, 4);
1341 clk
= vfio_region_read(&vdev
->bars
[5].region
, 0x204, 4);
1342 vfio_region_write(&vdev
->bars
[5].region
, 0x200, 0x80000370, 4);
1343 pc_c
= vfio_region_read(&vdev
->bars
[5].region
, 0x204, 4);
1345 return (!(clk
& 1) && (0x20100 <= pc_c
));
1349 * The scope of a config reset is controlled by a mode bit in the misc register
1350 * and a fuse, exposed as a bit in another register. The fuse is the default
1351 * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the formula
1352 * scope = !(misc ^ fuse), where the resulting scope is defined the same as
1353 * the fuse. A truth table therefore tells us that if misc == fuse, we need
1354 * to flip the value of the bit in the misc register.
1356 static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice
*vdev
)
1358 uint32_t misc
, fuse
;
1361 vfio_region_write(&vdev
->bars
[5].region
, 0x200, 0xc00c0000, 4);
1362 fuse
= vfio_region_read(&vdev
->bars
[5].region
, 0x204, 4);
1365 vfio_region_write(&vdev
->bars
[5].region
, 0x200, 0xc0000010, 4);
1366 misc
= vfio_region_read(&vdev
->bars
[5].region
, 0x204, 4);
1370 vfio_region_write(&vdev
->bars
[5].region
, 0x204, misc
^ 2, 4);
1371 vfio_region_read(&vdev
->bars
[5].region
, 0x204, 4); /* flush */
1375 static int vfio_radeon_reset(VFIOPCIDevice
*vdev
)
1377 PCIDevice
*pdev
= &vdev
->pdev
;
1381 /* Defer to a kernel implemented reset */
1382 if (vdev
->vbasedev
.reset_works
) {
1383 trace_vfio_quirk_ati_bonaire_reset_skipped(vdev
->vbasedev
.name
);
1387 /* Enable only memory BAR access */
1388 vfio_pci_write_config(pdev
, PCI_COMMAND
, PCI_COMMAND_MEMORY
, 2);
1390 /* Reset only works if SMC firmware is loaded and running */
1391 if (!vfio_radeon_smc_is_running(vdev
)) {
1393 trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev
->vbasedev
.name
);
1397 /* Make sure only the GFX function is reset */
1398 vfio_radeon_set_gfx_only_reset(vdev
);
1400 /* AMD PCI config reset */
1401 vfio_pci_write_config(pdev
, 0x7c, 0x39d5e86b, 4);
1404 /* Read back the memory size to make sure we're out of reset */
1405 for (i
= 0; i
< 100000; i
++) {
1406 if (vfio_region_read(&vdev
->bars
[5].region
, 0x5428, 4) != 0xffffffff) {
1412 trace_vfio_quirk_ati_bonaire_reset_timeout(vdev
->vbasedev
.name
);
1416 vfio_region_write(&vdev
->bars
[5].region
, 0x200, 0x80000000, 4);
1417 data
= vfio_region_read(&vdev
->bars
[5].region
, 0x204, 4);
1419 vfio_region_write(&vdev
->bars
[5].region
, 0x204, data
, 4);
1421 /* Disable SMC clock */
1422 vfio_region_write(&vdev
->bars
[5].region
, 0x200, 0x80000004, 4);
1423 data
= vfio_region_read(&vdev
->bars
[5].region
, 0x204, 4);
1425 vfio_region_write(&vdev
->bars
[5].region
, 0x204, data
, 4);
1427 trace_vfio_quirk_ati_bonaire_reset_done(vdev
->vbasedev
.name
);
1430 /* Restore PCI command register */
1431 vfio_pci_write_config(pdev
, PCI_COMMAND
, 0, 2);
1436 void vfio_setup_resetfn_quirk(VFIOPCIDevice
*vdev
)
1438 switch (vdev
->vendor_id
) {
1440 switch (vdev
->device_id
) {
1442 case 0x6649: /* Bonaire [FirePro W5100] */
1445 case 0x6658: /* Bonaire XTX [Radeon R7 260X] */
1446 case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */
1447 case 0x665d: /* Bonaire [Radeon R7 200 Series] */
1449 case 0x67A0: /* Hawaii XT GL [FirePro W9100] */
1450 case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */
1455 case 0x67B0: /* Hawaii XT [Radeon R9 290X] */
1456 case 0x67B1: /* Hawaii PRO [Radeon R9 290] */
1461 vdev
->resetfn
= vfio_radeon_reset
;
1462 trace_vfio_quirk_ati_bonaire_reset(vdev
->vbasedev
.name
);
1470 * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify
1471 * devices as a member of a clique. Devices within the same clique ID
1472 * are capable of direct P2P. It's the user's responsibility that this
1473 * is correct. The spec says that this may reside at any unused config
1474 * offset, but reserves and recommends hypervisors place this at C8h.
1475 * The spec also states that the hypervisor should place this capability
1476 * at the end of the capability list, thus next is defined as 0h.
1478 * +----------------+----------------+----------------+----------------+
1479 * | sig 7:0 ('P') | vndr len (8h) | next (0h) | cap id (9h) |
1480 * +----------------+----------------+----------------+----------------+
1481 * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)| sig 23:8 ('P2') |
1482 * +---------------------------------+---------------------------------+
1484 * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
1486 static void get_nv_gpudirect_clique_id(Object
*obj
, Visitor
*v
,
1487 const char *name
, void *opaque
,
1490 DeviceState
*dev
= DEVICE(obj
);
1491 Property
*prop
= opaque
;
1492 uint8_t *ptr
= qdev_get_prop_ptr(dev
, prop
);
1494 visit_type_uint8(v
, name
, ptr
, errp
);
1497 static void set_nv_gpudirect_clique_id(Object
*obj
, Visitor
*v
,
1498 const char *name
, void *opaque
,
1501 DeviceState
*dev
= DEVICE(obj
);
1502 Property
*prop
= opaque
;
1503 uint8_t value
, *ptr
= qdev_get_prop_ptr(dev
, prop
);
1504 Error
*local_err
= NULL
;
1506 if (dev
->realized
) {
1507 qdev_prop_set_after_realize(dev
, name
, errp
);
1511 visit_type_uint8(v
, name
, &value
, &local_err
);
1513 error_propagate(errp
, local_err
);
1518 error_setg(errp
, "Property %s: valid range 0-15", name
);
1525 const PropertyInfo qdev_prop_nv_gpudirect_clique
= {
1527 .description
= "NVIDIA GPUDirect Clique ID (0 - 15)",
1528 .get
= get_nv_gpudirect_clique_id
,
1529 .set
= set_nv_gpudirect_clique_id
,
1532 static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice
*vdev
, Error
**errp
)
1534 PCIDevice
*pdev
= &vdev
->pdev
;
1535 int ret
, pos
= 0xC8;
1537 if (vdev
->nv_gpudirect_clique
== 0xFF) {
1541 if (!vfio_pci_is(vdev
, PCI_VENDOR_ID_NVIDIA
, PCI_ANY_ID
)) {
1542 error_setg(errp
, "NVIDIA GPUDirect Clique ID: invalid device vendor");
1546 if (pci_get_byte(pdev
->config
+ PCI_CLASS_DEVICE
+ 1) !=
1547 PCI_BASE_CLASS_DISPLAY
) {
1548 error_setg(errp
, "NVIDIA GPUDirect Clique ID: unsupported PCI class");
1552 ret
= pci_add_capability(pdev
, PCI_CAP_ID_VNDR
, pos
, 8, errp
);
1554 error_prepend(errp
, "Failed to add NVIDIA GPUDirect cap: ");
1558 memset(vdev
->emulated_config_bits
+ pos
, 0xFF, 8);
1559 pos
+= PCI_CAP_FLAGS
;
1560 pci_set_byte(pdev
->config
+ pos
++, 8);
1561 pci_set_byte(pdev
->config
+ pos
++, 'P');
1562 pci_set_byte(pdev
->config
+ pos
++, '2');
1563 pci_set_byte(pdev
->config
+ pos
++, 'P');
1564 pci_set_byte(pdev
->config
+ pos
++, vdev
->nv_gpudirect_clique
<< 3);
1565 pci_set_byte(pdev
->config
+ pos
, 0);
1570 int vfio_add_virt_caps(VFIOPCIDevice
*vdev
, Error
**errp
)
1574 ret
= vfio_add_nv_gpudirect_cap(vdev
, errp
);
1582 static void vfio_pci_nvlink2_get_tgt(Object
*obj
, Visitor
*v
,
1584 void *opaque
, Error
**errp
)
1586 uint64_t tgt
= (uintptr_t) opaque
;
1587 visit_type_uint64(v
, name
, &tgt
, errp
);
1590 static void vfio_pci_nvlink2_get_link_speed(Object
*obj
, Visitor
*v
,
1592 void *opaque
, Error
**errp
)
1594 uint32_t link_speed
= (uint32_t)(uintptr_t) opaque
;
1595 visit_type_uint32(v
, name
, &link_speed
, errp
);
1598 int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice
*vdev
, Error
**errp
)
1602 struct vfio_region_info
*nv2reg
= NULL
;
1603 struct vfio_info_cap_header
*hdr
;
1604 struct vfio_region_info_cap_nvlink2_ssatgt
*cap
;
1607 ret
= vfio_get_dev_region_info(&vdev
->vbasedev
,
1608 VFIO_REGION_TYPE_PCI_VENDOR_TYPE
|
1609 PCI_VENDOR_ID_NVIDIA
,
1610 VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM
,
1616 hdr
= vfio_get_region_info_cap(nv2reg
, VFIO_REGION_INFO_CAP_NVLINK2_SSATGT
);
1623 p
= mmap(NULL
, nv2reg
->size
, PROT_READ
| PROT_WRITE
| PROT_EXEC
,
1624 MAP_SHARED
, vdev
->vbasedev
.fd
, nv2reg
->offset
);
1625 if (p
== MAP_FAILED
) {
1630 quirk
= vfio_quirk_alloc(1);
1631 memory_region_init_ram_ptr(&quirk
->mem
[0], OBJECT(vdev
), "nvlink2-mr",
1633 QLIST_INSERT_HEAD(&vdev
->bars
[0].quirks
, quirk
, next
);
1635 object_property_add(OBJECT(vdev
), "nvlink2-tgt", "uint64",
1636 vfio_pci_nvlink2_get_tgt
, NULL
, NULL
,
1637 (void *) (uintptr_t) cap
->tgt
, NULL
);
1638 trace_vfio_pci_nvidia_gpu_setup_quirk(vdev
->vbasedev
.name
, cap
->tgt
,
1646 int vfio_pci_nvlink2_init(VFIOPCIDevice
*vdev
, Error
**errp
)
1650 struct vfio_region_info
*atsdreg
= NULL
;
1651 struct vfio_info_cap_header
*hdr
;
1652 struct vfio_region_info_cap_nvlink2_ssatgt
*captgt
;
1653 struct vfio_region_info_cap_nvlink2_lnkspd
*capspeed
;
1656 ret
= vfio_get_dev_region_info(&vdev
->vbasedev
,
1657 VFIO_REGION_TYPE_PCI_VENDOR_TYPE
|
1659 VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD
,
1665 hdr
= vfio_get_region_info_cap(atsdreg
,
1666 VFIO_REGION_INFO_CAP_NVLINK2_SSATGT
);
1671 captgt
= (void *) hdr
;
1673 hdr
= vfio_get_region_info_cap(atsdreg
,
1674 VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD
);
1679 capspeed
= (void *) hdr
;
1681 /* Some NVLink bridges may not have assigned ATSD */
1682 if (atsdreg
->size
) {
1683 p
= mmap(NULL
, atsdreg
->size
, PROT_READ
| PROT_WRITE
| PROT_EXEC
,
1684 MAP_SHARED
, vdev
->vbasedev
.fd
, atsdreg
->offset
);
1685 if (p
== MAP_FAILED
) {
1690 quirk
= vfio_quirk_alloc(1);
1691 memory_region_init_ram_device_ptr(&quirk
->mem
[0], OBJECT(vdev
),
1692 "nvlink2-atsd-mr", atsdreg
->size
, p
);
1693 QLIST_INSERT_HEAD(&vdev
->bars
[0].quirks
, quirk
, next
);
1696 object_property_add(OBJECT(vdev
), "nvlink2-tgt", "uint64",
1697 vfio_pci_nvlink2_get_tgt
, NULL
, NULL
,
1698 (void *) (uintptr_t) captgt
->tgt
, NULL
);
1699 trace_vfio_pci_nvlink2_setup_quirk_ssatgt(vdev
->vbasedev
.name
, captgt
->tgt
,
1702 object_property_add(OBJECT(vdev
), "nvlink2-link-speed", "uint32",
1703 vfio_pci_nvlink2_get_link_speed
, NULL
, NULL
,
1704 (void *) (uintptr_t) capspeed
->link_speed
, NULL
);
1705 trace_vfio_pci_nvlink2_setup_quirk_lnkspd(vdev
->vbasedev
.name
,
1706 capspeed
->link_speed
);