2 * Copyright (c) 2007, Neocleus Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
18 * Assign a PCI device from the host to a guest VM.
20 * Adapted for KVM by Qumranet.
22 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
23 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
24 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
25 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
26 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
31 #include <sys/types.h>
36 #include "qemu-error.h"
38 #include "device-assignment.h"
44 #define MSIX_PAGE_SIZE 0x1000
46 /* From linux/ioport.h */
47 #define IORESOURCE_IO 0x00000100 /* Resource type */
48 #define IORESOURCE_MEM 0x00000200
49 #define IORESOURCE_IRQ 0x00000400
50 #define IORESOURCE_DMA 0x00000800
51 #define IORESOURCE_PREFETCH 0x00002000 /* No side effects */
53 /* #define DEVICE_ASSIGNMENT_DEBUG 1 */
/* Debug trace helper: expands to a stderr message prefixed with the calling
 * function's name when DEVICE_ASSIGNMENT_DEBUG is defined, and to a no-op
 * statement otherwise. */
#ifdef DEVICE_ASSIGNMENT_DEBUG
#define DEBUG(fmt, ...) \
    do { \
        fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__); \
    } while (0)
#else
#define DEBUG(fmt, ...) do { } while(0)
#endif
64 static void assigned_dev_load_option_rom(AssignedDevice
*dev
);
66 static void assigned_dev_unregister_msix_mmio(AssignedDevice
*dev
);
68 static uint64_t assigned_dev_ioport_rw(AssignedDevRegion
*dev_region
,
69 target_phys_addr_t addr
, int size
,
73 int fd
= dev_region
->region
->resource_fd
;
77 DEBUG("pwrite data=%lx, size=%d, e_phys=%lx, addr=%lx\n",
78 *data
, size
, addr
, addr
);
79 if (pwrite(fd
, data
, size
, addr
) != size
) {
80 fprintf(stderr
, "%s - pwrite failed %s\n",
81 __func__
, strerror(errno
));
84 if (pread(fd
, &val
, size
, addr
) != size
) {
85 fprintf(stderr
, "%s - pread failed %s\n",
86 __func__
, strerror(errno
));
87 val
= (1UL << (size
* 8)) - 1;
89 DEBUG("pread val=%lx, size=%d, e_phys=%lx, addr=%lx\n",
90 val
, size
, addr
, addr
);
93 uint32_t port
= addr
+ dev_region
->u
.r_baseport
;
96 DEBUG("out data=%lx, size=%d, e_phys=%lx, host=%x\n",
97 *data
, size
, addr
, port
);
121 DEBUG("in data=%lx, size=%d, e_phys=%lx, host=%x\n",
122 val
, size
, addr
, port
);
128 static void assigned_dev_ioport_write(void *opaque
, target_phys_addr_t addr
,
129 uint64_t data
, unsigned size
)
131 assigned_dev_ioport_rw(opaque
, addr
, size
, &data
);
134 static uint64_t assigned_dev_ioport_read(void *opaque
,
135 target_phys_addr_t addr
, unsigned size
)
137 return assigned_dev_ioport_rw(opaque
, addr
, size
, NULL
);
140 static uint32_t slow_bar_readb(void *opaque
, target_phys_addr_t addr
)
142 AssignedDevRegion
*d
= opaque
;
143 uint8_t *in
= d
->u
.r_virtbase
+ addr
;
147 DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, r
);
152 static uint32_t slow_bar_readw(void *opaque
, target_phys_addr_t addr
)
154 AssignedDevRegion
*d
= opaque
;
155 uint16_t *in
= d
->u
.r_virtbase
+ addr
;
159 DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, r
);
164 static uint32_t slow_bar_readl(void *opaque
, target_phys_addr_t addr
)
166 AssignedDevRegion
*d
= opaque
;
167 uint32_t *in
= d
->u
.r_virtbase
+ addr
;
171 DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, r
);
176 static void slow_bar_writeb(void *opaque
, target_phys_addr_t addr
, uint32_t val
)
178 AssignedDevRegion
*d
= opaque
;
179 uint8_t *out
= d
->u
.r_virtbase
+ addr
;
181 DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx
" val=0x%02x\n", addr
, val
);
185 static void slow_bar_writew(void *opaque
, target_phys_addr_t addr
, uint32_t val
)
187 AssignedDevRegion
*d
= opaque
;
188 uint16_t *out
= d
->u
.r_virtbase
+ addr
;
190 DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx
" val=0x%04x\n", addr
, val
);
194 static void slow_bar_writel(void *opaque
, target_phys_addr_t addr
, uint32_t val
)
196 AssignedDevRegion
*d
= opaque
;
197 uint32_t *out
= d
->u
.r_virtbase
+ addr
;
199 DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, val
);
203 static const MemoryRegionOps slow_bar_ops
= {
205 .read
= { slow_bar_readb
, slow_bar_readw
, slow_bar_readl
, },
206 .write
= { slow_bar_writeb
, slow_bar_writew
, slow_bar_writel
, },
208 .endianness
= DEVICE_NATIVE_ENDIAN
,
211 static void assigned_dev_iomem_setup(PCIDevice
*pci_dev
, int region_num
,
214 AssignedDevice
*r_dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
215 AssignedDevRegion
*region
= &r_dev
->v_addrs
[region_num
];
216 PCIRegion
*real_region
= &r_dev
->real_device
.regions
[region_num
];
219 memory_region_init(®ion
->container
, "assigned-dev-container",
221 memory_region_add_subregion(®ion
->container
, 0, ®ion
->real_iomem
);
223 /* deal with MSI-X MMIO page */
224 if (real_region
->base_addr
<= r_dev
->msix_table_addr
&&
225 real_region
->base_addr
+ real_region
->size
>
226 r_dev
->msix_table_addr
) {
227 int offset
= r_dev
->msix_table_addr
- real_region
->base_addr
;
229 memory_region_add_subregion_overlap(®ion
->container
,
237 static const MemoryRegionOps assigned_dev_ioport_ops
= {
238 .read
= assigned_dev_ioport_read
,
239 .write
= assigned_dev_ioport_write
,
240 .endianness
= DEVICE_NATIVE_ENDIAN
,
243 static void assigned_dev_ioport_setup(PCIDevice
*pci_dev
, int region_num
,
246 AssignedDevice
*r_dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
247 AssignedDevRegion
*region
= &r_dev
->v_addrs
[region_num
];
250 region
->e_size
= size
;
252 if (region
->region
->resource_fd
< 0) {
253 r
= kvm_add_ioport_region(region
->u
.r_baseport
, region
->r_size
,
254 pci_dev
->qdev
.hotplugged
);
256 fprintf(stderr
, "%s: failed to enable ioport access (%m)\n",
260 memory_region_init(®ion
->container
, "assigned-dev-container", size
);
261 memory_region_init_io(®ion
->real_iomem
, &assigned_dev_ioport_ops
,
262 r_dev
->v_addrs
+ region_num
,
263 "assigned-dev-iomem", size
);
264 memory_region_add_subregion(®ion
->container
, 0, ®ion
->real_iomem
);
267 static uint32_t assigned_dev_pci_read(PCIDevice
*d
, int pos
, int len
)
269 AssignedDevice
*pci_dev
= DO_UPCAST(AssignedDevice
, dev
, d
);
272 int fd
= pci_dev
->real_device
.config_fd
;
275 ret
= pread(fd
, &val
, len
, pos
);
277 if ((ret
< 0) && (errno
== EINTR
|| errno
== EAGAIN
))
280 fprintf(stderr
, "%s: pread failed, ret = %zd errno = %d\n",
281 __func__
, ret
, errno
);
289 static uint8_t assigned_dev_pci_read_byte(PCIDevice
*d
, int pos
)
291 return (uint8_t)assigned_dev_pci_read(d
, pos
, 1);
294 static void assigned_dev_pci_write(PCIDevice
*d
, int pos
, uint32_t val
, int len
)
296 AssignedDevice
*pci_dev
= DO_UPCAST(AssignedDevice
, dev
, d
);
298 int fd
= pci_dev
->real_device
.config_fd
;
301 ret
= pwrite(fd
, &val
, len
, pos
);
303 if ((ret
< 0) && (errno
== EINTR
|| errno
== EAGAIN
))
306 fprintf(stderr
, "%s: pwrite failed, ret = %zd errno = %d\n",
307 __func__
, ret
, errno
);
315 static void assigned_dev_emulate_config_read(AssignedDevice
*dev
,
316 uint32_t offset
, uint32_t len
)
318 memset(dev
->emulate_config_read
+ offset
, 0xff, len
);
321 static void assigned_dev_direct_config_read(AssignedDevice
*dev
,
322 uint32_t offset
, uint32_t len
)
324 memset(dev
->emulate_config_read
+ offset
, 0, len
);
327 static void assigned_dev_direct_config_write(AssignedDevice
*dev
,
328 uint32_t offset
, uint32_t len
)
330 memset(dev
->emulate_config_write
+ offset
, 0, len
);
333 static uint8_t pci_find_cap_offset(PCIDevice
*d
, uint8_t cap
, uint8_t start
)
337 int pos
= start
? start
: PCI_CAPABILITY_LIST
;
340 status
= assigned_dev_pci_read_byte(d
, PCI_STATUS
);
341 if ((status
& PCI_STATUS_CAP_LIST
) == 0)
345 pos
= assigned_dev_pci_read_byte(d
, pos
);
350 id
= assigned_dev_pci_read_byte(d
, pos
+ PCI_CAP_LIST_ID
);
357 pos
+= PCI_CAP_LIST_NEXT
;
362 static int assigned_dev_register_regions(PCIRegion
*io_regions
,
363 unsigned long regions_num
,
364 AssignedDevice
*pci_dev
)
367 PCIRegion
*cur_region
= io_regions
;
369 for (i
= 0; i
< regions_num
; i
++, cur_region
++) {
370 if (!cur_region
->valid
)
372 pci_dev
->v_addrs
[i
].num
= i
;
374 /* handle memory io regions */
375 if (cur_region
->type
& IORESOURCE_MEM
) {
376 int t
= cur_region
->type
& IORESOURCE_PREFETCH
377 ? PCI_BASE_ADDRESS_MEM_PREFETCH
378 : PCI_BASE_ADDRESS_SPACE_MEMORY
;
380 /* map physical memory */
381 pci_dev
->v_addrs
[i
].u
.r_virtbase
= mmap(NULL
, cur_region
->size
,
382 PROT_WRITE
| PROT_READ
,
384 cur_region
->resource_fd
,
387 if (pci_dev
->v_addrs
[i
].u
.r_virtbase
== MAP_FAILED
) {
388 pci_dev
->v_addrs
[i
].u
.r_virtbase
= NULL
;
389 fprintf(stderr
, "%s: Error: Couldn't mmap 0x%x!"
391 (uint32_t) (cur_region
->base_addr
));
395 pci_dev
->v_addrs
[i
].r_size
= cur_region
->size
;
396 pci_dev
->v_addrs
[i
].e_size
= 0;
399 pci_dev
->v_addrs
[i
].u
.r_virtbase
+=
400 (cur_region
->base_addr
& 0xFFF);
402 if (cur_region
->size
& 0xFFF) {
403 fprintf(stderr
, "PCI region %d at address 0x%llx "
404 "has size 0x%x, which is not a multiple of 4K. "
405 "You might experience some performance hit "
407 i
, (unsigned long long)cur_region
->base_addr
,
409 memory_region_init_io(&pci_dev
->v_addrs
[i
].real_iomem
,
410 &slow_bar_ops
, &pci_dev
->v_addrs
[i
],
411 "assigned-dev-slow-bar",
414 void *virtbase
= pci_dev
->v_addrs
[i
].u
.r_virtbase
;
416 snprintf(name
, sizeof(name
), "%s.bar%d",
417 object_get_typename(OBJECT(pci_dev
)), i
);
418 memory_region_init_ram_ptr(&pci_dev
->v_addrs
[i
].real_iomem
,
419 name
, cur_region
->size
,
421 vmstate_register_ram(&pci_dev
->v_addrs
[i
].real_iomem
,
425 assigned_dev_iomem_setup(&pci_dev
->dev
, i
, cur_region
->size
);
426 pci_register_bar((PCIDevice
*) pci_dev
, i
, t
,
427 &pci_dev
->v_addrs
[i
].container
);
430 /* handle port io regions */
434 /* Test kernel support for ioport resource read/write. Old
435 * kernels return EIO. New kernels only allow 1/2/4 byte reads
436 * so should return EINVAL for a 3 byte read */
437 ret
= pread(pci_dev
->v_addrs
[i
].region
->resource_fd
, &val
, 3, 0);
439 fprintf(stderr
, "Unexpected return from I/O port read: %d\n",
442 } else if (errno
!= EINVAL
) {
443 fprintf(stderr
, "Using raw in/out ioport access (sysfs - %s)\n",
445 close(pci_dev
->v_addrs
[i
].region
->resource_fd
);
446 pci_dev
->v_addrs
[i
].region
->resource_fd
= -1;
449 pci_dev
->v_addrs
[i
].u
.r_baseport
= cur_region
->base_addr
;
450 pci_dev
->v_addrs
[i
].r_size
= cur_region
->size
;
451 pci_dev
->v_addrs
[i
].e_size
= 0;
453 assigned_dev_ioport_setup(&pci_dev
->dev
, i
, cur_region
->size
);
454 pci_register_bar((PCIDevice
*) pci_dev
, i
,
455 PCI_BASE_ADDRESS_SPACE_IO
,
456 &pci_dev
->v_addrs
[i
].container
);
/*
 * Read a numeric sysfs attribute (e.g. "vendor" or "device") for the
 * device at @devpath into @val.  Returns 0 on success, -1 on open or
 * parse failure.
 *
 * Fix: the parse-failure path previously returned without closing the
 * stream, leaking a FILE* per failed lookup; the stream is now closed
 * on every path.
 */
static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
{
    FILE *f;
    char name[128];
    long id;

    snprintf(name, sizeof(name), "%s%s", devpath, idname);
    f = fopen(name, "r");
    if (f == NULL) {
        fprintf(stderr, "%s: %s: %m\n", __func__, name);
        return -1;
    }
    if (fscanf(f, "%li\n", &id) == 1) {
        *val = id;
    } else {
        fclose(f);
        return -1;
    }
    fclose(f);

    return 0;
}
/* Read the device's 16-bit PCI vendor ID from its sysfs directory. */
static int get_real_vendor_id(const char *devpath, uint16_t *val)
{
    return get_real_id(devpath, "vendor", val);
}
/* Read the device's 16-bit PCI device ID from its sysfs directory. */
static int get_real_device_id(const char *devpath, uint16_t *val)
{
    return get_real_id(devpath, "device", val);
}
496 static int get_real_device(AssignedDevice
*pci_dev
, uint16_t r_seg
,
497 uint8_t r_bus
, uint8_t r_dev
, uint8_t r_func
)
499 char dir
[128], name
[128];
502 unsigned long long start
, end
, size
, flags
;
506 PCIDevRegions
*dev
= &pci_dev
->real_device
;
508 dev
->region_number
= 0;
510 snprintf(dir
, sizeof(dir
), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/",
511 r_seg
, r_bus
, r_dev
, r_func
);
513 snprintf(name
, sizeof(name
), "%sconfig", dir
);
515 if (pci_dev
->configfd_name
&& *pci_dev
->configfd_name
) {
516 if (qemu_isdigit(pci_dev
->configfd_name
[0])) {
517 dev
->config_fd
= strtol(pci_dev
->configfd_name
, NULL
, 0);
519 dev
->config_fd
= monitor_get_fd(cur_mon
, pci_dev
->configfd_name
);
520 if (dev
->config_fd
< 0) {
521 fprintf(stderr
, "%s: (%s) unkown\n", __func__
,
522 pci_dev
->configfd_name
);
527 dev
->config_fd
= open(name
, O_RDWR
);
529 if (dev
->config_fd
== -1) {
530 fprintf(stderr
, "%s: %s: %m\n", __func__
, name
);
535 r
= read(dev
->config_fd
, pci_dev
->dev
.config
,
536 pci_config_size(&pci_dev
->dev
));
538 if (errno
== EINTR
|| errno
== EAGAIN
)
540 fprintf(stderr
, "%s: read failed, errno = %d\n", __func__
, errno
);
543 /* Restore or clear multifunction, this is always controlled by qemu */
544 if (pci_dev
->dev
.cap_present
& QEMU_PCI_CAP_MULTIFUNCTION
) {
545 pci_dev
->dev
.config
[PCI_HEADER_TYPE
] |= PCI_HEADER_TYPE_MULTI_FUNCTION
;
547 pci_dev
->dev
.config
[PCI_HEADER_TYPE
] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION
;
550 /* Clear host resource mapping info. If we choose not to register a
551 * BAR, such as might be the case with the option ROM, we can get
552 * confusing, unwritable, residual addresses from the host here. */
553 memset(&pci_dev
->dev
.config
[PCI_BASE_ADDRESS_0
], 0, 24);
554 memset(&pci_dev
->dev
.config
[PCI_ROM_ADDRESS
], 0, 4);
556 snprintf(name
, sizeof(name
), "%sresource", dir
);
558 f
= fopen(name
, "r");
560 fprintf(stderr
, "%s: %s: %m\n", __func__
, name
);
564 for (r
= 0; r
< PCI_ROM_SLOT
; r
++) {
565 if (fscanf(f
, "%lli %lli %lli\n", &start
, &end
, &flags
) != 3)
568 rp
= dev
->regions
+ r
;
570 rp
->resource_fd
= -1;
571 size
= end
- start
+ 1;
572 flags
&= IORESOURCE_IO
| IORESOURCE_MEM
| IORESOURCE_PREFETCH
;
573 if (size
== 0 || (flags
& ~IORESOURCE_PREFETCH
) == 0)
575 if (flags
& IORESOURCE_MEM
) {
576 flags
&= ~IORESOURCE_IO
;
578 flags
&= ~IORESOURCE_PREFETCH
;
580 snprintf(name
, sizeof(name
), "%sresource%d", dir
, r
);
581 fd
= open(name
, O_RDWR
);
584 rp
->resource_fd
= fd
;
588 rp
->base_addr
= start
;
590 pci_dev
->v_addrs
[r
].region
= rp
;
591 DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
592 r
, rp
->size
, start
, rp
->type
, rp
->resource_fd
);
597 /* read and fill vendor ID */
598 v
= get_real_vendor_id(dir
, &id
);
602 pci_dev
->dev
.config
[0] = id
& 0xff;
603 pci_dev
->dev
.config
[1] = (id
& 0xff00) >> 8;
605 /* read and fill device ID */
606 v
= get_real_device_id(dir
, &id
);
610 pci_dev
->dev
.config
[2] = id
& 0xff;
611 pci_dev
->dev
.config
[3] = (id
& 0xff00) >> 8;
613 /* dealing with virtual function device */
614 snprintf(name
, sizeof(name
), "%sphysfn/", dir
);
615 if (!stat(name
, &statbuf
)) {
616 /* always provide the written value on readout */
617 assigned_dev_emulate_config_read(pci_dev
, PCI_COMMAND
, 2);
620 dev
->region_number
= r
;
624 static QLIST_HEAD(, AssignedDevice
) devs
= QLIST_HEAD_INITIALIZER(devs
);
626 static void free_dev_irq_entries(AssignedDevice
*dev
)
630 for (i
= 0; i
< dev
->irq_entries_nr
; i
++) {
631 if (dev
->entry
[i
].type
) {
632 kvm_del_routing_entry(&dev
->entry
[i
]);
637 dev
->irq_entries_nr
= 0;
640 static void free_assigned_device(AssignedDevice
*dev
)
644 if (dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSIX
) {
645 assigned_dev_unregister_msix_mmio(dev
);
647 for (i
= 0; i
< dev
->real_device
.region_number
; i
++) {
648 PCIRegion
*pci_region
= &dev
->real_device
.regions
[i
];
649 AssignedDevRegion
*region
= &dev
->v_addrs
[i
];
651 if (!pci_region
->valid
) {
654 if (pci_region
->type
& IORESOURCE_IO
) {
655 if (pci_region
->resource_fd
< 0) {
656 kvm_remove_ioport_region(region
->u
.r_baseport
, region
->r_size
,
657 dev
->dev
.qdev
.hotplugged
);
659 memory_region_del_subregion(®ion
->container
,
660 ®ion
->real_iomem
);
661 memory_region_destroy(®ion
->real_iomem
);
662 memory_region_destroy(®ion
->container
);
663 } else if (pci_region
->type
& IORESOURCE_MEM
) {
664 if (region
->u
.r_virtbase
) {
665 memory_region_del_subregion(®ion
->container
,
666 ®ion
->real_iomem
);
668 /* Remove MSI-X table subregion */
669 if (pci_region
->base_addr
<= dev
->msix_table_addr
&&
670 pci_region
->base_addr
+ pci_region
->size
>
671 dev
->msix_table_addr
) {
672 memory_region_del_subregion(®ion
->container
,
676 memory_region_destroy(®ion
->real_iomem
);
677 memory_region_destroy(®ion
->container
);
678 if (munmap(region
->u
.r_virtbase
,
679 (pci_region
->size
+ 0xFFF) & 0xFFFFF000)) {
681 "Failed to unmap assigned device region: %s\n",
686 if (pci_region
->resource_fd
>= 0) {
687 close(pci_region
->resource_fd
);
691 if (dev
->real_device
.config_fd
>= 0) {
692 close(dev
->real_device
.config_fd
);
695 free_dev_irq_entries(dev
);
698 static uint32_t calc_assigned_dev_id(AssignedDevice
*dev
)
700 return (uint32_t)dev
->h_segnr
<< 16 | (uint32_t)dev
->h_busnr
<< 8 |
701 (uint32_t)dev
->h_devfn
;
704 static void assign_failed_examine(AssignedDevice
*dev
)
706 char name
[PATH_MAX
], dir
[PATH_MAX
], driver
[PATH_MAX
] = {}, *ns
;
707 uint16_t vendor_id
, device_id
;
710 sprintf(dir
, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
711 dev
->host
.seg
, dev
->host
.bus
, dev
->host
.dev
, dev
->host
.func
);
713 sprintf(name
, "%sdriver", dir
);
715 r
= readlink(name
, driver
, sizeof(driver
));
716 if ((r
<= 0) || r
>= sizeof(driver
) || !(ns
= strrchr(driver
, '/'))) {
722 if (get_real_vendor_id(dir
, &vendor_id
) ||
723 get_real_device_id(dir
, &device_id
)) {
727 fprintf(stderr
, "*** The driver '%s' is occupying your device "
728 "%04x:%02x:%02x.%x.\n",
729 ns
, dev
->host
.seg
, dev
->host
.bus
, dev
->host
.dev
, dev
->host
.func
);
730 fprintf(stderr
, "***\n");
731 fprintf(stderr
, "*** You can try the following commands to free it:\n");
732 fprintf(stderr
, "***\n");
733 fprintf(stderr
, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
734 "new_id\n", vendor_id
, device_id
);
735 fprintf(stderr
, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
737 dev
->host
.seg
, dev
->host
.bus
, dev
->host
.dev
, dev
->host
.func
, ns
);
738 fprintf(stderr
, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
740 dev
->host
.seg
, dev
->host
.bus
, dev
->host
.dev
, dev
->host
.func
);
741 fprintf(stderr
, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
742 "/remove_id\n", vendor_id
, device_id
);
743 fprintf(stderr
, "***\n");
748 fprintf(stderr
, "Couldn't find out why.\n");
751 static int assign_device(AssignedDevice
*dev
)
753 struct kvm_assigned_pci_dev assigned_dev_data
;
756 /* Only pass non-zero PCI segment to capable module */
757 if (!kvm_check_extension(kvm_state
, KVM_CAP_PCI_SEGMENT
) &&
759 fprintf(stderr
, "Can't assign device inside non-zero PCI segment "
760 "as this KVM module doesn't support it.\n");
764 memset(&assigned_dev_data
, 0, sizeof(assigned_dev_data
));
765 assigned_dev_data
.assigned_dev_id
= calc_assigned_dev_id(dev
);
766 assigned_dev_data
.segnr
= dev
->h_segnr
;
767 assigned_dev_data
.busnr
= dev
->h_busnr
;
768 assigned_dev_data
.devfn
= dev
->h_devfn
;
770 /* We always enable the IOMMU unless disabled on the command line */
771 if (dev
->features
& ASSIGNED_DEVICE_USE_IOMMU_MASK
) {
772 if (!kvm_check_extension(kvm_state
, KVM_CAP_IOMMU
)) {
773 fprintf(stderr
, "No IOMMU found. Unable to assign device \"%s\"\n",
777 assigned_dev_data
.flags
|= KVM_DEV_ASSIGN_ENABLE_IOMMU
;
779 if (!(dev
->features
& ASSIGNED_DEVICE_USE_IOMMU_MASK
)) {
781 "WARNING: Assigning a device without IOMMU protection can "
782 "cause host memory corruption if the device issues DMA write "
785 if (dev
->features
& ASSIGNED_DEVICE_SHARE_INTX_MASK
&&
786 kvm_has_intx_set_mask()) {
787 assigned_dev_data
.flags
|= KVM_DEV_ASSIGN_PCI_2_3
;
789 /* hide host-side INTx masking from the guest */
790 dev
->emulate_config_read
[PCI_COMMAND
+ 1] |=
791 PCI_COMMAND_INTX_DISABLE
>> 8;
794 r
= kvm_assign_pci_device(kvm_state
, &assigned_dev_data
);
796 fprintf(stderr
, "Failed to assign device \"%s\" : %s\n",
797 dev
->dev
.qdev
.id
, strerror(-r
));
801 assign_failed_examine(dev
);
810 static int assign_irq(AssignedDevice
*dev
)
812 struct kvm_assigned_irq assigned_irq_data
;
815 /* Interrupt PIN 0 means don't use INTx */
816 if (assigned_dev_pci_read_byte(&dev
->dev
, PCI_INTERRUPT_PIN
) == 0)
819 irq
= pci_map_irq(&dev
->dev
, dev
->intpin
);
820 irq
= piix_get_irq(irq
);
822 if (dev
->girq
== irq
)
825 memset(&assigned_irq_data
, 0, sizeof(assigned_irq_data
));
826 assigned_irq_data
.assigned_dev_id
= calc_assigned_dev_id(dev
);
827 assigned_irq_data
.guest_irq
= irq
;
828 assigned_irq_data
.host_irq
= dev
->real_device
.irq
;
829 if (dev
->irq_requested_type
) {
830 assigned_irq_data
.flags
= dev
->irq_requested_type
;
831 r
= kvm_deassign_irq(kvm_state
, &assigned_irq_data
);
833 perror("assign_irq: deassign");
835 dev
->irq_requested_type
= 0;
838 assigned_irq_data
.flags
= KVM_DEV_IRQ_GUEST_INTX
;
839 if (dev
->features
& ASSIGNED_DEVICE_PREFER_MSI_MASK
&&
840 dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSI
)
841 assigned_irq_data
.flags
|= KVM_DEV_IRQ_HOST_MSI
;
843 assigned_irq_data
.flags
|= KVM_DEV_IRQ_HOST_INTX
;
845 r
= kvm_assign_irq(kvm_state
, &assigned_irq_data
);
847 fprintf(stderr
, "Failed to assign irq for \"%s\": %s\n",
848 dev
->dev
.qdev
.id
, strerror(-r
));
849 fprintf(stderr
, "Perhaps you are assigning a device "
850 "that shares an IRQ with another device?\n");
855 dev
->irq_requested_type
= assigned_irq_data
.flags
;
859 static void deassign_device(AssignedDevice
*dev
)
861 struct kvm_assigned_pci_dev assigned_dev_data
;
864 memset(&assigned_dev_data
, 0, sizeof(assigned_dev_data
));
865 assigned_dev_data
.assigned_dev_id
= calc_assigned_dev_id(dev
);
867 r
= kvm_deassign_pci_device(kvm_state
, &assigned_dev_data
);
869 fprintf(stderr
, "Failed to deassign device \"%s\" : %s\n",
870 dev
->dev
.qdev
.id
, strerror(-r
));
874 AssignedDevInfo
*get_assigned_device(int pcibus
, int slot
)
876 AssignedDevice
*assigned_dev
= NULL
;
877 AssignedDevInfo
*adev
= NULL
;
879 QLIST_FOREACH(adev
, &adev_head
, next
) {
880 assigned_dev
= adev
->assigned_dev
;
881 if (pci_bus_num(assigned_dev
->dev
.bus
) == pcibus
&&
882 PCI_SLOT(assigned_dev
->dev
.devfn
) == slot
)
890 /* The pci config space got updated. Check if irq numbers have changed
893 void assigned_dev_update_irqs(void)
895 AssignedDevice
*dev
, *next
;
898 dev
= QLIST_FIRST(&devs
);
900 next
= QLIST_NEXT(dev
, next
);
901 if (dev
->irq_requested_type
& KVM_DEV_IRQ_HOST_INTX
) {
904 qdev_unplug(&dev
->dev
.qdev
);
911 static void assigned_dev_update_msi(PCIDevice
*pci_dev
)
913 struct kvm_assigned_irq assigned_irq_data
;
914 AssignedDevice
*assigned_dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
915 uint8_t ctrl_byte
= pci_get_byte(pci_dev
->config
+ pci_dev
->msi_cap
+
919 memset(&assigned_irq_data
, 0, sizeof assigned_irq_data
);
920 assigned_irq_data
.assigned_dev_id
= calc_assigned_dev_id(assigned_dev
);
922 /* Some guests gratuitously disable MSI even if they're not using it,
923 * try to catch this by only deassigning irqs if the guest is using
924 * MSI or intends to start. */
925 if ((assigned_dev
->irq_requested_type
& KVM_DEV_IRQ_GUEST_MSI
) ||
926 (ctrl_byte
& PCI_MSI_FLAGS_ENABLE
)) {
928 assigned_irq_data
.flags
= assigned_dev
->irq_requested_type
;
929 free_dev_irq_entries(assigned_dev
);
930 r
= kvm_deassign_irq(kvm_state
, &assigned_irq_data
);
931 /* -ENXIO means no assigned irq */
932 if (r
&& r
!= -ENXIO
)
933 perror("assigned_dev_update_msi: deassign irq");
935 assigned_dev
->irq_requested_type
= 0;
938 if (ctrl_byte
& PCI_MSI_FLAGS_ENABLE
) {
939 uint8_t *pos
= pci_dev
->config
+ pci_dev
->msi_cap
;
941 assigned_dev
->entry
= g_malloc0(sizeof(*(assigned_dev
->entry
)));
942 assigned_dev
->entry
->u
.msi
.address_lo
=
943 pci_get_long(pos
+ PCI_MSI_ADDRESS_LO
);
944 assigned_dev
->entry
->u
.msi
.address_hi
= 0;
945 assigned_dev
->entry
->u
.msi
.data
= pci_get_word(pos
+ PCI_MSI_DATA_32
);
946 assigned_dev
->entry
->type
= KVM_IRQ_ROUTING_MSI
;
947 r
= kvm_get_irq_route_gsi();
949 perror("assigned_dev_update_msi: kvm_get_irq_route_gsi");
952 assigned_dev
->entry
->gsi
= r
;
954 kvm_add_routing_entry(kvm_state
, assigned_dev
->entry
);
955 if (kvm_irqchip_commit_routes(kvm_state
) < 0) {
956 perror("assigned_dev_update_msi: kvm_commit_irq_routes");
957 assigned_dev
->cap
.state
&= ~ASSIGNED_DEVICE_MSI_ENABLED
;
960 assigned_dev
->irq_entries_nr
= 1;
962 assigned_irq_data
.guest_irq
= assigned_dev
->entry
->gsi
;
963 assigned_irq_data
.flags
= KVM_DEV_IRQ_HOST_MSI
| KVM_DEV_IRQ_GUEST_MSI
;
964 if (kvm_assign_irq(kvm_state
, &assigned_irq_data
) < 0) {
965 perror("assigned_dev_enable_msi: assign irq");
968 assigned_dev
->girq
= -1;
969 assigned_dev
->irq_requested_type
= assigned_irq_data
.flags
;
971 assign_irq(assigned_dev
);
975 static bool msix_masked(MSIXTableEntry
*entry
)
977 return (entry
->ctrl
& cpu_to_le32(0x1)) != 0;
980 static int assigned_dev_update_msix_mmio(PCIDevice
*pci_dev
)
982 AssignedDevice
*adev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
983 uint16_t entries_nr
= 0;
985 struct kvm_assigned_msix_nr msix_nr
;
986 struct kvm_assigned_msix_entry msix_entry
;
987 MSIXTableEntry
*entry
= adev
->msix_table
;
989 /* Get the usable entry number for allocating */
990 for (i
= 0; i
< adev
->msix_max
; i
++, entry
++) {
991 if (msix_masked(entry
)) {
997 DEBUG("MSI-X entries: %d\n", entries_nr
);
999 /* It's valid to enable MSI-X with all entries masked */
1004 msix_nr
.assigned_dev_id
= calc_assigned_dev_id(adev
);
1005 msix_nr
.entry_nr
= entries_nr
;
1006 r
= kvm_assign_set_msix_nr(kvm_state
, &msix_nr
);
1008 fprintf(stderr
, "fail to set MSI-X entry number for MSIX! %s\n",
1013 free_dev_irq_entries(adev
);
1015 adev
->irq_entries_nr
= adev
->msix_max
;
1016 adev
->entry
= g_malloc0(adev
->msix_max
* sizeof(*(adev
->entry
)));
1018 msix_entry
.assigned_dev_id
= msix_nr
.assigned_dev_id
;
1019 entry
= adev
->msix_table
;
1020 for (i
= 0; i
< adev
->msix_max
; i
++, entry
++) {
1021 if (msix_masked(entry
)) {
1025 r
= kvm_get_irq_route_gsi();
1029 adev
->entry
[i
].gsi
= r
;
1030 adev
->entry
[i
].type
= KVM_IRQ_ROUTING_MSI
;
1031 adev
->entry
[i
].flags
= 0;
1032 adev
->entry
[i
].u
.msi
.address_lo
= entry
->addr_lo
;
1033 adev
->entry
[i
].u
.msi
.address_hi
= entry
->addr_hi
;
1034 adev
->entry
[i
].u
.msi
.data
= entry
->data
;
1036 DEBUG("MSI-X vector %d, gsi %d, addr %08x_%08x, data %08x\n", i
,
1037 r
, entry
->addr_hi
, entry
->addr_lo
, entry
->data
);
1039 kvm_add_routing_entry(kvm_state
, &adev
->entry
[i
]);
1041 msix_entry
.gsi
= adev
->entry
[i
].gsi
;
1042 msix_entry
.entry
= i
;
1043 r
= kvm_assign_set_msix_entry(kvm_state
, &msix_entry
);
1045 fprintf(stderr
, "fail to set MSI-X entry! %s\n", strerror(-r
));
1050 if (r
== 0 && kvm_irqchip_commit_routes(kvm_state
) < 0) {
1051 perror("assigned_dev_update_msix_mmio: kvm_commit_irq_routes");
1058 static void assigned_dev_update_msix(PCIDevice
*pci_dev
)
1060 struct kvm_assigned_irq assigned_irq_data
;
1061 AssignedDevice
*assigned_dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
1062 uint16_t ctrl_word
= pci_get_word(pci_dev
->config
+ pci_dev
->msix_cap
+
1066 memset(&assigned_irq_data
, 0, sizeof assigned_irq_data
);
1067 assigned_irq_data
.assigned_dev_id
= calc_assigned_dev_id(assigned_dev
);
1069 /* Some guests gratuitously disable MSIX even if they're not using it,
1070 * try to catch this by only deassigning irqs if the guest is using
1071 * MSIX or intends to start. */
1072 if ((assigned_dev
->irq_requested_type
& KVM_DEV_IRQ_GUEST_MSIX
) ||
1073 (ctrl_word
& PCI_MSIX_FLAGS_ENABLE
)) {
1075 assigned_irq_data
.flags
= assigned_dev
->irq_requested_type
;
1076 free_dev_irq_entries(assigned_dev
);
1077 r
= kvm_deassign_irq(kvm_state
, &assigned_irq_data
);
1078 /* -ENXIO means no assigned irq */
1079 if (r
&& r
!= -ENXIO
)
1080 perror("assigned_dev_update_msix: deassign irq");
1082 assigned_dev
->irq_requested_type
= 0;
1085 if (ctrl_word
& PCI_MSIX_FLAGS_ENABLE
) {
1086 assigned_irq_data
.flags
= KVM_DEV_IRQ_HOST_MSIX
|
1087 KVM_DEV_IRQ_GUEST_MSIX
;
1089 if (assigned_dev_update_msix_mmio(pci_dev
) < 0) {
1090 perror("assigned_dev_update_msix_mmio");
1094 if (assigned_dev
->irq_entries_nr
) {
1095 if (kvm_assign_irq(kvm_state
, &assigned_irq_data
) < 0) {
1096 perror("assigned_dev_enable_msix: assign irq");
1100 assigned_dev
->girq
= -1;
1101 assigned_dev
->irq_requested_type
= assigned_irq_data
.flags
;
1103 assign_irq(assigned_dev
);
1107 static uint32_t assigned_dev_pci_read_config(PCIDevice
*pci_dev
,
1108 uint32_t address
, int len
)
1110 AssignedDevice
*assigned_dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
1111 uint32_t virt_val
= pci_default_read_config(pci_dev
, address
, len
);
1112 uint32_t real_val
, emulate_mask
, full_emulation_mask
;
1115 memcpy(&emulate_mask
, assigned_dev
->emulate_config_read
+ address
, len
);
1116 emulate_mask
= le32_to_cpu(emulate_mask
);
1118 full_emulation_mask
= 0xffffffff >> (32 - len
* 8);
1120 if (emulate_mask
!= full_emulation_mask
) {
1121 real_val
= assigned_dev_pci_read(pci_dev
, address
, len
);
1122 return (virt_val
& emulate_mask
) | (real_val
& ~emulate_mask
);
1128 static void assigned_dev_pci_write_config(PCIDevice
*pci_dev
, uint32_t address
,
1129 uint32_t val
, int len
)
1131 AssignedDevice
*assigned_dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
1132 uint16_t old_cmd
= pci_get_word(pci_dev
->config
+ PCI_COMMAND
);
1133 uint32_t emulate_mask
, full_emulation_mask
;
1136 pci_default_write_config(pci_dev
, address
, val
, len
);
1138 if (kvm_has_intx_set_mask() &&
1139 range_covers_byte(address
, len
, PCI_COMMAND
+ 1)) {
1140 bool intx_masked
= (pci_get_word(pci_dev
->config
+ PCI_COMMAND
) &
1141 PCI_COMMAND_INTX_DISABLE
);
1143 if (intx_masked
!= !!(old_cmd
& PCI_COMMAND_INTX_DISABLE
)) {
1144 ret
= kvm_device_intx_set_mask(kvm_state
,
1145 calc_assigned_dev_id(assigned_dev
),
1148 perror("assigned_dev_pci_write_config: set intx mask");
1152 if (assigned_dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSI
) {
1153 if (range_covers_byte(address
, len
,
1154 pci_dev
->msi_cap
+ PCI_MSI_FLAGS
)) {
1155 assigned_dev_update_msi(pci_dev
);
1158 if (assigned_dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSIX
) {
1159 if (range_covers_byte(address
, len
,
1160 pci_dev
->msix_cap
+ PCI_MSIX_FLAGS
+ 1)) {
1161 assigned_dev_update_msix(pci_dev
);
1166 memcpy(&emulate_mask
, assigned_dev
->emulate_config_write
+ address
, len
);
1167 emulate_mask
= le32_to_cpu(emulate_mask
);
1169 full_emulation_mask
= 0xffffffff >> (32 - len
* 8);
1171 if (emulate_mask
!= full_emulation_mask
) {
1173 val
&= ~emulate_mask
;
1174 val
|= assigned_dev_pci_read(pci_dev
, address
, len
) & emulate_mask
;
1176 assigned_dev_pci_write(pci_dev
, address
, val
, len
);
1180 static void assigned_dev_setup_cap_read(AssignedDevice
*dev
, uint32_t offset
,
1183 assigned_dev_direct_config_read(dev
, offset
, len
);
1184 assigned_dev_emulate_config_read(dev
, offset
+ PCI_CAP_LIST_NEXT
, 1);
/*
 * Build the virtual capability list exposed to the guest.
 *
 * The hardware capability pointer/status copied from the real device is
 * cleared first; each capability we know how to virtualize (MSI, MSI-X,
 * PM, PCIe, PCI-X, VPD, vendor-specific) is then re-added via
 * pci_add_capability() with appropriate read/write virtualization.
 *
 * Returns 0 on success or a negative errno-style value on failure.
 */
static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
{
    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
    PCIRegion *pci_region = dev->real_device.regions;
    int ret, pos;

    /* Clear initial capabilities pointer and status copied from hw */
    pci_set_byte(pci_dev->config + PCI_CAPABILITY_LIST, 0);
    pci_set_word(pci_dev->config + PCI_STATUS,
                 pci_get_word(pci_dev->config + PCI_STATUS) &
                 ~PCI_STATUS_CAP_LIST);

    /* Expose MSI capability
     * MSI capability is the 1st capability in capability config */
    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSI, 0);
    if (pos != 0 && kvm_check_extension(kvm_state, KVM_CAP_ASSIGN_DEV_IRQ)) {
        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSI;
        /* Only 32-bit/no-mask currently supported (10-byte structure) */
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSI, pos, 10)) < 0) {
            return ret;
        }

        pci_dev->msi_cap = pos;

        /* Keep only the multiple-message-capable field of the real flags;
         * address/data start out zeroed. */
        pci_set_word(pci_dev->config + pos + PCI_MSI_FLAGS,
                     pci_get_word(pci_dev->config + pos + PCI_MSI_FLAGS) &
                     PCI_MSI_FLAGS_QMASK);
        pci_set_long(pci_dev->config + pos + PCI_MSI_ADDRESS_LO, 0);
        pci_set_word(pci_dev->config + pos + PCI_MSI_DATA_32, 0);

        /* Set writable fields */
        pci_set_word(pci_dev->wmask + pos + PCI_MSI_FLAGS,
                     PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
        pci_set_long(pci_dev->wmask + pos + PCI_MSI_ADDRESS_LO, 0xfffffffc);
        pci_set_word(pci_dev->wmask + pos + PCI_MSI_DATA_32, 0xffff);
    }

    /* Expose MSI-X capability */
    pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSIX, 0);
    /* Would really like to test kvm_check_extension(, KVM_CAP_DEVICE_MSIX),
     * but the kernel doesn't expose it.  Instead do a dummy call to
     * KVM_ASSIGN_SET_MSIX_NR to see if it exists. */
    if (pos != 0 && kvm_assign_set_msix_nr(kvm_state, NULL) == -EFAULT) {
        int bar_nr;
        uint32_t msix_table_entry;

        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSIX;
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSIX, pos, 12)) < 0) {
            return ret;
        }

        pci_dev->msix_cap = pos;

        /* Preserve only the table-size field of the real flags word. */
        pci_set_word(pci_dev->config + pos + PCI_MSIX_FLAGS,
                     pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS) &
                     PCI_MSIX_FLAGS_QSIZE);

        /* Only enable and function mask bits are writable */
        pci_set_word(pci_dev->wmask + pos + PCI_MSIX_FLAGS,
                     PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);

        /* Locate the MSI-X table in the real device: BIR selects the BAR,
         * the remaining bits are the offset within that BAR. */
        msix_table_entry = pci_get_long(pci_dev->config + pos + PCI_MSIX_TABLE);
        bar_nr = msix_table_entry & PCI_MSIX_FLAGS_BIRMASK;
        msix_table_entry &= ~PCI_MSIX_FLAGS_BIRMASK;
        dev->msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry;
        /* QSIZE encodes (table size - 1) */
        dev->msix_max = pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS);
        dev->msix_max &= PCI_MSIX_FLAGS_QSIZE;
        dev->msix_max += 1;
    }

    /* Minimal PM support, nothing writable, device appears to NAK changes */
    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM, 0))) {
        uint16_t pmc;

        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, pos,
                                      PCI_PM_SIZEOF)) < 0) {
            return ret;
        }

        assigned_dev_setup_cap_read(dev, pos, PCI_PM_SIZEOF);

        /* Expose only version and DSI bits of the PM capabilities word. */
        pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
        pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
        pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);

        /* assign_device will bring the device up to D0, so we don't need
         * to worry about doing that ourselves here. */
        pci_set_word(pci_dev->config + pos + PCI_PM_CTRL,
                     PCI_PM_CTRL_NO_SOFT_RESET);

        pci_set_byte(pci_dev->config + pos + PCI_PM_PPB_EXTENSIONS, 0);
        pci_set_byte(pci_dev->config + pos + PCI_PM_DATA_REGISTER, 0);
    }

    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP, 0))) {
        uint8_t version, size = 0;
        uint16_t type, devctl, lnksta;
        uint32_t devcap, lnkcap;

        version = pci_get_byte(pci_dev->config + pos + PCI_EXP_FLAGS);
        version &= PCI_EXP_FLAGS_VERS;
        if (version == 1) {
            size = 0x14;
        } else if (version == 2) {
            /*
             * Check for non-std size, accept reduced size to 0x34,
             * which is what bcm5761 implemented, violating the
             * PCIe v3.0 spec that regs should exist and be read as 0,
             * not optionally provided and shorten the struct size.
             */
            size = MIN(0x3c, PCI_CONFIG_SPACE_SIZE - pos);
            if (size < 0x34) {
                fprintf(stderr,
                        "%s: Invalid size PCIe cap-id 0x%x \n",
                        __func__, PCI_CAP_ID_EXP);
                return -EINVAL;
            } else if (size != 0x3c) {
                fprintf(stderr,
                        "WARNING, %s: PCIe cap-id 0x%x has "
                        "non-standard size 0x%x; std size should be 0x3c \n",
                        __func__, PCI_CAP_ID_EXP, size);
            }
        } else if (version == 0) {
            uint16_t vid, did;
            vid = pci_get_word(pci_dev->config + PCI_VENDOR_ID);
            did = pci_get_word(pci_dev->config + PCI_DEVICE_ID);
            if (vid == PCI_VENDOR_ID_INTEL && did == 0x10ed) {
                /*
                 * quirk for Intel 82599 VF with invalid PCIe capability
                 * version, should really be version 2 (same as PF)
                 */
                size = 0x3c;
            }
        }

        if (size == 0) {
            fprintf(stderr,
                    "%s: Unsupported PCI express capability version %d\n",
                    __func__, version);
            return -EINVAL;
        }

        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_EXP,
                                      pos, size)) < 0) {
            return ret;
        }

        assigned_dev_setup_cap_read(dev, pos, size);

        /* Only endpoint-type devices can be assigned. */
        type = pci_get_word(pci_dev->config + pos + PCI_EXP_FLAGS);
        type = (type & PCI_EXP_FLAGS_TYPE) >> 4;
        if (type != PCI_EXP_TYPE_ENDPOINT &&
            type != PCI_EXP_TYPE_LEG_END && type != PCI_EXP_TYPE_RC_END) {
            fprintf(stderr,
                    "Device assignment only supports endpoint assignment, "
                    "device type %d\n", type);
            return -EINVAL;
        }

        /* capabilities, pass existing read-only copy
         * PCI_EXP_FLAGS_IRQ: updated by hardware, should be direct read */

        /* device capabilities: hide FLR */
        devcap = pci_get_long(pci_dev->config + pos + PCI_EXP_DEVCAP);
        devcap &= ~PCI_EXP_DEVCAP_FLR;
        pci_set_long(pci_dev->config + pos + PCI_EXP_DEVCAP, devcap);

        /* device control: clear all error reporting enable bits, leaving
         *                 only a few host values.  Note, these are
         *                 all writable, but not passed to hw.
         */
        devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL);
        devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) |
                 PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN;
        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl);
        devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME;
        pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl);

        /* Clear device status */
        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVSTA, 0);

        /* Link capabilities, expose links and latencues, clear reporting */
        lnkcap = pci_get_long(pci_dev->config + pos + PCI_EXP_LNKCAP);
        lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW |
                   PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL |
                   PCI_EXP_LNKCAP_L1EL);
        pci_set_long(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap);

        /* Link control, pass existing read-only copy.  Should be writable? */

        /* Link status, only expose current speed and width */
        lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA);
        lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta);

        if (version >= 2) {
            /* Slot capabilities, control, status - not needed for endpoints */
            pci_set_long(pci_dev->config + pos + PCI_EXP_SLTCAP, 0);
            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTCTL, 0);
            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTSTA, 0);

            /* Root control, capabilities, status - not needed for endpoints */
            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCTL, 0);
            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCAP, 0);
            pci_set_long(pci_dev->config + pos + PCI_EXP_RTSTA, 0);

            /* Device capabilities/control 2, pass existing read-only copy */
            /* Link control 2, pass existing read-only copy */
        }
    }

    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX, 0))) {
        uint16_t cmd;
        uint32_t status;

        /* Only expose the minimum, 8 byte capability */
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_PCIX, pos, 8)) < 0) {
            return ret;
        }

        assigned_dev_setup_cap_read(dev, pos, 8);

        /* Command register, clear upper bits, including extended modes */
        cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
        cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
                PCI_X_CMD_MAX_SPLIT);
        pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd);

        /* Status register, update with emulated PCI bus location, clear
         * error bits, leave the rest. */
        status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS);
        status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN);
        status |= (pci_bus_num(pci_dev->bus) << 8) | pci_dev->devfn;
        status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL |
                    PCI_X_STATUS_SPL_ERR);
        pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status);
    }

    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD, 0))) {
        /* Direct R/W passthrough */
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_VPD, pos, 8)) < 0) {
            return ret;
        }

        assigned_dev_setup_cap_read(dev, pos, 8);

        /* direct write for cap content */
        assigned_dev_direct_config_write(dev, pos + 2, 6);
    }

    /* Devices can have multiple vendor capabilities, get them all */
    for (pos = 0; (pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, pos));
         pos += PCI_CAP_LIST_NEXT) {
        uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS);
        /* Direct R/W passthrough */
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_VNDR,
                                      pos, len)) < 0) {
            return ret;
        }

        assigned_dev_setup_cap_read(dev, pos, len);

        /* direct write for cap content */
        assigned_dev_direct_config_write(dev, pos + 2, len - 2);
    }

    /* If real and virtual capability list status bits differ, virtualize the
     * access. */
    if ((pci_get_word(pci_dev->config + PCI_STATUS) & PCI_STATUS_CAP_LIST) !=
        (assigned_dev_pci_read_byte(pci_dev, PCI_STATUS) &
         PCI_STATUS_CAP_LIST)) {
        dev->emulate_config_read[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
    }

    return 0;
}
1460 static uint64_t msix_mmio_read(void *opaque
, target_phys_addr_t addr
,
1463 AssignedDevice
*adev
= opaque
;
1466 memcpy(&val
, (void *)((uint8_t *)adev
->msix_table
+ addr
), size
);
/*
 * Write handler for the virtualized MSI-X table page.
 *
 * The guest's write is applied to the shadow table; if MSI-X is enabled,
 * mask-bit transitions are detected by comparing the entry before and
 * after the write so that the KVM irq routing can be updated.
 */
static void msix_mmio_write(void *opaque, target_phys_addr_t addr,
                            uint64_t val, unsigned size)
{
    AssignedDevice *adev = opaque;
    PCIDevice *pdev = &adev->dev;
    uint16_t ctrl;
    MSIXTableEntry orig;
    int i = addr >> 4;   /* each MSI-X table entry is 16 bytes */

    if (i >= adev->msix_max) {
        return; /* Drop write */
    }

    ctrl = pci_get_word(pdev->config + pdev->msix_cap + PCI_MSIX_FLAGS);

    DEBUG("write to MSI-X table offset 0x%lx, val 0x%lx\n", addr, val);

    /* Snapshot the entry so mask-bit transitions can be detected below. */
    if (ctrl & PCI_MSIX_FLAGS_ENABLE) {
        orig = adev->msix_table[i];
    }

    memcpy((void *)((uint8_t *)adev->msix_table + addr), &val, size);

    if (ctrl & PCI_MSIX_FLAGS_ENABLE) {
        MSIXTableEntry *entry = &adev->msix_table[i];

        if (!msix_masked(&orig) && msix_masked(entry)) {
            /*
             * Vector masked, disable it
             *
             * XXX It's not clear if we can or should actually attempt
             * to mask or disable the interrupt.  KVM doesn't have
             * support for pending bits and kvm_assign_set_msix_entry
             * doesn't modify the device hardware mask.  Interrupts
             * while masked are simply not injected to the guest, so
             * are lost.  Can we get away with always injecting an
             * interrupt on unmask?
             */
        } else if (msix_masked(&orig) && !msix_masked(entry)) {
            /* Vector unmasked */
            if (i >= adev->irq_entries_nr || !adev->entry[i].type) {
                /* Previously unassigned vector, start from scratch */
                assigned_dev_update_msix(pdev);
                return;
            } else {
                /* Update an existing, previously masked vector */
                struct kvm_irq_routing_entry orig = adev->entry[i];
                int ret;

                adev->entry[i].u.msi.address_lo = entry->addr_lo;
                adev->entry[i].u.msi.address_hi = entry->addr_hi;
                adev->entry[i].u.msi.data = entry->data;

                ret = kvm_update_routing_entry(&orig, &adev->entry[i]);
                if (ret) {
                    fprintf(stderr,
                            "Error updating irq routing entry (%d)\n", ret);
                    return;
                }

                ret = kvm_irqchip_commit_routes(kvm_state);
                if (ret) {
                    fprintf(stderr,
                            "Error committing irq routes (%d)\n", ret);
                }
            }
        }
    }
}
/* Memory-region callbacks for the trapped MSI-X table page; accesses are
 * restricted to naturally aligned 4/8-byte operations both as seen by the
 * guest (.valid) and as dispatched to the handlers (.impl). */
static const MemoryRegionOps msix_mmio_ops = {
    .read = msix_mmio_read,
    .write = msix_mmio_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 8,
    },
    .impl = {
        .min_access_size = 4,
        .max_access_size = 8,
    },
};
1556 static void msix_reset(AssignedDevice
*dev
)
1558 MSIXTableEntry
*entry
;
1561 if (!dev
->msix_table
) {
1565 memset(dev
->msix_table
, 0, MSIX_PAGE_SIZE
);
1567 for (i
= 0, entry
= dev
->msix_table
; i
< dev
->msix_max
; i
++, entry
++) {
1568 entry
->ctrl
= cpu_to_le32(0x1); /* Masked */
1572 static int assigned_dev_register_msix_mmio(AssignedDevice
*dev
)
1574 dev
->msix_table
= mmap(NULL
, MSIX_PAGE_SIZE
, PROT_READ
|PROT_WRITE
,
1575 MAP_ANONYMOUS
|MAP_PRIVATE
, 0, 0);
1576 if (dev
->msix_table
== MAP_FAILED
) {
1577 fprintf(stderr
, "fail allocate msix_table! %s\n", strerror(errno
));
1583 memory_region_init_io(&dev
->mmio
, &msix_mmio_ops
, dev
,
1584 "assigned-dev-msix", MSIX_PAGE_SIZE
);
1588 static void assigned_dev_unregister_msix_mmio(AssignedDevice
*dev
)
1590 if (!dev
->msix_table
) {
1594 memory_region_destroy(&dev
->mmio
);
1596 if (munmap(dev
->msix_table
, MSIX_PAGE_SIZE
) == -1) {
1597 fprintf(stderr
, "error unmapping msix_table! %s\n",
1600 dev
->msix_table
= NULL
;
/* Migration description: an assigned host device cannot be migrated, so
 * the vmstate only carries the name and the unmigratable marker. */
static const VMStateDescription vmstate_assigned_device = {
    .name = "pci-assign",
    .unmigratable = 1,
};
/*
 * qdev reset handler: ask the host kernel to reset the physical device
 * through pci-sysfs, then logically disconnect it from the bus by
 * clearing the guest-visible command register.
 */
static void reset_assigned_device(DeviceState *dev)
{
    PCIDevice *pci_dev = DO_UPCAST(PCIDevice, qdev, dev);
    AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev);
    char reset_file[64];
    const char reset[] = "1";
    int fd, ret;

    snprintf(reset_file, sizeof(reset_file),
             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/reset",
             adev->host.seg, adev->host.bus, adev->host.dev, adev->host.func);

    /*
     * Issue a device reset via pci-sysfs.  Note that we use write(2) here
     * and ignore the return value because some kernels have a bug that
     * returns 0 rather than bytes written on success, sending us into an
     * infinite retry loop using other write mechanisms.
     */
    fd = open(reset_file, O_WRONLY);
    if (fd != -1) {
        ret = write(fd, reset, strlen(reset));
        (void)ret;
        close(fd);
    }

    /*
     * When a 0 is written to the command register, the device is logically
     * disconnected from the PCI bus. This avoids further DMA transfers.
     */
    assigned_dev_pci_write_config(pci_dev, PCI_COMMAND, 0, 2);
}
/*
 * qdev init callback for "pci-assign": validate the configuration, probe
 * the real host device, virtualize its capability list, set up MMIO/PIO
 * BAR passthrough and MSI-X interception, and finally assign the device
 * and its interrupt to the guest via KVM.
 *
 * Returns 0 on success, -1 on failure (resources acquired so far are
 * released through the goto-cleanup labels at the bottom).
 */
static int assigned_initfn(struct PCIDevice *pci_dev)
{
    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
    uint8_t e_intx;
    int r;

    if (!kvm_enabled()) {
        error_report("pci-assign: error: requires KVM support");
        return -1;
    }

    /* An all-zero host address means the mandatory "host" property was
     * never set. */
    if (!dev->host.seg && !dev->host.bus && !dev->host.dev && !dev->host.func) {
        error_report("pci-assign: error: no host device specified");
        return -1;
    }

    /*
     * Set up basic config space access control. Will be further refined during
     * device initialization.
     */
    assigned_dev_emulate_config_read(dev, 0, PCI_CONFIG_SPACE_SIZE);
    assigned_dev_direct_config_read(dev, PCI_COMMAND, 2);
    assigned_dev_direct_config_read(dev, PCI_STATUS, 2);
    assigned_dev_direct_config_read(dev, PCI_REVISION_ID, 1);
    assigned_dev_direct_config_read(dev, PCI_CLASS_PROG, 3);
    assigned_dev_direct_config_read(dev, PCI_CACHE_LINE_SIZE, 1);
    assigned_dev_direct_config_read(dev, PCI_LATENCY_TIMER, 1);
    assigned_dev_direct_config_read(dev, PCI_BIST, 1);
    assigned_dev_direct_config_read(dev, PCI_CARDBUS_CIS, 4);
    assigned_dev_direct_config_read(dev, PCI_SUBSYSTEM_VENDOR_ID, 2);
    assigned_dev_direct_config_read(dev, PCI_SUBSYSTEM_ID, 2);
    assigned_dev_direct_config_read(dev, PCI_CAPABILITY_LIST + 1, 7);
    assigned_dev_direct_config_read(dev, PCI_MIN_GNT, 1);
    assigned_dev_direct_config_read(dev, PCI_MAX_LAT, 1);
    /* Writes start with the same direct/emulated split as reads. */
    memcpy(dev->emulate_config_write, dev->emulate_config_read,
           sizeof(dev->emulate_config_read));

    if (get_real_device(dev, dev->host.seg, dev->host.bus,
                        dev->host.dev, dev->host.func)) {
        error_report("pci-assign: Error: Couldn't get real device (%s)!",
                     dev->dev.qdev.id);
        goto out;
    }

    if (assigned_device_pci_cap_init(pci_dev) < 0) {
        goto out;
    }

    /* intercept MSI-X entry page in the MMIO */
    if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
        if (assigned_dev_register_msix_mmio(dev)) {
            goto out;
        }
    }

    /* handle real device's MMIO/PIO BARs */
    if (assigned_dev_register_regions(dev->real_device.regions,
                                      dev->real_device.region_number,
                                      dev)) {
        goto out;
    }

    /* handle interrupt routing: interrupt pin register is 1-based */
    e_intx = dev->dev.config[0x3d] - 1;
    dev->intpin = e_intx;
    dev->run = 0;
    dev->girq = -1;
    dev->h_segnr = dev->host.seg;
    dev->h_busnr = dev->host.bus;
    dev->h_devfn = PCI_DEVFN(dev->host.dev, dev->host.func);

    /* assign device to guest */
    r = assign_device(dev);
    if (r < 0) {
        goto out;
    }

    /* assign irq for the device */
    r = assign_irq(dev);
    if (r < 0) {
        goto assigned_out;
    }

    assigned_dev_load_option_rom(dev);
    QLIST_INSERT_HEAD(&devs, dev, next);

    add_boot_device_path(dev->bootindex, &pci_dev->qdev, NULL);

    return 0;

assigned_out:
    deassign_device(dev);
out:
    free_assigned_device(dev);
    return -1;
}
1734 static int assigned_exitfn(struct PCIDevice
*pci_dev
)
1736 AssignedDevice
*dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
1738 QLIST_REMOVE(dev
, next
);
1739 deassign_device(dev
);
1740 free_assigned_device(dev
);
1744 static int parse_hostaddr(DeviceState
*dev
, Property
*prop
, const char *str
)
1746 PCIHostDevice
*ptr
= qdev_get_prop_ptr(dev
, prop
);
1749 rc
= pci_parse_host_devaddr(str
, &ptr
->seg
, &ptr
->bus
, &ptr
->dev
, &ptr
->func
);
1755 static int print_hostaddr(DeviceState
*dev
, Property
*prop
, char *dest
, size_t len
)
1757 PCIHostDevice
*ptr
= qdev_get_prop_ptr(dev
, prop
);
1759 return snprintf(dest
, len
, "%02x:%02x.%x", ptr
->bus
, ptr
->dev
, ptr
->func
);
/* qdev property type for host PCI addresses, wiring the parse/print
 * helpers above into the property framework. */
PropertyInfo qdev_prop_hostaddr = {
    .name  = "pci-hostaddr",
    .parse = parse_hostaddr,
    .print = print_hostaddr,
};
/* User-configurable properties of the "pci-assign" device:
 *  - host:       host PCI address of the device to assign (mandatory)
 *  - iommu:      use the IOMMU (default on)
 *  - prefer_msi: prefer MSI over INTx when available (default on)
 *  - share_intx: allow sharing the host INTx line (default off)
 *  - bootindex:  boot ordering (-1 = not bootable)
 *  - configfd:   name of a pre-opened config-space file descriptor
 */
static Property da_properties[] =
{
    DEFINE_PROP("host", AssignedDevice, host, qdev_prop_hostaddr, PCIHostDevice),
    DEFINE_PROP_BIT("iommu", AssignedDevice, features,
                    ASSIGNED_DEVICE_USE_IOMMU_BIT, true),
    DEFINE_PROP_BIT("prefer_msi", AssignedDevice, features,
                    ASSIGNED_DEVICE_PREFER_MSI_BIT, true),
    DEFINE_PROP_BIT("share_intx", AssignedDevice, features,
                    ASSIGNED_DEVICE_SHARE_INTX_BIT, false),
    DEFINE_PROP_INT32("bootindex", AssignedDevice, bootindex, -1),
    DEFINE_PROP_STRING("configfd", AssignedDevice, configfd_name),
    DEFINE_PROP_END_OF_LIST(),
};
1782 static void assign_class_init(ObjectClass
*klass
, void *data
)
1784 PCIDeviceClass
*k
= PCI_DEVICE_CLASS(klass
);
1785 DeviceClass
*dc
= DEVICE_CLASS(klass
);
1787 k
->init
= assigned_initfn
;
1788 k
->exit
= assigned_exitfn
;
1789 k
->config_read
= assigned_dev_pci_read_config
;
1790 k
->config_write
= assigned_dev_pci_write_config
;
1791 dc
->props
= da_properties
;
1792 dc
->vmsd
= &vmstate_assigned_device
;
1793 dc
->reset
= reset_assigned_device
;
/* QOM type registration data for the "pci-assign" device. */
static TypeInfo assign_info = {
    .name               = "pci-assign",
    .parent             = TYPE_PCI_DEVICE,
    .instance_size      = sizeof(AssignedDevice),
    .class_init         = assign_class_init,
};
/* Register the "pci-assign" type with QOM at module-init time. */
static void assign_register_types(void)
{
    type_register_static(&assign_info);
}

type_init(assign_register_types)
1811 * Scan the assigned devices for the devices that have an option ROM, and then
1812 * load the corresponding ROM data to RAM. If an error occurs while loading an
1813 * option ROM, we just ignore that option ROM and continue with the next one.
1815 static void assigned_dev_load_option_rom(AssignedDevice
*dev
)
1817 char name
[32], rom_file
[64];
1823 /* If loading ROM from file, pci handles it */
1824 if (dev
->dev
.romfile
|| !dev
->dev
.rom_bar
)
1827 snprintf(rom_file
, sizeof(rom_file
),
1828 "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/rom",
1829 dev
->host
.seg
, dev
->host
.bus
, dev
->host
.dev
, dev
->host
.func
);
1831 if (stat(rom_file
, &st
)) {
1835 if (access(rom_file
, F_OK
)) {
1836 fprintf(stderr
, "pci-assign: Insufficient privileges for %s\n",
1841 /* Write "1" to the ROM file to enable it */
1842 fp
= fopen(rom_file
, "r+");
1847 if (fwrite(&val
, 1, 1, fp
) != 1) {
1850 fseek(fp
, 0, SEEK_SET
);
1852 snprintf(name
, sizeof(name
), "%s.rom",
1853 object_get_typename(OBJECT(dev
)));
1854 memory_region_init_ram(&dev
->dev
.rom
, name
, st
.st_size
);
1855 vmstate_register_ram(&dev
->dev
.rom
, &dev
->dev
.qdev
);
1856 ptr
= memory_region_get_ram_ptr(&dev
->dev
.rom
);
1857 memset(ptr
, 0xff, st
.st_size
);
1859 if (!fread(ptr
, 1, st
.st_size
, fp
)) {
1860 fprintf(stderr
, "pci-assign: Cannot read from host %s\n"
1861 "\tDevice option ROM contents are probably invalid "
1862 "(check dmesg).\n\tSkip option ROM probe with rombar=0, "
1863 "or load from file with romfile=\n", rom_file
);
1864 memory_region_destroy(&dev
->dev
.rom
);
1868 pci_register_bar(&dev
->dev
, PCI_ROM_SLOT
, 0, &dev
->dev
.rom
);
1869 dev
->dev
.has_rom
= true;
1871 /* Write "0" to disable ROM */
1872 fseek(fp
, 0, SEEK_SET
);
1874 if (!fwrite(&val
, 1, 1, fp
)) {
1875 DEBUG("%s\n", "Failed to disable pci-sysfs rom file");