2 * Copyright (c) 2007, Neocleus Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
18 * Assign a PCI device from the host to a guest VM.
20 * Adapted for KVM by Qumranet.
22 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
23 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
24 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
25 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
26 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
31 #include <sys/types.h>
36 #include "qemu-error.h"
38 #include "device-assignment.h"
41 #include <pci/header.h>
43 /* From linux/ioport.h */
44 #define IORESOURCE_IO 0x00000100 /* Resource type */
45 #define IORESOURCE_MEM 0x00000200
46 #define IORESOURCE_IRQ 0x00000400
47 #define IORESOURCE_DMA 0x00000800
48 #define IORESOURCE_PREFETCH 0x00001000 /* No side effects */
50 /* #define DEVICE_ASSIGNMENT_DEBUG 1 */
52 #ifdef DEVICE_ASSIGNMENT_DEBUG
53 #define DEBUG(fmt, ...) \
55 fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__); \
58 #define DEBUG(fmt, ...) do { } while(0)
61 static void assigned_dev_load_option_rom(AssignedDevice
*dev
);
63 static void assigned_dev_unregister_msix_mmio(AssignedDevice
*dev
);
65 static uint32_t assigned_dev_ioport_rw(AssignedDevRegion
*dev_region
,
66 uint32_t addr
, int len
, uint32_t *val
)
69 uint32_t offset
= addr
- dev_region
->e_physbase
;
70 int fd
= dev_region
->region
->resource_fd
;
74 DEBUG("pwrite val=%x, len=%d, e_phys=%x, offset=%x\n",
75 *val
, len
, addr
, offset
);
76 if (pwrite(fd
, val
, len
, offset
) != len
) {
77 fprintf(stderr
, "%s - pwrite failed %s\n",
78 __func__
, strerror(errno
));
81 if (pread(fd
, &ret
, len
, offset
) != len
) {
82 fprintf(stderr
, "%s - pread failed %s\n",
83 __func__
, strerror(errno
));
84 ret
= (1UL << (len
* 8)) - 1;
86 DEBUG("pread ret=%x, len=%d, e_phys=%x, offset=%x\n",
87 ret
, len
, addr
, offset
);
90 uint32_t port
= offset
+ dev_region
->u
.r_baseport
;
93 DEBUG("out val=%x, len=%d, e_phys=%x, host=%x\n",
94 *val
, len
, addr
, port
);
118 DEBUG("in val=%x, len=%d, e_phys=%x, host=%x\n",
119 ret
, len
, addr
, port
);
125 static void assigned_dev_ioport_writeb(void *opaque
, uint32_t addr
,
128 assigned_dev_ioport_rw(opaque
, addr
, 1, &value
);
132 static void assigned_dev_ioport_writew(void *opaque
, uint32_t addr
,
135 assigned_dev_ioport_rw(opaque
, addr
, 2, &value
);
139 static void assigned_dev_ioport_writel(void *opaque
, uint32_t addr
,
142 assigned_dev_ioport_rw(opaque
, addr
, 4, &value
);
/*
 * Byte-wide guest I/O port read handler.
 *
 * opaque: handler cookie, forwarded untouched.  This handler is registered
 *         with a pointer into the device's v_addrs[] array, i.e. an
 *         AssignedDevRegion.
 * addr:   guest port address being accessed.
 *
 * Forwards to assigned_dev_ioport_rw() with a length of 1 and a NULL value
 * pointer, and returns that function's result unchanged.
 * NOTE(review): a NULL value pointer appears to select the read path in
 * assigned_dev_ioport_rw() -- confirm against that function.
 */
static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
{
    return assigned_dev_ioport_rw(opaque, addr, 1, NULL);
}
/*
 * Word-wide (2-byte) guest I/O port read handler.
 *
 * opaque: handler cookie, forwarded untouched.  This handler is registered
 *         with a pointer into the device's v_addrs[] array, i.e. an
 *         AssignedDevRegion.
 * addr:   guest port address being accessed.
 *
 * Forwards to assigned_dev_ioport_rw() with a length of 2 and a NULL value
 * pointer, and returns that function's result unchanged.
 * NOTE(review): a NULL value pointer appears to select the read path in
 * assigned_dev_ioport_rw() -- confirm against that function.
 */
static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr)
{
    return assigned_dev_ioport_rw(opaque, addr, 2, NULL);
}
/*
 * Long-wide (4-byte) guest I/O port read handler.
 *
 * opaque: handler cookie, forwarded untouched.  This handler is registered
 *         with a pointer into the device's v_addrs[] array, i.e. an
 *         AssignedDevRegion.
 * addr:   guest port address being accessed.
 *
 * Forwards to assigned_dev_ioport_rw() with a length of 4 and a NULL value
 * pointer, and returns that function's result unchanged.
 * NOTE(review): a NULL value pointer appears to select the read path in
 * assigned_dev_ioport_rw() -- confirm against that function.
 */
static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
{
    return assigned_dev_ioport_rw(opaque, addr, 4, NULL);
}
161 static uint32_t slow_bar_readb(void *opaque
, target_phys_addr_t addr
)
163 AssignedDevRegion
*d
= opaque
;
164 uint8_t *in
= d
->u
.r_virtbase
+ addr
;
168 DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, r
);
173 static uint32_t slow_bar_readw(void *opaque
, target_phys_addr_t addr
)
175 AssignedDevRegion
*d
= opaque
;
176 uint16_t *in
= d
->u
.r_virtbase
+ addr
;
180 DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, r
);
185 static uint32_t slow_bar_readl(void *opaque
, target_phys_addr_t addr
)
187 AssignedDevRegion
*d
= opaque
;
188 uint32_t *in
= d
->u
.r_virtbase
+ addr
;
192 DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, r
);
197 static void slow_bar_writeb(void *opaque
, target_phys_addr_t addr
, uint32_t val
)
199 AssignedDevRegion
*d
= opaque
;
200 uint8_t *out
= d
->u
.r_virtbase
+ addr
;
202 DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx
" val=0x%02x\n", addr
, val
);
206 static void slow_bar_writew(void *opaque
, target_phys_addr_t addr
, uint32_t val
)
208 AssignedDevRegion
*d
= opaque
;
209 uint16_t *out
= d
->u
.r_virtbase
+ addr
;
211 DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx
" val=0x%04x\n", addr
, val
);
215 static void slow_bar_writel(void *opaque
, target_phys_addr_t addr
, uint32_t val
)
217 AssignedDevRegion
*d
= opaque
;
218 uint32_t *out
= d
->u
.r_virtbase
+ addr
;
220 DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, val
);
224 static CPUWriteMemoryFunc
* const slow_bar_write
[] = {
230 static CPUReadMemoryFunc
* const slow_bar_read
[] = {
236 static void assigned_dev_iomem_map_slow(PCIDevice
*pci_dev
, int region_num
,
237 pcibus_t e_phys
, pcibus_t e_size
,
240 AssignedDevice
*r_dev
= container_of(pci_dev
, AssignedDevice
, dev
);
241 AssignedDevRegion
*region
= &r_dev
->v_addrs
[region_num
];
242 PCIRegion
*real_region
= &r_dev
->real_device
.regions
[region_num
];
245 DEBUG("%s", "slow map\n");
246 m
= cpu_register_io_memory(slow_bar_read
, slow_bar_write
, region
);
247 cpu_register_physical_memory(e_phys
, e_size
, m
);
249 /* MSI-X MMIO page */
251 real_region
->base_addr
<= r_dev
->msix_table_addr
&&
252 real_region
->base_addr
+ real_region
->size
>= r_dev
->msix_table_addr
) {
253 int offset
= r_dev
->msix_table_addr
- real_region
->base_addr
;
255 cpu_register_physical_memory(e_phys
+ offset
,
256 TARGET_PAGE_SIZE
, r_dev
->mmio_index
);
260 static void assigned_dev_iomem_map(PCIDevice
*pci_dev
, int region_num
,
261 pcibus_t e_phys
, pcibus_t e_size
, int type
)
263 AssignedDevice
*r_dev
= container_of(pci_dev
, AssignedDevice
, dev
);
264 AssignedDevRegion
*region
= &r_dev
->v_addrs
[region_num
];
265 PCIRegion
*real_region
= &r_dev
->real_device
.regions
[region_num
];
268 DEBUG("e_phys=%08" FMT_PCIBUS
" r_virt=%p type=%d len=%08" FMT_PCIBUS
" region_num=%d \n",
269 e_phys
, region
->u
.r_virtbase
, type
, e_size
, region_num
);
271 region
->e_physbase
= e_phys
;
272 region
->e_size
= e_size
;
275 cpu_register_physical_memory(e_phys
, e_size
, region
->memory_index
);
277 /* deal with MSI-X MMIO page */
278 if (real_region
->base_addr
<= r_dev
->msix_table_addr
&&
279 real_region
->base_addr
+ real_region
->size
>=
280 r_dev
->msix_table_addr
) {
281 int offset
= r_dev
->msix_table_addr
- real_region
->base_addr
;
283 cpu_register_physical_memory(e_phys
+ offset
,
284 TARGET_PAGE_SIZE
, r_dev
->mmio_index
);
289 fprintf(stderr
, "%s: Error: create new mapping failed\n", __func__
);
294 static void assigned_dev_ioport_map(PCIDevice
*pci_dev
, int region_num
,
295 pcibus_t addr
, pcibus_t size
, int type
)
297 AssignedDevice
*r_dev
= container_of(pci_dev
, AssignedDevice
, dev
);
298 AssignedDevRegion
*region
= &r_dev
->v_addrs
[region_num
];
299 int first_map
= (region
->e_size
== 0);
302 region
->e_physbase
= addr
;
303 region
->e_size
= size
;
305 DEBUG("e_phys=0x%" FMT_PCIBUS
" r_baseport=%x type=0x%x len=%" FMT_PCIBUS
" region_num=%d \n",
306 addr
, region
->u
.r_baseport
, type
, size
, region_num
);
308 if (first_map
&& region
->region
->resource_fd
< 0) {
309 struct ioperm_data
*data
;
311 data
= qemu_mallocz(sizeof(struct ioperm_data
));
313 fprintf(stderr
, "%s: Out of memory\n", __func__
);
317 data
->start_port
= region
->u
.r_baseport
;
318 data
->num
= region
->r_size
;
321 kvm_add_ioperm_data(data
);
323 for (env
= first_cpu
; env
; env
= env
->next_cpu
)
324 kvm_ioperm(env
, data
);
327 register_ioport_read(addr
, size
, 1, assigned_dev_ioport_readb
,
328 (r_dev
->v_addrs
+ region_num
));
329 register_ioport_read(addr
, size
, 2, assigned_dev_ioport_readw
,
330 (r_dev
->v_addrs
+ region_num
));
331 register_ioport_read(addr
, size
, 4, assigned_dev_ioport_readl
,
332 (r_dev
->v_addrs
+ region_num
));
333 register_ioport_write(addr
, size
, 1, assigned_dev_ioport_writeb
,
334 (r_dev
->v_addrs
+ region_num
));
335 register_ioport_write(addr
, size
, 2, assigned_dev_ioport_writew
,
336 (r_dev
->v_addrs
+ region_num
));
337 register_ioport_write(addr
, size
, 4, assigned_dev_ioport_writel
,
338 (r_dev
->v_addrs
+ region_num
));
341 static uint32_t assigned_dev_pci_read(PCIDevice
*d
, int pos
, int len
)
343 AssignedDevice
*pci_dev
= container_of(d
, AssignedDevice
, dev
);
346 int fd
= pci_dev
->real_device
.config_fd
;
349 ret
= pread(fd
, &val
, len
, pos
);
351 if ((ret
< 0) && (errno
== EINTR
|| errno
== EAGAIN
))
354 fprintf(stderr
, "%s: pread failed, ret = %zd errno = %d\n",
355 __func__
, ret
, errno
);
363 static uint8_t assigned_dev_pci_read_byte(PCIDevice
*d
, int pos
)
365 return (uint8_t)assigned_dev_pci_read(d
, pos
, 1);
368 static uint16_t assigned_dev_pci_read_word(PCIDevice
*d
, int pos
)
370 return (uint16_t)assigned_dev_pci_read(d
, pos
, 2);
373 static uint32_t assigned_dev_pci_read_long(PCIDevice
*d
, int pos
)
375 return assigned_dev_pci_read(d
, pos
, 4);
378 static uint8_t pci_find_cap_offset(PCIDevice
*d
, uint8_t cap
)
382 int pos
= PCI_CAPABILITY_LIST
;
385 status
= assigned_dev_pci_read_byte(d
, PCI_STATUS
);
386 if ((status
& PCI_STATUS_CAP_LIST
) == 0)
390 pos
= assigned_dev_pci_read_byte(d
, pos
);
395 id
= assigned_dev_pci_read_byte(d
, pos
+ PCI_CAP_LIST_ID
);
402 pos
+= PCI_CAP_LIST_NEXT
;
407 static void assigned_dev_pci_write_config(PCIDevice
*d
, uint32_t address
,
408 uint32_t val
, int len
)
412 AssignedDevice
*pci_dev
= container_of(d
, AssignedDevice
, dev
);
414 DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
415 ((d
->devfn
>> 3) & 0x1F), (d
->devfn
& 0x7),
416 (uint16_t) address
, val
, len
);
418 if (address
== 0x4) {
419 pci_default_write_config(d
, address
, val
, len
);
420 /* Continue to program the card */
423 if ((address
>= 0x10 && address
<= 0x24) || address
== 0x30 ||
424 address
== 0x34 || address
== 0x3c || address
== 0x3d ||
425 pci_access_cap_config(d
, address
, len
)) {
426 /* used for update-mappings (BAR emulation) */
427 pci_default_write_config(d
, address
, val
, len
);
431 DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
432 ((d
->devfn
>> 3) & 0x1F), (d
->devfn
& 0x7),
433 (uint16_t) address
, val
, len
);
435 fd
= pci_dev
->real_device
.config_fd
;
438 ret
= pwrite(fd
, &val
, len
, address
);
440 if ((ret
< 0) && (errno
== EINTR
|| errno
== EAGAIN
))
443 fprintf(stderr
, "%s: pwrite failed, ret = %zd errno = %d\n",
444 __func__
, ret
, errno
);
450 static uint32_t assigned_dev_pci_read_config(PCIDevice
*d
, uint32_t address
,
456 AssignedDevice
*pci_dev
= container_of(d
, AssignedDevice
, dev
);
458 if (address
< 0x4 || (pci_dev
->need_emulate_cmd
&& address
== 0x4) ||
459 (address
>= 0x10 && address
<= 0x24) || address
== 0x30 ||
460 address
== 0x34 || address
== 0x3c || address
== 0x3d ||
461 pci_access_cap_config(d
, address
, len
)) {
462 val
= pci_default_read_config(d
, address
, len
);
463 DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
464 (d
->devfn
>> 3) & 0x1F, (d
->devfn
& 0x7), address
, val
, len
);
468 /* vga specific, remove later */
472 fd
= pci_dev
->real_device
.config_fd
;
475 ret
= pread(fd
, &val
, len
, address
);
477 if ((ret
< 0) && (errno
== EINTR
|| errno
== EAGAIN
))
480 fprintf(stderr
, "%s: pread failed, ret = %zd errno = %d\n",
481 __func__
, ret
, errno
);
487 DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
488 (d
->devfn
>> 3) & 0x1F, (d
->devfn
& 0x7), address
, val
, len
);
490 if (!pci_dev
->cap
.available
) {
491 /* kill the special capabilities */
492 if (address
== 4 && len
== 4)
494 else if (address
== 6)
501 static int assigned_dev_register_regions(PCIRegion
*io_regions
,
502 unsigned long regions_num
,
503 AssignedDevice
*pci_dev
)
506 PCIRegion
*cur_region
= io_regions
;
508 for (i
= 0; i
< regions_num
; i
++, cur_region
++) {
509 if (!cur_region
->valid
)
511 pci_dev
->v_addrs
[i
].num
= i
;
513 /* handle memory io regions */
514 if (cur_region
->type
& IORESOURCE_MEM
) {
516 int t
= cur_region
->type
& IORESOURCE_PREFETCH
517 ? PCI_BASE_ADDRESS_MEM_PREFETCH
518 : PCI_BASE_ADDRESS_SPACE_MEMORY
;
520 if (cur_region
->size
& 0xFFF) {
521 fprintf(stderr
, "PCI region %d at address 0x%llx "
522 "has size 0x%x, which is not a multiple of 4K. "
523 "You might experience some performance hit "
525 i
, (unsigned long long)cur_region
->base_addr
,
530 /* map physical memory */
531 pci_dev
->v_addrs
[i
].e_physbase
= cur_region
->base_addr
;
532 pci_dev
->v_addrs
[i
].u
.r_virtbase
= mmap(NULL
, cur_region
->size
,
533 PROT_WRITE
| PROT_READ
,
535 cur_region
->resource_fd
,
538 if (pci_dev
->v_addrs
[i
].u
.r_virtbase
== MAP_FAILED
) {
539 pci_dev
->v_addrs
[i
].u
.r_virtbase
= NULL
;
540 fprintf(stderr
, "%s: Error: Couldn't mmap 0x%x!"
542 (uint32_t) (cur_region
->base_addr
));
546 pci_dev
->v_addrs
[i
].r_size
= cur_region
->size
;
547 pci_dev
->v_addrs
[i
].e_size
= 0;
550 pci_dev
->v_addrs
[i
].u
.r_virtbase
+=
551 (cur_region
->base_addr
& 0xFFF);
555 void *virtbase
= pci_dev
->v_addrs
[i
].u
.r_virtbase
;
557 snprintf(name
, sizeof(name
), "%s.bar%d",
558 pci_dev
->dev
.qdev
.info
->name
, i
);
559 pci_dev
->v_addrs
[i
].memory_index
=
560 qemu_ram_alloc_from_ptr(
562 name
, cur_region
->size
,
565 pci_dev
->v_addrs
[i
].memory_index
= 0;
567 pci_register_bar((PCIDevice
*) pci_dev
, i
,
569 slow_map
? assigned_dev_iomem_map_slow
570 : assigned_dev_iomem_map
);
573 /* handle port io regions */
577 /* Test kernel support for ioport resource read/write. Old
578 * kernels return EIO. New kernels only allow 1/2/4 byte reads
579 * so should return EINVAL for a 3 byte read */
580 ret
= pread(pci_dev
->v_addrs
[i
].region
->resource_fd
, &val
, 3, 0);
582 fprintf(stderr
, "I/O port resource supports 3 byte read?!\n");
584 } else if (errno
!= EINVAL
) {
585 fprintf(stderr
, "Using raw in/out ioport access (sysfs - %s)\n",
587 close(pci_dev
->v_addrs
[i
].region
->resource_fd
);
588 pci_dev
->v_addrs
[i
].region
->resource_fd
= -1;
591 pci_dev
->v_addrs
[i
].e_physbase
= cur_region
->base_addr
;
592 pci_dev
->v_addrs
[i
].u
.r_baseport
= cur_region
->base_addr
;
593 pci_dev
->v_addrs
[i
].r_size
= cur_region
->size
;
594 pci_dev
->v_addrs
[i
].e_size
= 0;
596 pci_register_bar((PCIDevice
*) pci_dev
, i
,
597 cur_region
->size
, PCI_BASE_ADDRESS_SPACE_IO
,
598 assigned_dev_ioport_map
);
600 /* not relevant for port io */
601 pci_dev
->v_addrs
[i
].memory_index
= 0;
609 static int get_real_id(const char *devpath
, const char *idname
, uint16_t *val
)
615 snprintf(name
, sizeof(name
), "%s%s", devpath
, idname
);
616 f
= fopen(name
, "r");
618 fprintf(stderr
, "%s: %s: %m\n", __func__
, name
);
621 if (fscanf(f
, "%li\n", &id
) == 1) {
/*
 * Fetch the host device's vendor ID.
 *
 * devpath: directory prefix of the device; get_real_id() concatenates it
 *          directly with the attribute name, so it must already end in '/'.
 * val:     output location for the 16-bit ID.
 *
 * Thin wrapper around get_real_id() for the "vendor" attribute; the return
 * value is passed through unchanged.
 * NOTE(review): callers treat a non-zero return as failure -- confirm
 * against get_real_id().
 */
static int get_real_vendor_id(const char *devpath, uint16_t *val)
{
    return get_real_id(devpath, "vendor", val);
}
/*
 * Fetch the host device's device ID.
 *
 * devpath: directory prefix of the device; get_real_id() concatenates it
 *          directly with the attribute name, so it must already end in '/'.
 * val:     output location for the 16-bit ID.
 *
 * Thin wrapper around get_real_id() for the "device" attribute; the return
 * value is passed through unchanged.
 * NOTE(review): callers treat a non-zero return as failure -- confirm
 * against get_real_id().
 */
static int get_real_device_id(const char *devpath, uint16_t *val)
{
    return get_real_id(devpath, "device", val);
}
641 static int get_real_device(AssignedDevice
*pci_dev
, uint16_t r_seg
,
642 uint8_t r_bus
, uint8_t r_dev
, uint8_t r_func
)
644 char dir
[128], name
[128];
647 unsigned long long start
, end
, size
, flags
;
651 PCIDevRegions
*dev
= &pci_dev
->real_device
;
653 dev
->region_number
= 0;
655 snprintf(dir
, sizeof(dir
), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/",
656 r_seg
, r_bus
, r_dev
, r_func
);
658 snprintf(name
, sizeof(name
), "%sconfig", dir
);
660 if (pci_dev
->configfd_name
&& *pci_dev
->configfd_name
) {
661 if (qemu_isdigit(pci_dev
->configfd_name
[0])) {
662 dev
->config_fd
= strtol(pci_dev
->configfd_name
, NULL
, 0);
664 dev
->config_fd
= monitor_get_fd(cur_mon
, pci_dev
->configfd_name
);
665 if (dev
->config_fd
< 0) {
666 fprintf(stderr
, "%s: (%s) unkown\n", __func__
,
667 pci_dev
->configfd_name
);
672 dev
->config_fd
= open(name
, O_RDWR
);
674 if (dev
->config_fd
== -1) {
675 fprintf(stderr
, "%s: %s: %m\n", __func__
, name
);
680 r
= read(dev
->config_fd
, pci_dev
->dev
.config
,
681 pci_config_size(&pci_dev
->dev
));
683 if (errno
== EINTR
|| errno
== EAGAIN
)
685 fprintf(stderr
, "%s: read failed, errno = %d\n", __func__
, errno
);
688 /* Clear host resource mapping info. If we choose not to register a
689 * BAR, such as might be the case with the option ROM, we can get
690 * confusing, unwritable, residual addresses from the host here. */
691 memset(&pci_dev
->dev
.config
[PCI_BASE_ADDRESS_0
], 0, 24);
692 memset(&pci_dev
->dev
.config
[PCI_ROM_ADDRESS
], 0, 4);
694 snprintf(name
, sizeof(name
), "%sresource", dir
);
696 f
= fopen(name
, "r");
698 fprintf(stderr
, "%s: %s: %m\n", __func__
, name
);
702 for (r
= 0; r
< PCI_ROM_SLOT
; r
++) {
703 if (fscanf(f
, "%lli %lli %lli\n", &start
, &end
, &flags
) != 3)
706 rp
= dev
->regions
+ r
;
708 rp
->resource_fd
= -1;
709 size
= end
- start
+ 1;
710 flags
&= IORESOURCE_IO
| IORESOURCE_MEM
| IORESOURCE_PREFETCH
;
711 if (size
== 0 || (flags
& ~IORESOURCE_PREFETCH
) == 0)
713 if (flags
& IORESOURCE_MEM
) {
714 flags
&= ~IORESOURCE_IO
;
716 flags
&= ~IORESOURCE_PREFETCH
;
718 snprintf(name
, sizeof(name
), "%sresource%d", dir
, r
);
719 fd
= open(name
, O_RDWR
);
722 rp
->resource_fd
= fd
;
726 rp
->base_addr
= start
;
728 pci_dev
->v_addrs
[r
].region
= rp
;
729 DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
730 r
, rp
->size
, start
, rp
->type
, rp
->resource_fd
);
735 /* read and fill vendor ID */
736 v
= get_real_vendor_id(dir
, &id
);
740 pci_dev
->dev
.config
[0] = id
& 0xff;
741 pci_dev
->dev
.config
[1] = (id
& 0xff00) >> 8;
743 /* read and fill device ID */
744 v
= get_real_device_id(dir
, &id
);
748 pci_dev
->dev
.config
[2] = id
& 0xff;
749 pci_dev
->dev
.config
[3] = (id
& 0xff00) >> 8;
751 /* dealing with virtual function device */
752 snprintf(name
, sizeof(name
), "%sphysfn/", dir
);
753 if (!stat(name
, &statbuf
))
754 pci_dev
->need_emulate_cmd
= 1;
756 pci_dev
->need_emulate_cmd
= 0;
758 dev
->region_number
= r
;
762 static QLIST_HEAD(, AssignedDevice
) devs
= QLIST_HEAD_INITIALIZER(devs
);
764 #ifdef KVM_CAP_IRQ_ROUTING
765 static void free_dev_irq_entries(AssignedDevice
*dev
)
769 for (i
= 0; i
< dev
->irq_entries_nr
; i
++)
770 kvm_del_routing_entry(&dev
->entry
[i
]);
773 dev
->irq_entries_nr
= 0;
777 static void free_assigned_device(AssignedDevice
*dev
)
782 for (i
= 0; i
< dev
->real_device
.region_number
; i
++) {
783 PCIRegion
*pci_region
= &dev
->real_device
.regions
[i
];
784 AssignedDevRegion
*region
= &dev
->v_addrs
[i
];
786 if (!pci_region
->valid
)
789 if (pci_region
->type
& IORESOURCE_IO
) {
790 if (pci_region
->resource_fd
< 0) {
791 kvm_remove_ioperm_data(region
->u
.r_baseport
,
794 } else if (pci_region
->type
& IORESOURCE_MEM
) {
795 if (region
->u
.r_virtbase
) {
796 if (region
->memory_index
) {
797 cpu_register_physical_memory(region
->e_physbase
,
800 qemu_ram_unmap(region
->memory_index
);
802 if (munmap(region
->u
.r_virtbase
,
803 (pci_region
->size
+ 0xFFF) & 0xFFFFF000))
805 "Failed to unmap assigned device region: %s\n",
809 if (pci_region
->resource_fd
>= 0) {
810 close(pci_region
->resource_fd
);
814 if (dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSIX
)
815 assigned_dev_unregister_msix_mmio(dev
);
817 if (dev
->real_device
.config_fd
>= 0) {
818 close(dev
->real_device
.config_fd
);
821 #ifdef KVM_CAP_IRQ_ROUTING
822 free_dev_irq_entries(dev
);
/*
 * Pack a host PCI address into the 32-bit identifier stored in the
 * assigned_dev_id fields of the KVM assignment structures.
 *
 * Layout of the result:
 *   bits 31..16  seg    (PCI segment)
 *   bits 15..8   bus    (bus number)
 *   bits  7..0   devfn  (combined device/function byte)
 *
 * Each operand is widened to uint32_t before shifting so the shift is
 * never performed on a promoted signed int.
 */
static uint32_t calc_assigned_dev_id(uint16_t seg, uint8_t bus, uint8_t devfn)
{
    return ((uint32_t)seg << 16) | ((uint32_t)bus << 8) | (uint32_t)devfn;
}
832 static void assign_failed_examine(AssignedDevice
*dev
)
834 char name
[PATH_MAX
], dir
[PATH_MAX
], driver
[PATH_MAX
] = {}, *ns
;
835 uint16_t vendor_id
, device_id
;
838 sprintf(dir
, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
839 dev
->host
.seg
, dev
->host
.bus
, dev
->host
.dev
, dev
->host
.func
);
841 sprintf(name
, "%sdriver", dir
);
843 r
= readlink(name
, driver
, sizeof(driver
));
844 if ((r
<= 0) || r
>= sizeof(driver
) || !(ns
= strrchr(driver
, '/'))) {
850 if (get_real_vendor_id(dir
, &vendor_id
) ||
851 get_real_device_id(dir
, &device_id
)) {
855 fprintf(stderr
, "*** The driver '%s' is occupying your device "
856 "%04x:%02x:%02x.%x.\n",
857 ns
, dev
->host
.seg
, dev
->host
.bus
, dev
->host
.dev
, dev
->host
.func
);
858 fprintf(stderr
, "***\n");
859 fprintf(stderr
, "*** You can try the following commands to free it:\n");
860 fprintf(stderr
, "***\n");
861 fprintf(stderr
, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
862 "new_id\n", vendor_id
, device_id
);
863 fprintf(stderr
, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
865 dev
->host
.seg
, dev
->host
.bus
, dev
->host
.dev
, dev
->host
.func
, ns
);
866 fprintf(stderr
, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
868 dev
->host
.seg
, dev
->host
.bus
, dev
->host
.dev
, dev
->host
.func
);
869 fprintf(stderr
, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
870 "/remove_id\n", vendor_id
, device_id
);
871 fprintf(stderr
, "***\n");
876 fprintf(stderr
, "Couldn't find out why.\n");
879 static int assign_device(AssignedDevice
*dev
)
881 struct kvm_assigned_pci_dev assigned_dev_data
;
884 #ifdef KVM_CAP_PCI_SEGMENT
885 /* Only pass non-zero PCI segment to capable module */
886 if (!kvm_check_extension(kvm_state
, KVM_CAP_PCI_SEGMENT
) &&
888 fprintf(stderr
, "Can't assign device inside non-zero PCI segment "
889 "as this KVM module doesn't support it.\n");
894 memset(&assigned_dev_data
, 0, sizeof(assigned_dev_data
));
895 assigned_dev_data
.assigned_dev_id
=
896 calc_assigned_dev_id(dev
->h_segnr
, dev
->h_busnr
, dev
->h_devfn
);
897 #ifdef KVM_CAP_PCI_SEGMENT
898 assigned_dev_data
.segnr
= dev
->h_segnr
;
900 assigned_dev_data
.busnr
= dev
->h_busnr
;
901 assigned_dev_data
.devfn
= dev
->h_devfn
;
904 /* We always enable the IOMMU unless disabled on the command line */
905 if (dev
->features
& ASSIGNED_DEVICE_USE_IOMMU_MASK
) {
906 if (!kvm_check_extension(kvm_state
, KVM_CAP_IOMMU
)) {
907 fprintf(stderr
, "No IOMMU found. Unable to assign device \"%s\"\n",
911 assigned_dev_data
.flags
|= KVM_DEV_ASSIGN_ENABLE_IOMMU
;
914 dev
->features
&= ~ASSIGNED_DEVICE_USE_IOMMU_MASK
;
916 if (!(dev
->features
& ASSIGNED_DEVICE_USE_IOMMU_MASK
)) {
918 "WARNING: Assigning a device without IOMMU protection can "
919 "cause host memory corruption if the device issues DMA write "
923 r
= kvm_assign_pci_device(kvm_context
, &assigned_dev_data
);
925 fprintf(stderr
, "Failed to assign device \"%s\" : %s\n",
926 dev
->dev
.qdev
.id
, strerror(-r
));
930 assign_failed_examine(dev
);
939 static int assign_irq(AssignedDevice
*dev
)
941 struct kvm_assigned_irq assigned_irq_data
;
944 /* Interrupt PIN 0 means don't use INTx */
945 if (assigned_dev_pci_read_byte(&dev
->dev
, PCI_INTERRUPT_PIN
) == 0)
948 irq
= pci_map_irq(&dev
->dev
, dev
->intpin
);
949 irq
= piix_get_irq(irq
);
952 irq
= ipf_map_irq(&dev
->dev
, irq
);
955 if (dev
->girq
== irq
)
958 memset(&assigned_irq_data
, 0, sizeof(assigned_irq_data
));
959 assigned_irq_data
.assigned_dev_id
=
960 calc_assigned_dev_id(dev
->h_segnr
, dev
->h_busnr
, dev
->h_devfn
);
961 assigned_irq_data
.guest_irq
= irq
;
962 assigned_irq_data
.host_irq
= dev
->real_device
.irq
;
963 #ifdef KVM_CAP_ASSIGN_DEV_IRQ
964 if (dev
->irq_requested_type
) {
965 assigned_irq_data
.flags
= dev
->irq_requested_type
;
966 r
= kvm_deassign_irq(kvm_context
, &assigned_irq_data
);
967 /* -ENXIO means no assigned irq */
968 if (r
&& r
!= -ENXIO
)
969 perror("assign_irq: deassign");
972 assigned_irq_data
.flags
= KVM_DEV_IRQ_GUEST_INTX
;
973 if (dev
->features
& ASSIGNED_DEVICE_PREFER_MSI_MASK
&&
974 dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSI
)
975 assigned_irq_data
.flags
|= KVM_DEV_IRQ_HOST_MSI
;
977 assigned_irq_data
.flags
|= KVM_DEV_IRQ_HOST_INTX
;
980 r
= kvm_assign_irq(kvm_context
, &assigned_irq_data
);
982 fprintf(stderr
, "Failed to assign irq for \"%s\": %s\n",
983 dev
->dev
.qdev
.id
, strerror(-r
));
984 fprintf(stderr
, "Perhaps you are assigning a device "
985 "that shares an IRQ with another device?\n");
990 dev
->irq_requested_type
= assigned_irq_data
.flags
;
994 static void deassign_device(AssignedDevice
*dev
)
996 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT
997 struct kvm_assigned_pci_dev assigned_dev_data
;
1000 memset(&assigned_dev_data
, 0, sizeof(assigned_dev_data
));
1001 assigned_dev_data
.assigned_dev_id
=
1002 calc_assigned_dev_id(dev
->h_segnr
, dev
->h_busnr
, dev
->h_devfn
);
1004 r
= kvm_deassign_pci_device(kvm_context
, &assigned_dev_data
);
1006 fprintf(stderr
, "Failed to deassign device \"%s\" : %s\n",
1007 dev
->dev
.qdev
.id
, strerror(-r
));
1012 AssignedDevInfo
*get_assigned_device(int pcibus
, int slot
)
1014 AssignedDevice
*assigned_dev
= NULL
;
1015 AssignedDevInfo
*adev
= NULL
;
1017 QLIST_FOREACH(adev
, &adev_head
, next
) {
1018 assigned_dev
= adev
->assigned_dev
;
1019 if (pci_bus_num(assigned_dev
->dev
.bus
) == pcibus
&&
1020 PCI_SLOT(assigned_dev
->dev
.devfn
) == slot
)
1028 /* The pci config space got updated. Check if irq numbers have changed
1031 void assigned_dev_update_irqs(void)
1033 AssignedDevice
*dev
, *next
;
1036 dev
= QLIST_FIRST(&devs
);
1038 next
= QLIST_NEXT(dev
, next
);
1039 r
= assign_irq(dev
);
1041 qdev_unplug(&dev
->dev
.qdev
);
1046 #ifdef KVM_CAP_IRQ_ROUTING
1048 #ifdef KVM_CAP_DEVICE_MSI
1049 static void assigned_dev_update_msi(PCIDevice
*pci_dev
, unsigned int ctrl_pos
)
1051 struct kvm_assigned_irq assigned_irq_data
;
1052 AssignedDevice
*assigned_dev
= container_of(pci_dev
, AssignedDevice
, dev
);
1053 uint8_t ctrl_byte
= pci_dev
->config
[ctrl_pos
];
1056 memset(&assigned_irq_data
, 0, sizeof assigned_irq_data
);
1057 assigned_irq_data
.assigned_dev_id
=
1058 calc_assigned_dev_id(assigned_dev
->h_segnr
, assigned_dev
->h_busnr
,
1059 (uint8_t)assigned_dev
->h_devfn
);
1061 /* Some guests gratuitously disable MSI even if they're not using it,
1062 * try to catch this by only deassigning irqs if the guest is using
1063 * MSI or intends to start. */
1064 if ((assigned_dev
->irq_requested_type
& KVM_DEV_IRQ_GUEST_MSI
) ||
1065 (ctrl_byte
& PCI_MSI_FLAGS_ENABLE
)) {
1067 assigned_irq_data
.flags
= assigned_dev
->irq_requested_type
;
1068 free_dev_irq_entries(assigned_dev
);
1069 r
= kvm_deassign_irq(kvm_context
, &assigned_irq_data
);
1070 /* -ENXIO means no assigned irq */
1071 if (r
&& r
!= -ENXIO
)
1072 perror("assigned_dev_update_msi: deassign irq");
1074 assigned_dev
->irq_requested_type
= 0;
1077 if (ctrl_byte
& PCI_MSI_FLAGS_ENABLE
) {
1078 assigned_dev
->entry
= calloc(1, sizeof(struct kvm_irq_routing_entry
));
1079 if (!assigned_dev
->entry
) {
1080 perror("assigned_dev_update_msi: ");
1083 assigned_dev
->entry
->u
.msi
.address_lo
=
1084 *(uint32_t *)(pci_dev
->config
+ pci_dev
->cap
.start
+
1085 PCI_MSI_ADDRESS_LO
);
1086 assigned_dev
->entry
->u
.msi
.address_hi
= 0;
1087 assigned_dev
->entry
->u
.msi
.data
= *(uint16_t *)(pci_dev
->config
+
1088 pci_dev
->cap
.start
+ PCI_MSI_DATA_32
);
1089 assigned_dev
->entry
->type
= KVM_IRQ_ROUTING_MSI
;
1090 r
= kvm_get_irq_route_gsi();
1092 perror("assigned_dev_update_msi: kvm_get_irq_route_gsi");
1095 assigned_dev
->entry
->gsi
= r
;
1097 kvm_add_routing_entry(assigned_dev
->entry
);
1098 if (kvm_commit_irq_routes() < 0) {
1099 perror("assigned_dev_update_msi: kvm_commit_irq_routes");
1100 assigned_dev
->cap
.state
&= ~ASSIGNED_DEVICE_MSI_ENABLED
;
1103 assigned_dev
->irq_entries_nr
= 1;
1105 assigned_irq_data
.guest_irq
= assigned_dev
->entry
->gsi
;
1106 assigned_irq_data
.flags
= KVM_DEV_IRQ_HOST_MSI
| KVM_DEV_IRQ_GUEST_MSI
;
1107 if (kvm_assign_irq(kvm_context
, &assigned_irq_data
) < 0)
1108 perror("assigned_dev_enable_msi: assign irq");
1110 assigned_dev
->irq_requested_type
= assigned_irq_data
.flags
;
1115 #ifdef KVM_CAP_DEVICE_MSIX
1116 static int assigned_dev_update_msix_mmio(PCIDevice
*pci_dev
)
1118 AssignedDevice
*adev
= container_of(pci_dev
, AssignedDevice
, dev
);
1119 uint16_t entries_nr
= 0, entries_max_nr
;
1120 int pos
= 0, i
, r
= 0;
1121 uint32_t msg_addr
, msg_upper_addr
, msg_data
, msg_ctrl
;
1122 struct kvm_assigned_msix_nr msix_nr
;
1123 struct kvm_assigned_msix_entry msix_entry
;
1124 void *va
= adev
->msix_table_page
;
1126 if (adev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSI
)
1127 pos
= pci_dev
->cap
.start
+ PCI_CAPABILITY_CONFIG_MSI_LENGTH
;
1129 pos
= pci_dev
->cap
.start
;
1131 entries_max_nr
= *(uint16_t *)(pci_dev
->config
+ pos
+ 2);
1132 entries_max_nr
&= PCI_MSIX_TABSIZE
;
1133 entries_max_nr
+= 1;
1135 /* Get the usable entry number for allocating */
1136 for (i
= 0; i
< entries_max_nr
; i
++) {
1137 memcpy(&msg_ctrl
, va
+ i
* 16 + 12, 4);
1138 memcpy(&msg_data
, va
+ i
* 16 + 8, 4);
1139 /* Ignore unused entry even if it's unmasked */
1145 if (entries_nr
== 0) {
1146 fprintf(stderr
, "MSI-X entry number is zero!\n");
1149 msix_nr
.assigned_dev_id
= calc_assigned_dev_id(adev
->h_segnr
, adev
->h_busnr
,
1150 (uint8_t)adev
->h_devfn
);
1151 msix_nr
.entry_nr
= entries_nr
;
1152 r
= kvm_assign_set_msix_nr(kvm_context
, &msix_nr
);
1154 fprintf(stderr
, "fail to set MSI-X entry number for MSIX! %s\n",
1159 free_dev_irq_entries(adev
);
1160 adev
->irq_entries_nr
= entries_nr
;
1161 adev
->entry
= calloc(entries_nr
, sizeof(struct kvm_irq_routing_entry
));
1163 perror("assigned_dev_update_msix_mmio: ");
1167 msix_entry
.assigned_dev_id
= msix_nr
.assigned_dev_id
;
1169 for (i
= 0; i
< entries_max_nr
; i
++) {
1170 if (entries_nr
>= msix_nr
.entry_nr
)
1172 memcpy(&msg_ctrl
, va
+ i
* 16 + 12, 4);
1173 memcpy(&msg_data
, va
+ i
* 16 + 8, 4);
1177 memcpy(&msg_addr
, va
+ i
* 16, 4);
1178 memcpy(&msg_upper_addr
, va
+ i
* 16 + 4, 4);
1180 r
= kvm_get_irq_route_gsi();
1184 adev
->entry
[entries_nr
].gsi
= r
;
1185 adev
->entry
[entries_nr
].type
= KVM_IRQ_ROUTING_MSI
;
1186 adev
->entry
[entries_nr
].flags
= 0;
1187 adev
->entry
[entries_nr
].u
.msi
.address_lo
= msg_addr
;
1188 adev
->entry
[entries_nr
].u
.msi
.address_hi
= msg_upper_addr
;
1189 adev
->entry
[entries_nr
].u
.msi
.data
= msg_data
;
1190 DEBUG("MSI-X data 0x%x, MSI-X addr_lo 0x%x\n!", msg_data
, msg_addr
);
1191 kvm_add_routing_entry(&adev
->entry
[entries_nr
]);
1193 msix_entry
.gsi
= adev
->entry
[entries_nr
].gsi
;
1194 msix_entry
.entry
= i
;
1195 r
= kvm_assign_set_msix_entry(kvm_context
, &msix_entry
);
1197 fprintf(stderr
, "fail to set MSI-X entry! %s\n", strerror(-r
));
1200 DEBUG("MSI-X entry gsi 0x%x, entry %d\n!",
1201 msix_entry
.gsi
, msix_entry
.entry
);
1205 if (r
== 0 && kvm_commit_irq_routes() < 0) {
1206 perror("assigned_dev_update_msix_mmio: kvm_commit_irq_routes");
/*
 * assigned_dev_update_msix - re-sync the KVM irq assignment with the MSI-X
 * control word located at pci_dev->config + ctrl_pos.
 *
 * pci_dev:  PCIDevice embedded in an AssignedDevice (recovered with
 *           container_of()).
 * ctrl_pos: byte offset of the 16-bit MSI-X control word in config space.
 *
 * If guest MSI-X was previously requested, or the control word now has
 * PCI_MSIX_ENABLE set, the routing entries are freed and the irq is
 * deassigned with kvm_deassign_irq(); a -ENXIO result is not reported.
 * If PCI_MSIX_ENABLE is set, the MSI-X entries are reprogrammed through
 * assigned_dev_update_msix_mmio() and the irq is assigned again with the
 * host+guest MSI-X flags, which are then recorded in irq_requested_type.
 *
 * NOTE(review): the embedded listing numbers skip lines (e.g. 1218, 1247,
 * 1251), so the declaration of `r` and whatever follows each perror() in the
 * two failure branches are not visible here -- confirm against the full file.
 * NOTE(review): perror() prints errno, while kvm_deassign_irq() appears to
 * report its error through the return value -- confirm the message is
 * meaningful.
 */
1213 static void assigned_dev_update_msix(PCIDevice
*pci_dev
, unsigned int ctrl_pos
)
1215 struct kvm_assigned_irq assigned_irq_data
;
1216 AssignedDevice
*assigned_dev
= container_of(pci_dev
, AssignedDevice
, dev
);
1217 uint16_t *ctrl_word
= (uint16_t *)(pci_dev
->config
+ ctrl_pos
);
/* Start from a zeroed request that only identifies the host device. */
1220 memset(&assigned_irq_data
, 0, sizeof assigned_irq_data
);
1221 assigned_irq_data
.assigned_dev_id
=
1222 calc_assigned_dev_id(assigned_dev
->h_segnr
, assigned_dev
->h_busnr
,
1223 (uint8_t)assigned_dev
->h_devfn
);
1225 /* Some guests gratuitously disable MSIX even if they're not using it,
1226 * try to catch this by only deassigning irqs if the guest is using
1227 * MSIX or intends to start. */
1228 if ((assigned_dev
->irq_requested_type
& KVM_DEV_IRQ_GUEST_MSIX
) ||
1229 (*ctrl_word
& PCI_MSIX_ENABLE
)) {
1231 assigned_irq_data
.flags
= assigned_dev
->irq_requested_type
;
1232 free_dev_irq_entries(assigned_dev
);
1233 r
= kvm_deassign_irq(kvm_context
, &assigned_irq_data
);
1234 /* -ENXIO means no assigned irq */
1235 if (r
&& r
!= -ENXIO
)
1236 perror("assigned_dev_update_msix: deassign irq");
1238 assigned_dev
->irq_requested_type
= 0;
/* Guest has MSI-X enabled: program the entries, then assign the irq. */
1241 if (*ctrl_word
& PCI_MSIX_ENABLE
) {
1242 assigned_irq_data
.flags
= KVM_DEV_IRQ_HOST_MSIX
|
1243 KVM_DEV_IRQ_GUEST_MSIX
;
1245 if (assigned_dev_update_msix_mmio(pci_dev
) < 0) {
1246 perror("assigned_dev_update_msix_mmio");
1249 if (kvm_assign_irq(kvm_context
, &assigned_irq_data
) < 0) {
1250 perror("assigned_dev_enable_msix: assign irq");
1253 assigned_dev
->irq_requested_type
= assigned_irq_data
.flags
;
/*
 * assigned_device_pci_cap_write_config - write hook for the emulated
 * capability area of an assigned device.
 *
 * The write is first forwarded to pci_default_cap_write_config().  Then,
 * for each capability flagged in assigned_dev->cap.available, the written
 * byte range [address, address + len) is tested against that capability's
 * control position; on a hit the matching update routine
 * (assigned_dev_update_msi / assigned_dev_update_msix) is called.
 *
 * `pos` starts at pci_dev->cap.start and is advanced by each capability's
 * configured length.
 *
 * NOTE(review): listing line 1277 is missing from this view, so the
 * statement that sets ctrl_pos for the MSI-X branch is not visible; the
 * "ctrl_pos--" below suggests it initially points at the upper byte of the
 * control word -- confirm.  The #endif lines closing the two #ifdef blocks
 * and the end of the function are not visible either.
 */
1259 static void assigned_device_pci_cap_write_config(PCIDevice
*pci_dev
, uint32_t address
,
1260 uint32_t val
, int len
)
1262 AssignedDevice
*assigned_dev
= container_of(pci_dev
, AssignedDevice
, dev
);
1263 unsigned int pos
= pci_dev
->cap
.start
, ctrl_pos
;
1265 pci_default_cap_write_config(pci_dev
, address
, val
, len
);
1266 #ifdef KVM_CAP_IRQ_ROUTING
1267 #ifdef KVM_CAP_DEVICE_MSI
1268 if (assigned_dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSI
) {
1269 ctrl_pos
= pos
+ PCI_MSI_FLAGS
;
/* Only react when the write overlaps the MSI flags position. */
1270 if (address
<= ctrl_pos
&& address
+ len
> ctrl_pos
)
1271 assigned_dev_update_msi(pci_dev
, ctrl_pos
);
1272 pos
+= PCI_CAPABILITY_CONFIG_MSI_LENGTH
;
1275 #ifdef KVM_CAP_DEVICE_MSIX
1276 if (assigned_dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSIX
) {
1278 if (address
<= ctrl_pos
&& address
+ len
> ctrl_pos
) {
1279 ctrl_pos
--; /* control is word long */
1280 assigned_dev_update_msix(pci_dev
, ctrl_pos
);
1282 pos
+= PCI_CAPABILITY_CONFIG_MSIX_LENGTH
;
/*
 * assigned_device_pci_cap_init - build the emulated capability list of an
 * assigned device.
 *
 * Marks capabilities as supported, resets the emulated capability area
 * (cap.start = PCI_CAPABILITY_CONFIG_DEFAULT_START_ADDR, cap.length = 0),
 * sets PCI_STATUS_CAP_LIST in the status register and points
 * PCI_CAPABILITY_LIST at the area.  For each of MSI and MSI-X reported by
 * pci_find_cap_offset(), the corresponding availability bit is set, the next
 * free slot in the area is zeroed and filled in, and cap.length grows by
 * that capability's configured length.
 *
 * For MSI-X, the word at capability offset 2 and the PCI_MSIX_TABLE and
 * PCI_MSIX_PBA dwords are read through assigned_dev_pci_read_word/long()
 * and copied into the emulated capability.  dev->msix_table_addr is computed
 * as the base address of the BAR selected by the PCI_MSIX_BIR bits plus the
 * table offset with those bits cleared.
 *
 * NOTE(review): several listing lines are missing from this view (1310,
 * 1312-1314, 1325, 1343-1344, 1346-1351).  Not visible: the value stored as
 * the MSI capability ID, any update of next_cap_pt in the MSI branch, the
 * mask applied to entry_nr, the #endif lines and the return statement.
 * NOTE(review): the stores through (uint16_t *) and (uint32_t *) casts
 * assume suitable alignment and host byte order of config space -- confirm.
 * NOTE(review): bar_nr indexes pci_region[] without a visible range check.
 */
1289 static int assigned_device_pci_cap_init(PCIDevice
*pci_dev
)
1291 AssignedDevice
*dev
= container_of(pci_dev
, AssignedDevice
, dev
);
1292 PCIRegion
*pci_region
= dev
->real_device
.regions
;
1293 int next_cap_pt
= 0;
1295 pci_dev
->cap
.supported
= 1;
1296 pci_dev
->cap
.start
= PCI_CAPABILITY_CONFIG_DEFAULT_START_ADDR
;
1297 pci_dev
->cap
.length
= 0;
1298 pci_dev
->config
[PCI_STATUS
] |= PCI_STATUS_CAP_LIST
;
1299 pci_dev
->config
[PCI_CAPABILITY_LIST
] = pci_dev
->cap
.start
;
1301 #ifdef KVM_CAP_IRQ_ROUTING
1302 #ifdef KVM_CAP_DEVICE_MSI
1303 /* Expose MSI capability
1304 * MSI capability is the 1st capability in capability config */
1305 if (pci_find_cap_offset(pci_dev
, PCI_CAP_ID_MSI
)) {
1306 dev
->cap
.available
|= ASSIGNED_DEVICE_CAP_MSI
;
1307 memset(&pci_dev
->config
[pci_dev
->cap
.start
+ pci_dev
->cap
.length
],
1308 0, PCI_CAPABILITY_CONFIG_MSI_LENGTH
);
1309 pci_dev
->config
[pci_dev
->cap
.start
+ pci_dev
->cap
.length
] =
1311 pci_dev
->cap
.length
+= PCI_CAPABILITY_CONFIG_MSI_LENGTH
;
1315 #ifdef KVM_CAP_DEVICE_MSIX
1316 /* Expose MSI-X capability */
1317 if (pci_find_cap_offset(pci_dev
, PCI_CAP_ID_MSIX
)) {
1318 int pos
, entry_nr
, bar_nr
;
1319 uint32_t msix_table_entry
;
1320 dev
->cap
.available
|= ASSIGNED_DEVICE_CAP_MSIX
;
1321 memset(&pci_dev
->config
[pci_dev
->cap
.start
+ pci_dev
->cap
.length
],
1322 0, PCI_CAPABILITY_CONFIG_MSIX_LENGTH
);
1323 pos
= pci_find_cap_offset(pci_dev
, PCI_CAP_ID_MSIX
);
1324 entry_nr
= assigned_dev_pci_read_word(pci_dev
, pos
+ 2) &
/* 0x11 is written as the capability ID byte (presumably PCI_CAP_ID_MSIX). */
1326 pci_dev
->config
[pci_dev
->cap
.start
+ pci_dev
->cap
.length
] = 0x11;
1327 *(uint16_t *)(pci_dev
->config
+ pci_dev
->cap
.start
+
1328 pci_dev
->cap
.length
+ 2) = entry_nr
;
1329 msix_table_entry
= assigned_dev_pci_read_long(pci_dev
,
1330 pos
+ PCI_MSIX_TABLE
);
1331 *(uint32_t *)(pci_dev
->config
+ pci_dev
->cap
.start
+
1332 pci_dev
->cap
.length
+ PCI_MSIX_TABLE
) = msix_table_entry
;
1333 *(uint32_t *)(pci_dev
->config
+ pci_dev
->cap
.start
+
1334 pci_dev
->cap
.length
+ PCI_MSIX_PBA
) =
1335 assigned_dev_pci_read_long(pci_dev
, pos
+ PCI_MSIX_PBA
);
/* Split the table dword into BAR index (BIR bits) and offset within it. */
1336 bar_nr
= msix_table_entry
& PCI_MSIX_BIR
;
1337 msix_table_entry
&= ~PCI_MSIX_BIR
;
1338 dev
->msix_table_addr
= pci_region
[bar_nr
].base_addr
+ msix_table_entry
;
/* Chain a previously emitted capability to this one via next_cap_pt. */
1339 if (next_cap_pt
!= 0) {
1340 pci_dev
->config
[pci_dev
->cap
.start
+ next_cap_pt
] =
1341 pci_dev
->cap
.start
+ pci_dev
->cap
.length
;
1342 next_cap_pt
+= PCI_CAPABILITY_CONFIG_MSI_LENGTH
;
1345 pci_dev
->cap
.length
+= PCI_CAPABILITY_CONFIG_MSIX_LENGTH
;
/*
 * msix_mmio_readl - 32-bit read handler for the MSI-X table page kept in
 * adev->msix_table_page.
 *
 * opaque is the AssignedDevice.  The low 12 bits of addr select the byte
 * offset inside the page, and 4 bytes from there are copied into `val` with
 * memcpy().
 *
 * NOTE(review): the declaration of `val` and the return statement (listing
 * lines 1358-1363) are not visible in this view.
 * NOTE(review): an offset above 0xffc would make the 4-byte copy run past
 * the 0x1000-byte mapping -- confirm callers only issue aligned accesses.
 */
1353 static uint32_t msix_mmio_readl(void *opaque
, target_phys_addr_t addr
)
1355 AssignedDevice
*adev
= opaque
;
1356 unsigned int offset
= addr
& 0xfff;
1357 void *page
= adev
->msix_table_page
;
1360 memcpy(&val
, (void *)((char *)page
+ offset
), 4);
1365 static uint32_t msix_mmio_readb(void *opaque
, target_phys_addr_t addr
)
1367 return ((msix_mmio_readl(opaque
, addr
& ~3)) >>
1368 (8 * (addr
& 3))) & 0xff;
1371 static uint32_t msix_mmio_readw(void *opaque
, target_phys_addr_t addr
)
1373 return ((msix_mmio_readl(opaque
, addr
& ~3)) >>
1374 (8 * (addr
& 3))) & 0xffff;
/*
 * msix_mmio_writel - 32-bit write handler for the MSI-X table page kept in
 * adev->msix_table_page.
 *
 * opaque is the AssignedDevice.  The low 12 bits of addr give the byte
 * offset inside the page, where the 4 bytes of val are stored with memcpy().
 * The visible code only updates the page itself.
 *
 * NOTE(review): listing line 1385 (the arguments of the DEBUG() call) is
 * missing from this view.
 * NOTE(review): an offset above 0xffc would make the 4-byte copy run past
 * the 0x1000-byte mapping -- confirm callers only issue aligned accesses.
 */
1377 static void msix_mmio_writel(void *opaque
,
1378 target_phys_addr_t addr
, uint32_t val
)
1380 AssignedDevice
*adev
= opaque
;
1381 unsigned int offset
= addr
& 0xfff;
1382 void *page
= adev
->msix_table_page
;
1384 DEBUG("write to MSI-X entry table mmio offset 0x%lx, val 0x%x\n",
1386 memcpy((void *)((char *)page
+ offset
), &val
, 4);
1389 static void msix_mmio_writew(void *opaque
,
1390 target_phys_addr_t addr
, uint32_t val
)
1392 msix_mmio_writel(opaque
, addr
& ~3,
1393 (val
& 0xffff) << (8*(addr
& 3)));
1396 static void msix_mmio_writeb(void *opaque
,
1397 target_phys_addr_t addr
, uint32_t val
)
1399 msix_mmio_writel(opaque
, addr
& ~3,
1400 (val
& 0xff) << (8*(addr
& 3)));
1403 static CPUWriteMemoryFunc
*msix_mmio_write
[] = {
1404 msix_mmio_writeb
, msix_mmio_writew
, msix_mmio_writel
1407 static CPUReadMemoryFunc
*msix_mmio_read
[] = {
1408 msix_mmio_readb
, msix_mmio_readw
, msix_mmio_readl
/*
 * assigned_dev_register_msix_mmio - allocate the MSI-X table page and
 * register MMIO handlers for it.
 *
 * Maps one anonymous, private, read/write region of 0x1000 bytes into
 * dev->msix_table_page, reports a MAP_FAILED result on stderr, zeroes the
 * page, and registers msix_mmio_read / msix_mmio_write with
 * cpu_register_io_memory(), storing the returned index in dev->mmio_index.
 *
 * NOTE(review): listing lines 1418-1420 and 1424-1425 are missing from this
 * view, so the tail of the fprintf() call and the values returned on the
 * failure and success paths are not visible.
 * NOTE(review): fd is passed as 0 together with MAP_ANONYMOUS; the mmap
 * documentation notes that some implementations require -1 -- confirm.
 */
1411 static int assigned_dev_register_msix_mmio(AssignedDevice
*dev
)
1413 dev
->msix_table_page
= mmap(NULL
, 0x1000,
1414 PROT_READ
|PROT_WRITE
,
1415 MAP_ANONYMOUS
|MAP_PRIVATE
, 0, 0);
1416 if (dev
->msix_table_page
== MAP_FAILED
) {
1417 fprintf(stderr
, "fail allocate msix_table_page! %s\n",
1421 memset(dev
->msix_table_page
, 0, 0x1000);
1422 dev
->mmio_index
= cpu_register_io_memory(
1423 msix_mmio_read
, msix_mmio_write
, dev
);
/*
 * assigned_dev_unregister_msix_mmio - release the MSI-X table page and its
 * MMIO handlers.
 *
 * Tests dev->msix_table_page for NULL first, then unregisters the handlers
 * identified by dev->mmio_index, clears that index, unmaps the 0x1000-byte
 * page (reporting a munmap() failure on stderr) and resets the pointer to
 * NULL.
 *
 * NOTE(review): the statement guarded by the NULL test (listing line 1430)
 * is missing from this view -- presumably an early return; confirm.  The
 * tail of the fprintf() call (line 1437) is missing as well.
 * NOTE(review): a failed mmap() would leave MAP_FAILED, not NULL, in
 * msix_table_page -- confirm the init error path never reaches this
 * function in that state.
 */
1427 static void assigned_dev_unregister_msix_mmio(AssignedDevice
*dev
)
1429 if (!dev
->msix_table_page
)
1432 cpu_unregister_io_memory(dev
->mmio_index
);
1433 dev
->mmio_index
= 0;
1435 if (munmap(dev
->msix_table_page
, 0x1000) == -1) {
1436 fprintf(stderr
, "error unmapping msix_table_page! %s\n",
1439 dev
->msix_table_page
= NULL
;
/*
 * VMState descriptor for the "pci-assign" device.
 * NOTE(review): only the .name initializer is visible in this view; the rest
 * of the initializer and its closing brace are on missing listing lines.
 */
1442 static const VMStateDescription vmstate_assigned_device
= {
1443 .name
= "pci-assign"
1446 static void reset_assigned_device(DeviceState
*dev
)
1448 PCIDevice
*d
= DO_UPCAST(PCIDevice
, qdev
, dev
);
1451 * When a 0 is written to the command register, the device is logically
1452 * disconnected from the PCI bus. This avoids further DMA transfers.
1454 assigned_dev_pci_write_config(d
, PCI_COMMAND
, 0, 2);
/*
 * assigned_initfn - init callback of the "pci-assign" device.
 *
 * Visible sequence:
 *   1. fail with an error report when KVM is not enabled;
 *   2. fail when host seg, bus, dev and func are all zero ("no host device
 *      specified");
 *   3. open the host device with get_real_device();
 *   4. register the real device's MMIO/PIO BARs;
 *   5. derive the emulated slot number from devfn and the interrupt pin from
 *      config byte 0x3d (minus one), and record host segment/bus/devfn;
 *   6. install the capability write handler and build the capability list;
 *   7. assign the device and then its irq to the guest;
 *   8. when MSI-X is available, register the MSI-X table MMIO handlers;
 *   9. load the option ROM, insert the device into `devs`, and register a
 *      vmsd that is marked unmigratable.
 * The trailing deassign_device() / free_assigned_device() calls appear to be
 * the error-unwind path.
 *
 * NOTE(review): many listing lines are missing from this view (e.g. 1461,
 * 1465-1466, 1476-1478, 1483-1484, 1500, 1504-1505, 1509-1510, 1515,
 * 1524-1532): the declaration of `r`, the remaining arguments of several
 * calls, every failure action/goto label and all return statements are not
 * visible -- confirm against the full file.
 * NOTE(review): the check in step 2 also rejects host address 0000:00:00.0.
 * NOTE(review): e_device is computed here but its use is not visible.
 */
1457 static int assigned_initfn(struct PCIDevice
*pci_dev
)
1459 AssignedDevice
*dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
1460 uint8_t e_device
, e_intx
;
1463 if (!kvm_enabled()) {
1464 error_report("pci-assign: error: requires KVM support");
1468 if (!dev
->host
.seg
&& !dev
->host
.bus
&& !dev
->host
.dev
&& !dev
->host
.func
) {
1469 error_report("pci-assign: error: no host device specified");
1473 if (get_real_device(dev
, dev
->host
.seg
, dev
->host
.bus
,
1474 dev
->host
.dev
, dev
->host
.func
)) {
1475 error_report("pci-assign: Error: Couldn't get real device (%s)!",
1480 /* handle real device's MMIO/PIO BARs */
1481 if (assigned_dev_register_regions(dev
->real_device
.regions
,
1482 dev
->real_device
.region_number
,
1486 /* handle interrupt routing */
1487 e_device
= (dev
->dev
.devfn
>> 3) & 0x1f;
1488 e_intx
= dev
->dev
.config
[0x3d] - 1;
1489 dev
->intpin
= e_intx
;
1492 dev
->h_segnr
= dev
->host
.seg
;
1493 dev
->h_busnr
= dev
->host
.bus
;
1494 dev
->h_devfn
= PCI_DEVFN(dev
->host
.dev
, dev
->host
.func
);
1496 pci_register_capability_handlers(pci_dev
, NULL
,
1497 assigned_device_pci_cap_write_config
);
1499 if (assigned_device_pci_cap_init(pci_dev
) < 0)
1502 /* assign device to guest */
1503 r
= assign_device(dev
);
1507 /* assign irq for the device */
1508 r
= assign_irq(dev
);
1512 /* intercept MSI-X entry page in the MMIO */
1513 if (dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSIX
)
1514 if (assigned_dev_register_msix_mmio(dev
))
1517 assigned_dev_load_option_rom(dev
);
1518 QLIST_INSERT_HEAD(&devs
, dev
, next
);
1520 /* Register a vmsd so that we can mark it unmigratable. */
1521 vmstate_register(&dev
->dev
.qdev
, 0, &vmstate_assigned_device
, dev
);
1522 register_device_unmigratable(&dev
->dev
.qdev
,
1523 vmstate_assigned_device
.name
, dev
);
1528 deassign_device(dev
);
1530 free_assigned_device(dev
);
/*
 * assigned_exitfn - exit callback of the "pci-assign" device.
 *
 * Unregisters the vmsd, removes the device from the list it was linked into
 * through its `next` field, deassigns it with deassign_device() and releases
 * it with free_assigned_device().
 *
 * NOTE(review): the return statement (listing line 1542) is missing from
 * this view.
 */
1534 static int assigned_exitfn(struct PCIDevice
*pci_dev
)
1536 AssignedDevice
*dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
1538 vmstate_unregister(&dev
->dev
.qdev
, &vmstate_assigned_device
, dev
);
1539 QLIST_REMOVE(dev
, next
);
1540 deassign_device(dev
);
1541 free_assigned_device(dev
);
/*
 * parse_hostaddr - property parser for a host PCI address.
 *
 * Looks up the PCIHostDevice backing `prop` on `dev` and passes `str` to
 * pci_parse_host_devaddr(), which receives pointers to its seg, bus, dev and
 * func fields.
 *
 * NOTE(review): the declaration of `rc` and the code that turns `rc` into
 * the return value (listing lines 1548 and 1551-1554) are missing from this
 * view.
 */
1545 static int parse_hostaddr(DeviceState
*dev
, Property
*prop
, const char *str
)
1547 PCIHostDevice
*ptr
= qdev_get_prop_ptr(dev
, prop
);
1550 rc
= pci_parse_host_devaddr(str
, &ptr
->seg
, &ptr
->bus
, &ptr
->dev
, &ptr
->func
);
1556 static int print_hostaddr(DeviceState
*dev
, Property
*prop
, char *dest
, size_t len
)
1558 PCIHostDevice
*ptr
= qdev_get_prop_ptr(dev
, prop
);
1560 return snprintf(dest
, len
, "%02x:%02x.%x", ptr
->bus
, ptr
->dev
, ptr
->func
);
/*
 * Property type "pci-hostaddr": a PCIHostDevice-sized property that is
 * parsed by parse_hostaddr() and printed by print_hostaddr().
 * NOTE(review): listing line 1565 (an initializer between .name and .size)
 * and the closing brace are missing from this view.
 */
1563 PropertyInfo qdev_prop_hostaddr
= {
1564 .name
= "pci-hostaddr",
1566 .size
= sizeof(PCIHostDevice
),
1567 .parse
= parse_hostaddr
,
1568 .print
= print_hostaddr
,
1571 static PCIDeviceInfo assign_info
= {
1572 .qdev
.name
= "pci-assign",
1573 .qdev
.desc
= "pass through host pci devices to the guest",
1574 .qdev
.size
= sizeof(AssignedDevice
),
1575 .qdev
.reset
= reset_assigned_device
,
1576 .init
= assigned_initfn
,
1577 .exit
= assigned_exitfn
,
1578 .config_read
= assigned_dev_pci_read_config
,
1579 .config_write
= assigned_dev_pci_write_config
,
1580 .qdev
.props
= (Property
[]) {
1581 DEFINE_PROP("host", AssignedDevice
, host
, qdev_prop_hostaddr
, PCIHostDevice
),
1582 DEFINE_PROP_BIT("iommu", AssignedDevice
, features
,
1583 ASSIGNED_DEVICE_USE_IOMMU_BIT
, true),
1584 DEFINE_PROP_BIT("prefer_msi", AssignedDevice
, features
,
1585 ASSIGNED_DEVICE_PREFER_MSI_BIT
, true),
1586 DEFINE_PROP_STRING("configfd", AssignedDevice
, configfd_name
),
1587 DEFINE_PROP_END_OF_LIST(),
1591 static void assign_register_devices(void)
1593 pci_qdev_register(&assign_info
);
1596 device_init(assign_register_devices
)
1599 * Scan the assigned devices for the devices that have an option ROM, and then
1600 * load the corresponding ROM data to RAM. If an error occurs while loading an
1601 * option ROM, we just ignore that option ROM and continue with the next one.
1603 static void assigned_dev_load_option_rom(AssignedDevice
*dev
)
1605 char name
[32], rom_file
[64];
1611 /* If loading ROM from file, pci handles it */
1612 if (dev
->dev
.romfile
|| !dev
->dev
.rom_bar
)
1615 snprintf(rom_file
, sizeof(rom_file
),
1616 "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/rom",
1617 dev
->host
.seg
, dev
->host
.bus
, dev
->host
.dev
, dev
->host
.func
);
1619 if (stat(rom_file
, &st
)) {
1623 if (access(rom_file
, F_OK
)) {
1624 fprintf(stderr
, "pci-assign: Insufficient privileges for %s\n",
1629 /* Write "1" to the ROM file to enable it */
1630 fp
= fopen(rom_file
, "r+");
1635 if (fwrite(&val
, 1, 1, fp
) != 1) {
1638 fseek(fp
, 0, SEEK_SET
);
1640 snprintf(name
, sizeof(name
), "%s.rom", dev
->dev
.qdev
.info
->name
);
1641 dev
->dev
.rom_offset
= qemu_ram_alloc(&dev
->dev
.qdev
, name
, st
.st_size
);
1642 ptr
= qemu_get_ram_ptr(dev
->dev
.rom_offset
);
1643 memset(ptr
, 0xff, st
.st_size
);
1645 if (!fread(ptr
, 1, st
.st_size
, fp
)) {
1646 fprintf(stderr
, "pci-assign: Cannot read from host %s\n"
1647 "\tDevice option ROM contents are probably invalid "
1648 "(check dmesg).\n\tSkip option ROM probe with rombar=0, "
1649 "or load from file with romfile=\n", rom_file
);
1650 qemu_ram_free(dev
->dev
.rom_offset
);
1651 dev
->dev
.rom_offset
= 0;
1655 pci_register_bar(&dev
->dev
, PCI_ROM_SLOT
,
1656 st
.st_size
, 0, pci_map_option_rom
);
1658 /* Write "0" to disable ROM */
1659 fseek(fp
, 0, SEEK_SET
);
1661 if (!fwrite(&val
, 1, 1, fp
)) {
1662 DEBUG("%s\n", "Failed to disable pci-sysfs rom file");