pci: Remove pci_enable_capability_support()
[qemu-kvm/stefanha.git] / hw / device-assignment.c
bloba297cb4d5dc13007d36385089fadb7e725246cd9
1 /*
2 * Copyright (c) 2007, Neocleus Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
18 * Assign a PCI device from the host to a guest VM.
20 * Adapted for KVM by Qumranet.
22 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
23 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
24 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
25 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
26 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
28 #include <stdio.h>
29 #include <unistd.h>
30 #include <sys/io.h>
31 #include <sys/types.h>
32 #include <sys/stat.h>
33 #include "qemu-kvm.h"
34 #include "hw.h"
35 #include "pc.h"
36 #include "qemu-error.h"
37 #include "console.h"
38 #include "device-assignment.h"
39 #include "loader.h"
40 #include "monitor.h"
41 #include <pci/header.h>
/* Resource flag bits, mirrored from linux/ioport.h */
#define IORESOURCE_IO       0x00000100  /* Resource type */
#define IORESOURCE_MEM      0x00000200
#define IORESOURCE_IRQ      0x00000400
#define IORESOURCE_DMA      0x00000800
#define IORESOURCE_PREFETCH 0x00001000  /* No side effects */

/* #define DEVICE_ASSIGNMENT_DEBUG 1 */

#ifdef DEVICE_ASSIGNMENT_DEBUG
#define DEBUG(fmt, ...)                                         \
    do {                                                        \
        fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);    \
    } while (0)
#else
#define DEBUG(fmt, ...) do { } while(0)
#endif
/* Forward declarations for helpers defined later in this file. */
static void assigned_dev_load_option_rom(AssignedDevice *dev);
static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev);
65 static uint32_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
66 uint32_t addr, int len, uint32_t *val)
68 uint32_t ret = 0;
69 uint32_t offset = addr - dev_region->e_physbase;
70 int fd = dev_region->region->resource_fd;
72 if (fd >= 0) {
73 if (val) {
74 DEBUG("pwrite val=%x, len=%d, e_phys=%x, offset=%x\n",
75 *val, len, addr, offset);
76 if (pwrite(fd, val, len, offset) != len) {
77 fprintf(stderr, "%s - pwrite failed %s\n",
78 __func__, strerror(errno));
80 } else {
81 if (pread(fd, &ret, len, offset) != len) {
82 fprintf(stderr, "%s - pread failed %s\n",
83 __func__, strerror(errno));
84 ret = (1UL << (len * 8)) - 1;
86 DEBUG("pread ret=%x, len=%d, e_phys=%x, offset=%x\n",
87 ret, len, addr, offset);
89 } else {
90 uint32_t port = offset + dev_region->u.r_baseport;
92 if (val) {
93 DEBUG("out val=%x, len=%d, e_phys=%x, host=%x\n",
94 *val, len, addr, port);
95 switch (len) {
96 case 1:
97 outb(*val, port);
98 break;
99 case 2:
100 outw(*val, port);
101 break;
102 case 4:
103 outl(*val, port);
104 break;
106 } else {
107 switch (len) {
108 case 1:
109 ret = inb(port);
110 break;
111 case 2:
112 ret = inw(port);
113 break;
114 case 4:
115 ret = inl(port);
116 break;
118 DEBUG("in val=%x, len=%d, e_phys=%x, host=%x\n",
119 ret, len, addr, port);
122 return ret;
125 static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
126 uint32_t value)
128 assigned_dev_ioport_rw(opaque, addr, 1, &value);
129 return;
132 static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
133 uint32_t value)
135 assigned_dev_ioport_rw(opaque, addr, 2, &value);
136 return;
139 static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
140 uint32_t value)
142 assigned_dev_ioport_rw(opaque, addr, 4, &value);
143 return;
146 static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
148 return assigned_dev_ioport_rw(opaque, addr, 1, NULL);
151 static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr)
153 return assigned_dev_ioport_rw(opaque, addr, 2, NULL);
156 static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
158 return assigned_dev_ioport_rw(opaque, addr, 4, NULL);
161 static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
163 AssignedDevRegion *d = opaque;
164 uint8_t *in = d->u.r_virtbase + addr;
165 uint32_t r;
167 r = *in;
168 DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
170 return r;
173 static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
175 AssignedDevRegion *d = opaque;
176 uint16_t *in = d->u.r_virtbase + addr;
177 uint32_t r;
179 r = *in;
180 DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
182 return r;
185 static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
187 AssignedDevRegion *d = opaque;
188 uint32_t *in = d->u.r_virtbase + addr;
189 uint32_t r;
191 r = *in;
192 DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
194 return r;
197 static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
199 AssignedDevRegion *d = opaque;
200 uint8_t *out = d->u.r_virtbase + addr;
202 DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
203 *out = val;
206 static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
208 AssignedDevRegion *d = opaque;
209 uint16_t *out = d->u.r_virtbase + addr;
211 DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
212 *out = val;
215 static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
217 AssignedDevRegion *d = opaque;
218 uint32_t *out = d->u.r_virtbase + addr;
220 DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
221 *out = val;
224 static CPUWriteMemoryFunc * const slow_bar_write[] = {
225 &slow_bar_writeb,
226 &slow_bar_writew,
227 &slow_bar_writel
230 static CPUReadMemoryFunc * const slow_bar_read[] = {
231 &slow_bar_readb,
232 &slow_bar_readw,
233 &slow_bar_readl
236 static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
237 pcibus_t e_phys, pcibus_t e_size,
238 int type)
240 AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
241 AssignedDevRegion *region = &r_dev->v_addrs[region_num];
242 PCIRegion *real_region = &r_dev->real_device.regions[region_num];
243 int m;
245 DEBUG("%s", "slow map\n");
246 m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
247 cpu_register_physical_memory(e_phys, e_size, m);
249 /* MSI-X MMIO page */
250 if ((e_size > 0) &&
251 real_region->base_addr <= r_dev->msix_table_addr &&
252 real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
253 int offset = r_dev->msix_table_addr - real_region->base_addr;
255 cpu_register_physical_memory(e_phys + offset,
256 TARGET_PAGE_SIZE, r_dev->mmio_index);
260 static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
261 pcibus_t e_phys, pcibus_t e_size, int type)
263 AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
264 AssignedDevRegion *region = &r_dev->v_addrs[region_num];
265 PCIRegion *real_region = &r_dev->real_device.regions[region_num];
266 int ret = 0;
268 DEBUG("e_phys=%08" FMT_PCIBUS " r_virt=%p type=%d len=%08" FMT_PCIBUS " region_num=%d \n",
269 e_phys, region->u.r_virtbase, type, e_size, region_num);
271 region->e_physbase = e_phys;
272 region->e_size = e_size;
274 if (e_size > 0) {
275 cpu_register_physical_memory(e_phys, e_size, region->memory_index);
277 /* deal with MSI-X MMIO page */
278 if (real_region->base_addr <= r_dev->msix_table_addr &&
279 real_region->base_addr + real_region->size >=
280 r_dev->msix_table_addr) {
281 int offset = r_dev->msix_table_addr - real_region->base_addr;
283 cpu_register_physical_memory(e_phys + offset,
284 TARGET_PAGE_SIZE, r_dev->mmio_index);
288 if (ret != 0) {
289 fprintf(stderr, "%s: Error: create new mapping failed\n", __func__);
290 exit(1);
294 static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
295 pcibus_t addr, pcibus_t size, int type)
297 AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
298 AssignedDevRegion *region = &r_dev->v_addrs[region_num];
299 int first_map = (region->e_size == 0);
300 CPUState *env;
302 region->e_physbase = addr;
303 region->e_size = size;
305 DEBUG("e_phys=0x%" FMT_PCIBUS " r_baseport=%x type=0x%x len=%" FMT_PCIBUS " region_num=%d \n",
306 addr, region->u.r_baseport, type, size, region_num);
308 if (first_map && region->region->resource_fd < 0) {
309 struct ioperm_data *data;
311 data = qemu_mallocz(sizeof(struct ioperm_data));
312 if (data == NULL) {
313 fprintf(stderr, "%s: Out of memory\n", __func__);
314 exit(1);
317 data->start_port = region->u.r_baseport;
318 data->num = region->r_size;
319 data->turn_on = 1;
321 kvm_add_ioperm_data(data);
323 for (env = first_cpu; env; env = env->next_cpu)
324 kvm_ioperm(env, data);
327 register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,
328 (r_dev->v_addrs + region_num));
329 register_ioport_read(addr, size, 2, assigned_dev_ioport_readw,
330 (r_dev->v_addrs + region_num));
331 register_ioport_read(addr, size, 4, assigned_dev_ioport_readl,
332 (r_dev->v_addrs + region_num));
333 register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb,
334 (r_dev->v_addrs + region_num));
335 register_ioport_write(addr, size, 2, assigned_dev_ioport_writew,
336 (r_dev->v_addrs + region_num));
337 register_ioport_write(addr, size, 4, assigned_dev_ioport_writel,
338 (r_dev->v_addrs + region_num));
341 static uint32_t assigned_dev_pci_read(PCIDevice *d, int pos, int len)
343 AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);
344 uint32_t val;
345 ssize_t ret;
346 int fd = pci_dev->real_device.config_fd;
348 again:
349 ret = pread(fd, &val, len, pos);
350 if (ret != len) {
351 if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
352 goto again;
354 fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
355 __func__, ret, errno);
357 exit(1);
360 return val;
363 static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
365 return (uint8_t)assigned_dev_pci_read(d, pos, 1);
368 static uint16_t assigned_dev_pci_read_word(PCIDevice *d, int pos)
370 return (uint16_t)assigned_dev_pci_read(d, pos, 2);
373 static uint32_t assigned_dev_pci_read_long(PCIDevice *d, int pos)
375 return assigned_dev_pci_read(d, pos, 4);
378 static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap)
380 int id;
381 int max_cap = 48;
382 int pos = PCI_CAPABILITY_LIST;
383 int status;
385 status = assigned_dev_pci_read_byte(d, PCI_STATUS);
386 if ((status & PCI_STATUS_CAP_LIST) == 0)
387 return 0;
389 while (max_cap--) {
390 pos = assigned_dev_pci_read_byte(d, pos);
391 if (pos < 0x40)
392 break;
394 pos &= ~3;
395 id = assigned_dev_pci_read_byte(d, pos + PCI_CAP_LIST_ID);
397 if (id == 0xff)
398 break;
399 if (id == cap)
400 return pos;
402 pos += PCI_CAP_LIST_NEXT;
404 return 0;
407 static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
408 uint32_t val, int len)
410 int fd;
411 ssize_t ret;
412 AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);
414 DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
415 ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
416 (uint16_t) address, val, len);
418 if (address == 0x4) {
419 pci_default_write_config(d, address, val, len);
420 /* Continue to program the card */
423 if ((address >= 0x10 && address <= 0x24) || address == 0x30 ||
424 address == 0x34 || address == 0x3c || address == 0x3d ||
425 pci_access_cap_config(d, address, len)) {
426 /* used for update-mappings (BAR emulation) */
427 pci_default_write_config(d, address, val, len);
428 return;
431 DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
432 ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
433 (uint16_t) address, val, len);
435 fd = pci_dev->real_device.config_fd;
437 again:
438 ret = pwrite(fd, &val, len, address);
439 if (ret != len) {
440 if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
441 goto again;
443 fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
444 __func__, ret, errno);
446 exit(1);
450 static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
451 int len)
453 uint32_t val = 0;
454 int fd;
455 ssize_t ret;
456 AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);
458 if (address < 0x4 || (pci_dev->need_emulate_cmd && address == 0x4) ||
459 (address >= 0x10 && address <= 0x24) || address == 0x30 ||
460 address == 0x34 || address == 0x3c || address == 0x3d ||
461 pci_access_cap_config(d, address, len)) {
462 val = pci_default_read_config(d, address, len);
463 DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
464 (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
465 return val;
468 /* vga specific, remove later */
469 if (address == 0xFC)
470 goto do_log;
472 fd = pci_dev->real_device.config_fd;
474 again:
475 ret = pread(fd, &val, len, address);
476 if (ret != len) {
477 if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
478 goto again;
480 fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
481 __func__, ret, errno);
483 exit(1);
486 do_log:
487 DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
488 (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
490 if (!pci_dev->cap.available) {
491 /* kill the special capabilities */
492 if (address == 4 && len == 4)
493 val &= ~0x100000;
494 else if (address == 6)
495 val &= ~0x10;
498 return val;
501 static int assigned_dev_register_regions(PCIRegion *io_regions,
502 unsigned long regions_num,
503 AssignedDevice *pci_dev)
505 uint32_t i;
506 PCIRegion *cur_region = io_regions;
508 for (i = 0; i < regions_num; i++, cur_region++) {
509 if (!cur_region->valid)
510 continue;
511 pci_dev->v_addrs[i].num = i;
513 /* handle memory io regions */
514 if (cur_region->type & IORESOURCE_MEM) {
515 int slow_map = 0;
516 int t = cur_region->type & IORESOURCE_PREFETCH
517 ? PCI_BASE_ADDRESS_MEM_PREFETCH
518 : PCI_BASE_ADDRESS_SPACE_MEMORY;
520 if (cur_region->size & 0xFFF) {
521 fprintf(stderr, "PCI region %d at address 0x%llx "
522 "has size 0x%x, which is not a multiple of 4K. "
523 "You might experience some performance hit "
524 "due to that.\n",
525 i, (unsigned long long)cur_region->base_addr,
526 cur_region->size);
527 slow_map = 1;
530 /* map physical memory */
531 pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
532 pci_dev->v_addrs[i].u.r_virtbase = mmap(NULL, cur_region->size,
533 PROT_WRITE | PROT_READ,
534 MAP_SHARED,
535 cur_region->resource_fd,
536 (off_t)0);
538 if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
539 pci_dev->v_addrs[i].u.r_virtbase = NULL;
540 fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
541 "\n", __func__,
542 (uint32_t) (cur_region->base_addr));
543 return -1;
546 pci_dev->v_addrs[i].r_size = cur_region->size;
547 pci_dev->v_addrs[i].e_size = 0;
549 /* add offset */
550 pci_dev->v_addrs[i].u.r_virtbase +=
551 (cur_region->base_addr & 0xFFF);
554 if (!slow_map) {
555 void *virtbase = pci_dev->v_addrs[i].u.r_virtbase;
556 char name[32];
557 snprintf(name, sizeof(name), "%s.bar%d",
558 pci_dev->dev.qdev.info->name, i);
559 pci_dev->v_addrs[i].memory_index =
560 qemu_ram_alloc_from_ptr(
561 &pci_dev->dev.qdev,
562 name, cur_region->size,
563 virtbase);
564 } else
565 pci_dev->v_addrs[i].memory_index = 0;
567 pci_register_bar((PCIDevice *) pci_dev, i,
568 cur_region->size, t,
569 slow_map ? assigned_dev_iomem_map_slow
570 : assigned_dev_iomem_map);
571 continue;
572 } else {
573 /* handle port io regions */
574 uint32_t val;
575 int ret;
577 /* Test kernel support for ioport resource read/write. Old
578 * kernels return EIO. New kernels only allow 1/2/4 byte reads
579 * so should return EINVAL for a 3 byte read */
580 ret = pread(pci_dev->v_addrs[i].region->resource_fd, &val, 3, 0);
581 if (ret == 3) {
582 fprintf(stderr, "I/O port resource supports 3 byte read?!\n");
583 abort();
584 } else if (errno != EINVAL) {
585 fprintf(stderr, "Using raw in/out ioport access (sysfs - %s)\n",
586 strerror(errno));
587 close(pci_dev->v_addrs[i].region->resource_fd);
588 pci_dev->v_addrs[i].region->resource_fd = -1;
591 pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
592 pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
593 pci_dev->v_addrs[i].r_size = cur_region->size;
594 pci_dev->v_addrs[i].e_size = 0;
596 pci_register_bar((PCIDevice *) pci_dev, i,
597 cur_region->size, PCI_BASE_ADDRESS_SPACE_IO,
598 assigned_dev_ioport_map);
600 /* not relevant for port io */
601 pci_dev->v_addrs[i].memory_index = 0;
605 /* success */
606 return 0;
/*
 * Read a numeric sysfs attribute (e.g. "vendor" or "device") from
 * <devpath><idname> and store it in *val.  The file content is parsed
 * with %li, so sysfs's "0x...." hex form is accepted.
 * Returns 0 on success, -1 on open or parse failure.
 *
 * Fix: the parse-failure path previously returned without fclose(f),
 * leaking the FILE stream.
 */
static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
{
    FILE *f;
    char name[128];
    long id;
    int rc = -1;

    snprintf(name, sizeof(name), "%s%s", devpath, idname);
    f = fopen(name, "r");
    if (f == NULL) {
        fprintf(stderr, "%s: %s: %m\n", __func__, name);
        return -1;
    }
    if (fscanf(f, "%li\n", &id) == 1) {
        *val = id;
        rc = 0;
    }
    fclose(f);

    return rc;
}
/* Read the device's sysfs "vendor" attribute. 0 on success, -1 on error. */
static int get_real_vendor_id(const char *devpath, uint16_t *val)
{
    return get_real_id(devpath, "vendor", val);
}
/* Read the device's sysfs "device" attribute. 0 on success, -1 on error. */
static int get_real_device_id(const char *devpath, uint16_t *val)
{
    return get_real_id(devpath, "device", val);
}
641 static int get_real_device(AssignedDevice *pci_dev, uint16_t r_seg,
642 uint8_t r_bus, uint8_t r_dev, uint8_t r_func)
644 char dir[128], name[128];
645 int fd, r = 0, v;
646 FILE *f;
647 unsigned long long start, end, size, flags;
648 uint16_t id;
649 struct stat statbuf;
650 PCIRegion *rp;
651 PCIDevRegions *dev = &pci_dev->real_device;
653 dev->region_number = 0;
655 snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/",
656 r_seg, r_bus, r_dev, r_func);
658 snprintf(name, sizeof(name), "%sconfig", dir);
660 if (pci_dev->configfd_name && *pci_dev->configfd_name) {
661 if (qemu_isdigit(pci_dev->configfd_name[0])) {
662 dev->config_fd = strtol(pci_dev->configfd_name, NULL, 0);
663 } else {
664 dev->config_fd = monitor_get_fd(cur_mon, pci_dev->configfd_name);
665 if (dev->config_fd < 0) {
666 fprintf(stderr, "%s: (%s) unkown\n", __func__,
667 pci_dev->configfd_name);
668 return 1;
671 } else {
672 dev->config_fd = open(name, O_RDWR);
674 if (dev->config_fd == -1) {
675 fprintf(stderr, "%s: %s: %m\n", __func__, name);
676 return 1;
679 again:
680 r = read(dev->config_fd, pci_dev->dev.config,
681 pci_config_size(&pci_dev->dev));
682 if (r < 0) {
683 if (errno == EINTR || errno == EAGAIN)
684 goto again;
685 fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
688 /* Clear host resource mapping info. If we choose not to register a
689 * BAR, such as might be the case with the option ROM, we can get
690 * confusing, unwritable, residual addresses from the host here. */
691 memset(&pci_dev->dev.config[PCI_BASE_ADDRESS_0], 0, 24);
692 memset(&pci_dev->dev.config[PCI_ROM_ADDRESS], 0, 4);
694 snprintf(name, sizeof(name), "%sresource", dir);
696 f = fopen(name, "r");
697 if (f == NULL) {
698 fprintf(stderr, "%s: %s: %m\n", __func__, name);
699 return 1;
702 for (r = 0; r < PCI_ROM_SLOT; r++) {
703 if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3)
704 break;
706 rp = dev->regions + r;
707 rp->valid = 0;
708 rp->resource_fd = -1;
709 size = end - start + 1;
710 flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
711 if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
712 continue;
713 if (flags & IORESOURCE_MEM) {
714 flags &= ~IORESOURCE_IO;
715 } else {
716 flags &= ~IORESOURCE_PREFETCH;
718 snprintf(name, sizeof(name), "%sresource%d", dir, r);
719 fd = open(name, O_RDWR);
720 if (fd == -1)
721 continue;
722 rp->resource_fd = fd;
724 rp->type = flags;
725 rp->valid = 1;
726 rp->base_addr = start;
727 rp->size = size;
728 pci_dev->v_addrs[r].region = rp;
729 DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
730 r, rp->size, start, rp->type, rp->resource_fd);
733 fclose(f);
735 /* read and fill vendor ID */
736 v = get_real_vendor_id(dir, &id);
737 if (v) {
738 return 1;
740 pci_dev->dev.config[0] = id & 0xff;
741 pci_dev->dev.config[1] = (id & 0xff00) >> 8;
743 /* read and fill device ID */
744 v = get_real_device_id(dir, &id);
745 if (v) {
746 return 1;
748 pci_dev->dev.config[2] = id & 0xff;
749 pci_dev->dev.config[3] = (id & 0xff00) >> 8;
751 /* dealing with virtual function device */
752 snprintf(name, sizeof(name), "%sphysfn/", dir);
753 if (!stat(name, &statbuf))
754 pci_dev->need_emulate_cmd = 1;
755 else
756 pci_dev->need_emulate_cmd = 0;
758 dev->region_number = r;
759 return 0;
/* All currently assigned devices; walked by assigned_dev_update_irqs(). */
static QLIST_HEAD(, AssignedDevice) devs = QLIST_HEAD_INITIALIZER(devs);
#ifdef KVM_CAP_IRQ_ROUTING
/* Delete and free every GSI routing entry registered for this device. */
static void free_dev_irq_entries(AssignedDevice *dev)
{
    int i;

    for (i = 0; i < dev->irq_entries_nr; i++) {
        kvm_del_routing_entry(&dev->entry[i]);
    }
    free(dev->entry);
    dev->entry = NULL;
    dev->irq_entries_nr = 0;
}
#endif
777 static void free_assigned_device(AssignedDevice *dev)
779 if (dev) {
780 int i;
782 for (i = 0; i < dev->real_device.region_number; i++) {
783 PCIRegion *pci_region = &dev->real_device.regions[i];
784 AssignedDevRegion *region = &dev->v_addrs[i];
786 if (!pci_region->valid)
787 continue;
789 if (pci_region->type & IORESOURCE_IO) {
790 if (pci_region->resource_fd < 0) {
791 kvm_remove_ioperm_data(region->u.r_baseport,
792 region->r_size);
794 } else if (pci_region->type & IORESOURCE_MEM) {
795 if (region->u.r_virtbase) {
796 if (region->memory_index) {
797 cpu_register_physical_memory(region->e_physbase,
798 region->e_size,
799 IO_MEM_UNASSIGNED);
800 qemu_ram_unmap(region->memory_index);
802 if (munmap(region->u.r_virtbase,
803 (pci_region->size + 0xFFF) & 0xFFFFF000))
804 fprintf(stderr,
805 "Failed to unmap assigned device region: %s\n",
806 strerror(errno));
809 if (pci_region->resource_fd >= 0) {
810 close(pci_region->resource_fd);
814 if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX)
815 assigned_dev_unregister_msix_mmio(dev);
817 if (dev->real_device.config_fd >= 0) {
818 close(dev->real_device.config_fd);
821 #ifdef KVM_CAP_IRQ_ROUTING
822 free_dev_irq_entries(dev);
823 #endif
/* Pack segment/bus/devfn into the kernel's 32-bit assigned_dev_id:
 * seg in bits 31:16, bus in 15:8, devfn in 7:0. */
static uint32_t calc_assigned_dev_id(uint16_t seg, uint8_t bus, uint8_t devfn)
{
    return ((uint32_t)seg << 16) | ((uint32_t)bus << 8) | (uint32_t)devfn;
}
832 static void assign_failed_examine(AssignedDevice *dev)
834 char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
835 uint16_t vendor_id, device_id;
836 int r;
838 sprintf(dir, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
839 dev->host.seg, dev->host.bus, dev->host.dev, dev->host.func);
841 sprintf(name, "%sdriver", dir);
843 r = readlink(name, driver, sizeof(driver));
844 if ((r <= 0) || r >= sizeof(driver) || !(ns = strrchr(driver, '/'))) {
845 goto fail;
848 ns++;
850 if (get_real_vendor_id(dir, &vendor_id) ||
851 get_real_device_id(dir, &device_id)) {
852 goto fail;
855 fprintf(stderr, "*** The driver '%s' is occupying your device "
856 "%04x:%02x:%02x.%x.\n",
857 ns, dev->host.seg, dev->host.bus, dev->host.dev, dev->host.func);
858 fprintf(stderr, "***\n");
859 fprintf(stderr, "*** You can try the following commands to free it:\n");
860 fprintf(stderr, "***\n");
861 fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
862 "new_id\n", vendor_id, device_id);
863 fprintf(stderr, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
864 "%s/unbind\n",
865 dev->host.seg, dev->host.bus, dev->host.dev, dev->host.func, ns);
866 fprintf(stderr, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
867 "pci-stub/bind\n",
868 dev->host.seg, dev->host.bus, dev->host.dev, dev->host.func);
869 fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
870 "/remove_id\n", vendor_id, device_id);
871 fprintf(stderr, "***\n");
873 return;
875 fail:
876 fprintf(stderr, "Couldn't find out why.\n");
879 static int assign_device(AssignedDevice *dev)
881 struct kvm_assigned_pci_dev assigned_dev_data;
882 int r;
884 #ifdef KVM_CAP_PCI_SEGMENT
885 /* Only pass non-zero PCI segment to capable module */
886 if (!kvm_check_extension(kvm_state, KVM_CAP_PCI_SEGMENT) &&
887 dev->h_segnr) {
888 fprintf(stderr, "Can't assign device inside non-zero PCI segment "
889 "as this KVM module doesn't support it.\n");
890 return -ENODEV;
892 #endif
894 memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
895 assigned_dev_data.assigned_dev_id =
896 calc_assigned_dev_id(dev->h_segnr, dev->h_busnr, dev->h_devfn);
897 #ifdef KVM_CAP_PCI_SEGMENT
898 assigned_dev_data.segnr = dev->h_segnr;
899 #endif
900 assigned_dev_data.busnr = dev->h_busnr;
901 assigned_dev_data.devfn = dev->h_devfn;
903 #ifdef KVM_CAP_IOMMU
904 /* We always enable the IOMMU unless disabled on the command line */
905 if (dev->features & ASSIGNED_DEVICE_USE_IOMMU_MASK) {
906 if (!kvm_check_extension(kvm_state, KVM_CAP_IOMMU)) {
907 fprintf(stderr, "No IOMMU found. Unable to assign device \"%s\"\n",
908 dev->dev.qdev.id);
909 return -ENODEV;
911 assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
913 #else
914 dev->features &= ~ASSIGNED_DEVICE_USE_IOMMU_MASK;
915 #endif
916 if (!(dev->features & ASSIGNED_DEVICE_USE_IOMMU_MASK)) {
917 fprintf(stderr,
918 "WARNING: Assigning a device without IOMMU protection can "
919 "cause host memory corruption if the device issues DMA write "
920 "requests!\n");
923 r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
924 if (r < 0) {
925 fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
926 dev->dev.qdev.id, strerror(-r));
928 switch (r) {
929 case -EBUSY:
930 assign_failed_examine(dev);
931 break;
932 default:
933 break;
936 return r;
939 static int assign_irq(AssignedDevice *dev)
941 struct kvm_assigned_irq assigned_irq_data;
942 int irq, r = 0;
944 /* Interrupt PIN 0 means don't use INTx */
945 if (assigned_dev_pci_read_byte(&dev->dev, PCI_INTERRUPT_PIN) == 0)
946 return 0;
948 irq = pci_map_irq(&dev->dev, dev->intpin);
949 irq = piix_get_irq(irq);
951 #ifdef TARGET_IA64
952 irq = ipf_map_irq(&dev->dev, irq);
953 #endif
955 if (dev->girq == irq)
956 return r;
958 memset(&assigned_irq_data, 0, sizeof(assigned_irq_data));
959 assigned_irq_data.assigned_dev_id =
960 calc_assigned_dev_id(dev->h_segnr, dev->h_busnr, dev->h_devfn);
961 assigned_irq_data.guest_irq = irq;
962 assigned_irq_data.host_irq = dev->real_device.irq;
963 #ifdef KVM_CAP_ASSIGN_DEV_IRQ
964 if (dev->irq_requested_type) {
965 assigned_irq_data.flags = dev->irq_requested_type;
966 r = kvm_deassign_irq(kvm_context, &assigned_irq_data);
967 /* -ENXIO means no assigned irq */
968 if (r && r != -ENXIO)
969 perror("assign_irq: deassign");
972 assigned_irq_data.flags = KVM_DEV_IRQ_GUEST_INTX;
973 if (dev->features & ASSIGNED_DEVICE_PREFER_MSI_MASK &&
974 dev->cap.available & ASSIGNED_DEVICE_CAP_MSI)
975 assigned_irq_data.flags |= KVM_DEV_IRQ_HOST_MSI;
976 else
977 assigned_irq_data.flags |= KVM_DEV_IRQ_HOST_INTX;
978 #endif
980 r = kvm_assign_irq(kvm_context, &assigned_irq_data);
981 if (r < 0) {
982 fprintf(stderr, "Failed to assign irq for \"%s\": %s\n",
983 dev->dev.qdev.id, strerror(-r));
984 fprintf(stderr, "Perhaps you are assigning a device "
985 "that shares an IRQ with another device?\n");
986 return r;
989 dev->girq = irq;
990 dev->irq_requested_type = assigned_irq_data.flags;
991 return r;
994 static void deassign_device(AssignedDevice *dev)
996 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT
997 struct kvm_assigned_pci_dev assigned_dev_data;
998 int r;
1000 memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
1001 assigned_dev_data.assigned_dev_id =
1002 calc_assigned_dev_id(dev->h_segnr, dev->h_busnr, dev->h_devfn);
1004 r = kvm_deassign_pci_device(kvm_context, &assigned_dev_data);
1005 if (r < 0)
1006 fprintf(stderr, "Failed to deassign device \"%s\" : %s\n",
1007 dev->dev.qdev.id, strerror(-r));
1008 #endif
1011 #if 0
1012 AssignedDevInfo *get_assigned_device(int pcibus, int slot)
1014 AssignedDevice *assigned_dev = NULL;
1015 AssignedDevInfo *adev = NULL;
1017 QLIST_FOREACH(adev, &adev_head, next) {
1018 assigned_dev = adev->assigned_dev;
1019 if (pci_bus_num(assigned_dev->dev.bus) == pcibus &&
1020 PCI_SLOT(assigned_dev->dev.devfn) == slot)
1021 return adev;
1024 return NULL;
1026 #endif
1028 /* The pci config space got updated. Check if irq numbers have changed
1029 * for our devices
1031 void assigned_dev_update_irqs(void)
1033 AssignedDevice *dev, *next;
1034 int r;
1036 dev = QLIST_FIRST(&devs);
1037 while (dev) {
1038 next = QLIST_NEXT(dev, next);
1039 r = assign_irq(dev);
1040 if (r < 0)
1041 qdev_unplug(&dev->dev.qdev);
1042 dev = next;
1046 #ifdef KVM_CAP_IRQ_ROUTING
1048 #ifdef KVM_CAP_DEVICE_MSI
1049 static void assigned_dev_update_msi(PCIDevice *pci_dev, unsigned int ctrl_pos)
1051 struct kvm_assigned_irq assigned_irq_data;
1052 AssignedDevice *assigned_dev = container_of(pci_dev, AssignedDevice, dev);
1053 uint8_t ctrl_byte = pci_dev->config[ctrl_pos];
1054 int r;
1056 memset(&assigned_irq_data, 0, sizeof assigned_irq_data);
1057 assigned_irq_data.assigned_dev_id =
1058 calc_assigned_dev_id(assigned_dev->h_segnr, assigned_dev->h_busnr,
1059 (uint8_t)assigned_dev->h_devfn);
1061 /* Some guests gratuitously disable MSI even if they're not using it,
1062 * try to catch this by only deassigning irqs if the guest is using
1063 * MSI or intends to start. */
1064 if ((assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MSI) ||
1065 (ctrl_byte & PCI_MSI_FLAGS_ENABLE)) {
1067 assigned_irq_data.flags = assigned_dev->irq_requested_type;
1068 free_dev_irq_entries(assigned_dev);
1069 r = kvm_deassign_irq(kvm_context, &assigned_irq_data);
1070 /* -ENXIO means no assigned irq */
1071 if (r && r != -ENXIO)
1072 perror("assigned_dev_update_msi: deassign irq");
1074 assigned_dev->irq_requested_type = 0;
1077 if (ctrl_byte & PCI_MSI_FLAGS_ENABLE) {
1078 assigned_dev->entry = calloc(1, sizeof(struct kvm_irq_routing_entry));
1079 if (!assigned_dev->entry) {
1080 perror("assigned_dev_update_msi: ");
1081 return;
1083 assigned_dev->entry->u.msi.address_lo =
1084 *(uint32_t *)(pci_dev->config + pci_dev->cap.start +
1085 PCI_MSI_ADDRESS_LO);
1086 assigned_dev->entry->u.msi.address_hi = 0;
1087 assigned_dev->entry->u.msi.data = *(uint16_t *)(pci_dev->config +
1088 pci_dev->cap.start + PCI_MSI_DATA_32);
1089 assigned_dev->entry->type = KVM_IRQ_ROUTING_MSI;
1090 r = kvm_get_irq_route_gsi();
1091 if (r < 0) {
1092 perror("assigned_dev_update_msi: kvm_get_irq_route_gsi");
1093 return;
1095 assigned_dev->entry->gsi = r;
1097 kvm_add_routing_entry(assigned_dev->entry);
1098 if (kvm_commit_irq_routes() < 0) {
1099 perror("assigned_dev_update_msi: kvm_commit_irq_routes");
1100 assigned_dev->cap.state &= ~ASSIGNED_DEVICE_MSI_ENABLED;
1101 return;
1103 assigned_dev->irq_entries_nr = 1;
1105 assigned_irq_data.guest_irq = assigned_dev->entry->gsi;
1106 assigned_irq_data.flags = KVM_DEV_IRQ_HOST_MSI | KVM_DEV_IRQ_GUEST_MSI;
1107 if (kvm_assign_irq(kvm_context, &assigned_irq_data) < 0)
1108 perror("assigned_dev_enable_msi: assign irq");
1110 assigned_dev->irq_requested_type = assigned_irq_data.flags;
1113 #endif
1115 #ifdef KVM_CAP_DEVICE_MSIX
1116 static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
1118 AssignedDevice *adev = container_of(pci_dev, AssignedDevice, dev);
1119 uint16_t entries_nr = 0, entries_max_nr;
1120 int pos = 0, i, r = 0;
1121 uint32_t msg_addr, msg_upper_addr, msg_data, msg_ctrl;
1122 struct kvm_assigned_msix_nr msix_nr;
1123 struct kvm_assigned_msix_entry msix_entry;
1124 void *va = adev->msix_table_page;
1126 if (adev->cap.available & ASSIGNED_DEVICE_CAP_MSI)
1127 pos = pci_dev->cap.start + PCI_CAPABILITY_CONFIG_MSI_LENGTH;
1128 else
1129 pos = pci_dev->cap.start;
1131 entries_max_nr = *(uint16_t *)(pci_dev->config + pos + 2);
1132 entries_max_nr &= PCI_MSIX_TABSIZE;
1133 entries_max_nr += 1;
1135 /* Get the usable entry number for allocating */
1136 for (i = 0; i < entries_max_nr; i++) {
1137 memcpy(&msg_ctrl, va + i * 16 + 12, 4);
1138 memcpy(&msg_data, va + i * 16 + 8, 4);
1139 /* Ignore unused entry even it's unmasked */
1140 if (msg_data == 0)
1141 continue;
1142 entries_nr ++;
1145 if (entries_nr == 0) {
1146 fprintf(stderr, "MSI-X entry number is zero!\n");
1147 return -EINVAL;
1149 msix_nr.assigned_dev_id = calc_assigned_dev_id(adev->h_segnr, adev->h_busnr,
1150 (uint8_t)adev->h_devfn);
1151 msix_nr.entry_nr = entries_nr;
1152 r = kvm_assign_set_msix_nr(kvm_context, &msix_nr);
1153 if (r != 0) {
1154 fprintf(stderr, "fail to set MSI-X entry number for MSIX! %s\n",
1155 strerror(-r));
1156 return r;
1159 free_dev_irq_entries(adev);
1160 adev->irq_entries_nr = entries_nr;
1161 adev->entry = calloc(entries_nr, sizeof(struct kvm_irq_routing_entry));
1162 if (!adev->entry) {
1163 perror("assigned_dev_update_msix_mmio: ");
1164 return -errno;
1167 msix_entry.assigned_dev_id = msix_nr.assigned_dev_id;
1168 entries_nr = 0;
1169 for (i = 0; i < entries_max_nr; i++) {
1170 if (entries_nr >= msix_nr.entry_nr)
1171 break;
1172 memcpy(&msg_ctrl, va + i * 16 + 12, 4);
1173 memcpy(&msg_data, va + i * 16 + 8, 4);
1174 if (msg_data == 0)
1175 continue;
1177 memcpy(&msg_addr, va + i * 16, 4);
1178 memcpy(&msg_upper_addr, va + i * 16 + 4, 4);
1180 r = kvm_get_irq_route_gsi();
1181 if (r < 0)
1182 return r;
1184 adev->entry[entries_nr].gsi = r;
1185 adev->entry[entries_nr].type = KVM_IRQ_ROUTING_MSI;
1186 adev->entry[entries_nr].flags = 0;
1187 adev->entry[entries_nr].u.msi.address_lo = msg_addr;
1188 adev->entry[entries_nr].u.msi.address_hi = msg_upper_addr;
1189 adev->entry[entries_nr].u.msi.data = msg_data;
1190 DEBUG("MSI-X data 0x%x, MSI-X addr_lo 0x%x\n!", msg_data, msg_addr);
1191 kvm_add_routing_entry(&adev->entry[entries_nr]);
1193 msix_entry.gsi = adev->entry[entries_nr].gsi;
1194 msix_entry.entry = i;
1195 r = kvm_assign_set_msix_entry(kvm_context, &msix_entry);
1196 if (r) {
1197 fprintf(stderr, "fail to set MSI-X entry! %s\n", strerror(-r));
1198 break;
1200 DEBUG("MSI-X entry gsi 0x%x, entry %d\n!",
1201 msix_entry.gsi, msix_entry.entry);
1202 entries_nr ++;
1205 if (r == 0 && kvm_commit_irq_routes() < 0) {
1206 perror("assigned_dev_update_msix_mmio: kvm_commit_irq_routes");
1207 return -EINVAL;
1210 return r;
1213 static void assigned_dev_update_msix(PCIDevice *pci_dev, unsigned int ctrl_pos)
1215 struct kvm_assigned_irq assigned_irq_data;
1216 AssignedDevice *assigned_dev = container_of(pci_dev, AssignedDevice, dev);
1217 uint16_t *ctrl_word = (uint16_t *)(pci_dev->config + ctrl_pos);
1218 int r;
1220 memset(&assigned_irq_data, 0, sizeof assigned_irq_data);
1221 assigned_irq_data.assigned_dev_id =
1222 calc_assigned_dev_id(assigned_dev->h_segnr, assigned_dev->h_busnr,
1223 (uint8_t)assigned_dev->h_devfn);
1225 /* Some guests gratuitously disable MSIX even if they're not using it,
1226 * try to catch this by only deassigning irqs if the guest is using
1227 * MSIX or intends to start. */
1228 if ((assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MSIX) ||
1229 (*ctrl_word & PCI_MSIX_ENABLE)) {
1231 assigned_irq_data.flags = assigned_dev->irq_requested_type;
1232 free_dev_irq_entries(assigned_dev);
1233 r = kvm_deassign_irq(kvm_context, &assigned_irq_data);
1234 /* -ENXIO means no assigned irq */
1235 if (r && r != -ENXIO)
1236 perror("assigned_dev_update_msix: deassign irq");
1238 assigned_dev->irq_requested_type = 0;
1241 if (*ctrl_word & PCI_MSIX_ENABLE) {
1242 assigned_irq_data.flags = KVM_DEV_IRQ_HOST_MSIX |
1243 KVM_DEV_IRQ_GUEST_MSIX;
1245 if (assigned_dev_update_msix_mmio(pci_dev) < 0) {
1246 perror("assigned_dev_update_msix_mmio");
1247 return;
1249 if (kvm_assign_irq(kvm_context, &assigned_irq_data) < 0) {
1250 perror("assigned_dev_enable_msix: assign irq");
1251 return;
1253 assigned_dev->irq_requested_type = assigned_irq_data.flags;
1256 #endif
1257 #endif
1259 static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev, uint32_t address,
1260 uint32_t val, int len)
1262 AssignedDevice *assigned_dev = container_of(pci_dev, AssignedDevice, dev);
1263 unsigned int pos = pci_dev->cap.start, ctrl_pos;
1265 pci_default_cap_write_config(pci_dev, address, val, len);
1266 #ifdef KVM_CAP_IRQ_ROUTING
1267 #ifdef KVM_CAP_DEVICE_MSI
1268 if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
1269 ctrl_pos = pos + PCI_MSI_FLAGS;
1270 if (address <= ctrl_pos && address + len > ctrl_pos)
1271 assigned_dev_update_msi(pci_dev, ctrl_pos);
1272 pos += PCI_CAPABILITY_CONFIG_MSI_LENGTH;
1274 #endif
1275 #ifdef KVM_CAP_DEVICE_MSIX
1276 if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
1277 ctrl_pos = pos + 3;
1278 if (address <= ctrl_pos && address + len > ctrl_pos) {
1279 ctrl_pos--; /* control is word long */
1280 assigned_dev_update_msix(pci_dev, ctrl_pos);
1282 pos += PCI_CAPABILITY_CONFIG_MSIX_LENGTH;
1284 #endif
1285 #endif
1286 return;
1289 static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
1291 AssignedDevice *dev = container_of(pci_dev, AssignedDevice, dev);
1292 PCIRegion *pci_region = dev->real_device.regions;
1293 int next_cap_pt = 0;
1295 pci_dev->cap.supported = 1;
1296 pci_dev->cap.start = PCI_CAPABILITY_CONFIG_DEFAULT_START_ADDR;
1297 pci_dev->cap.length = 0;
1298 pci_dev->config[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
1299 pci_dev->config[PCI_CAPABILITY_LIST] = pci_dev->cap.start;
1301 #ifdef KVM_CAP_IRQ_ROUTING
1302 #ifdef KVM_CAP_DEVICE_MSI
1303 /* Expose MSI capability
1304 * MSI capability is the 1st capability in capability config */
1305 if (pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSI)) {
1306 dev->cap.available |= ASSIGNED_DEVICE_CAP_MSI;
1307 memset(&pci_dev->config[pci_dev->cap.start + pci_dev->cap.length],
1308 0, PCI_CAPABILITY_CONFIG_MSI_LENGTH);
1309 pci_dev->config[pci_dev->cap.start + pci_dev->cap.length] =
1310 PCI_CAP_ID_MSI;
1311 pci_dev->cap.length += PCI_CAPABILITY_CONFIG_MSI_LENGTH;
1312 next_cap_pt = 1;
1314 #endif
1315 #ifdef KVM_CAP_DEVICE_MSIX
1316 /* Expose MSI-X capability */
1317 if (pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSIX)) {
1318 int pos, entry_nr, bar_nr;
1319 uint32_t msix_table_entry;
1320 dev->cap.available |= ASSIGNED_DEVICE_CAP_MSIX;
1321 memset(&pci_dev->config[pci_dev->cap.start + pci_dev->cap.length],
1322 0, PCI_CAPABILITY_CONFIG_MSIX_LENGTH);
1323 pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSIX);
1324 entry_nr = assigned_dev_pci_read_word(pci_dev, pos + 2) &
1325 PCI_MSIX_TABSIZE;
1326 pci_dev->config[pci_dev->cap.start + pci_dev->cap.length] = 0x11;
1327 *(uint16_t *)(pci_dev->config + pci_dev->cap.start +
1328 pci_dev->cap.length + 2) = entry_nr;
1329 msix_table_entry = assigned_dev_pci_read_long(pci_dev,
1330 pos + PCI_MSIX_TABLE);
1331 *(uint32_t *)(pci_dev->config + pci_dev->cap.start +
1332 pci_dev->cap.length + PCI_MSIX_TABLE) = msix_table_entry;
1333 *(uint32_t *)(pci_dev->config + pci_dev->cap.start +
1334 pci_dev->cap.length + PCI_MSIX_PBA) =
1335 assigned_dev_pci_read_long(pci_dev, pos + PCI_MSIX_PBA);
1336 bar_nr = msix_table_entry & PCI_MSIX_BIR;
1337 msix_table_entry &= ~PCI_MSIX_BIR;
1338 dev->msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry;
1339 if (next_cap_pt != 0) {
1340 pci_dev->config[pci_dev->cap.start + next_cap_pt] =
1341 pci_dev->cap.start + pci_dev->cap.length;
1342 next_cap_pt += PCI_CAPABILITY_CONFIG_MSI_LENGTH;
1343 } else
1344 next_cap_pt = 1;
1345 pci_dev->cap.length += PCI_CAPABILITY_CONFIG_MSIX_LENGTH;
1347 #endif
1348 #endif
1350 return 0;
1353 static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr)
1355 AssignedDevice *adev = opaque;
1356 unsigned int offset = addr & 0xfff;
1357 void *page = adev->msix_table_page;
1358 uint32_t val = 0;
1360 memcpy(&val, (void *)((char *)page + offset), 4);
1362 return val;
1365 static uint32_t msix_mmio_readb(void *opaque, target_phys_addr_t addr)
1367 return ((msix_mmio_readl(opaque, addr & ~3)) >>
1368 (8 * (addr & 3))) & 0xff;
1371 static uint32_t msix_mmio_readw(void *opaque, target_phys_addr_t addr)
1373 return ((msix_mmio_readl(opaque, addr & ~3)) >>
1374 (8 * (addr & 3))) & 0xffff;
1377 static void msix_mmio_writel(void *opaque,
1378 target_phys_addr_t addr, uint32_t val)
1380 AssignedDevice *adev = opaque;
1381 unsigned int offset = addr & 0xfff;
1382 void *page = adev->msix_table_page;
1384 DEBUG("write to MSI-X entry table mmio offset 0x%lx, val 0x%x\n",
1385 addr, val);
1386 memcpy((void *)((char *)page + offset), &val, 4);
1389 static void msix_mmio_writew(void *opaque,
1390 target_phys_addr_t addr, uint32_t val)
1392 msix_mmio_writel(opaque, addr & ~3,
1393 (val & 0xffff) << (8*(addr & 3)));
1396 static void msix_mmio_writeb(void *opaque,
1397 target_phys_addr_t addr, uint32_t val)
1399 msix_mmio_writel(opaque, addr & ~3,
1400 (val & 0xff) << (8*(addr & 3)));
1403 static CPUWriteMemoryFunc *msix_mmio_write[] = {
1404 msix_mmio_writeb, msix_mmio_writew, msix_mmio_writel
1407 static CPUReadMemoryFunc *msix_mmio_read[] = {
1408 msix_mmio_readb, msix_mmio_readw, msix_mmio_readl
1411 static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
1413 dev->msix_table_page = mmap(NULL, 0x1000,
1414 PROT_READ|PROT_WRITE,
1415 MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
1416 if (dev->msix_table_page == MAP_FAILED) {
1417 fprintf(stderr, "fail allocate msix_table_page! %s\n",
1418 strerror(errno));
1419 return -EFAULT;
1421 memset(dev->msix_table_page, 0, 0x1000);
1422 dev->mmio_index = cpu_register_io_memory(
1423 msix_mmio_read, msix_mmio_write, dev);
1424 return 0;
1427 static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev)
1429 if (!dev->msix_table_page)
1430 return;
1432 cpu_unregister_io_memory(dev->mmio_index);
1433 dev->mmio_index = 0;
1435 if (munmap(dev->msix_table_page, 0x1000) == -1) {
1436 fprintf(stderr, "error unmapping msix_table_page! %s\n",
1437 strerror(errno));
1439 dev->msix_table_page = NULL;
1442 static const VMStateDescription vmstate_assigned_device = {
1443 .name = "pci-assign"
1446 static void reset_assigned_device(DeviceState *dev)
1448 PCIDevice *d = DO_UPCAST(PCIDevice, qdev, dev);
1451 * When a 0 is written to the command register, the device is logically
1452 * disconnected from the PCI bus. This avoids further DMA transfers.
1454 assigned_dev_pci_write_config(d, PCI_COMMAND, 0, 2);
1457 static int assigned_initfn(struct PCIDevice *pci_dev)
1459 AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
1460 uint8_t e_device, e_intx;
1461 int r;
1463 if (!kvm_enabled()) {
1464 error_report("pci-assign: error: requires KVM support");
1465 return -1;
1468 if (!dev->host.seg && !dev->host.bus && !dev->host.dev && !dev->host.func) {
1469 error_report("pci-assign: error: no host device specified");
1470 return -1;
1473 if (get_real_device(dev, dev->host.seg, dev->host.bus,
1474 dev->host.dev, dev->host.func)) {
1475 error_report("pci-assign: Error: Couldn't get real device (%s)!",
1476 dev->dev.qdev.id);
1477 goto out;
1480 /* handle real device's MMIO/PIO BARs */
1481 if (assigned_dev_register_regions(dev->real_device.regions,
1482 dev->real_device.region_number,
1483 dev))
1484 goto out;
1486 /* handle interrupt routing */
1487 e_device = (dev->dev.devfn >> 3) & 0x1f;
1488 e_intx = dev->dev.config[0x3d] - 1;
1489 dev->intpin = e_intx;
1490 dev->run = 0;
1491 dev->girq = -1;
1492 dev->h_segnr = dev->host.seg;
1493 dev->h_busnr = dev->host.bus;
1494 dev->h_devfn = PCI_DEVFN(dev->host.dev, dev->host.func);
1496 pci_register_capability_handlers(pci_dev, NULL,
1497 assigned_device_pci_cap_write_config);
1499 if (assigned_device_pci_cap_init(pci_dev) < 0)
1500 goto out;
1502 /* assign device to guest */
1503 r = assign_device(dev);
1504 if (r < 0)
1505 goto out;
1507 /* assign irq for the device */
1508 r = assign_irq(dev);
1509 if (r < 0)
1510 goto assigned_out;
1512 /* intercept MSI-X entry page in the MMIO */
1513 if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX)
1514 if (assigned_dev_register_msix_mmio(dev))
1515 goto assigned_out;
1517 assigned_dev_load_option_rom(dev);
1518 QLIST_INSERT_HEAD(&devs, dev, next);
1520 /* Register a vmsd so that we can mark it unmigratable. */
1521 vmstate_register(&dev->dev.qdev, 0, &vmstate_assigned_device, dev);
1522 register_device_unmigratable(&dev->dev.qdev,
1523 vmstate_assigned_device.name, dev);
1525 return 0;
1527 assigned_out:
1528 deassign_device(dev);
1529 out:
1530 free_assigned_device(dev);
1531 return -1;
1534 static int assigned_exitfn(struct PCIDevice *pci_dev)
1536 AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
1538 vmstate_unregister(&dev->dev.qdev, &vmstate_assigned_device, dev);
1539 QLIST_REMOVE(dev, next);
1540 deassign_device(dev);
1541 free_assigned_device(dev);
1542 return 0;
1545 static int parse_hostaddr(DeviceState *dev, Property *prop, const char *str)
1547 PCIHostDevice *ptr = qdev_get_prop_ptr(dev, prop);
1548 int rc;
1550 rc = pci_parse_host_devaddr(str, &ptr->seg, &ptr->bus, &ptr->dev, &ptr->func);
1551 if (rc != 0)
1552 return -1;
1553 return 0;
1556 static int print_hostaddr(DeviceState *dev, Property *prop, char *dest, size_t len)
1558 PCIHostDevice *ptr = qdev_get_prop_ptr(dev, prop);
1560 return snprintf(dest, len, "%02x:%02x.%x", ptr->bus, ptr->dev, ptr->func);
1563 PropertyInfo qdev_prop_hostaddr = {
1564 .name = "pci-hostaddr",
1565 .type = -1,
1566 .size = sizeof(PCIHostDevice),
1567 .parse = parse_hostaddr,
1568 .print = print_hostaddr,
1571 static PCIDeviceInfo assign_info = {
1572 .qdev.name = "pci-assign",
1573 .qdev.desc = "pass through host pci devices to the guest",
1574 .qdev.size = sizeof(AssignedDevice),
1575 .qdev.reset = reset_assigned_device,
1576 .init = assigned_initfn,
1577 .exit = assigned_exitfn,
1578 .config_read = assigned_dev_pci_read_config,
1579 .config_write = assigned_dev_pci_write_config,
1580 .qdev.props = (Property[]) {
1581 DEFINE_PROP("host", AssignedDevice, host, qdev_prop_hostaddr, PCIHostDevice),
1582 DEFINE_PROP_BIT("iommu", AssignedDevice, features,
1583 ASSIGNED_DEVICE_USE_IOMMU_BIT, true),
1584 DEFINE_PROP_BIT("prefer_msi", AssignedDevice, features,
1585 ASSIGNED_DEVICE_PREFER_MSI_BIT, true),
1586 DEFINE_PROP_STRING("configfd", AssignedDevice, configfd_name),
1587 DEFINE_PROP_END_OF_LIST(),
1591 static void assign_register_devices(void)
1593 pci_qdev_register(&assign_info);
1596 device_init(assign_register_devices)
1599 * Scan the assigned devices for the devices that have an option ROM, and then
1600 * load the corresponding ROM data to RAM. If an error occurs while loading an
1601 * option ROM, we just ignore that option ROM and continue with the next one.
1603 static void assigned_dev_load_option_rom(AssignedDevice *dev)
1605 char name[32], rom_file[64];
1606 FILE *fp;
1607 uint8_t val;
1608 struct stat st;
1609 void *ptr;
1611 /* If loading ROM from file, pci handles it */
1612 if (dev->dev.romfile || !dev->dev.rom_bar)
1613 return;
1615 snprintf(rom_file, sizeof(rom_file),
1616 "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/rom",
1617 dev->host.seg, dev->host.bus, dev->host.dev, dev->host.func);
1619 if (stat(rom_file, &st)) {
1620 return;
1623 if (access(rom_file, F_OK)) {
1624 fprintf(stderr, "pci-assign: Insufficient privileges for %s\n",
1625 rom_file);
1626 return;
1629 /* Write "1" to the ROM file to enable it */
1630 fp = fopen(rom_file, "r+");
1631 if (fp == NULL) {
1632 return;
1634 val = 1;
1635 if (fwrite(&val, 1, 1, fp) != 1) {
1636 goto close_rom;
1638 fseek(fp, 0, SEEK_SET);
1640 snprintf(name, sizeof(name), "%s.rom", dev->dev.qdev.info->name);
1641 dev->dev.rom_offset = qemu_ram_alloc(&dev->dev.qdev, name, st.st_size);
1642 ptr = qemu_get_ram_ptr(dev->dev.rom_offset);
1643 memset(ptr, 0xff, st.st_size);
1645 if (!fread(ptr, 1, st.st_size, fp)) {
1646 fprintf(stderr, "pci-assign: Cannot read from host %s\n"
1647 "\tDevice option ROM contents are probably invalid "
1648 "(check dmesg).\n\tSkip option ROM probe with rombar=0, "
1649 "or load from file with romfile=\n", rom_file);
1650 qemu_ram_free(dev->dev.rom_offset);
1651 dev->dev.rom_offset = 0;
1652 goto close_rom;
1655 pci_register_bar(&dev->dev, PCI_ROM_SLOT,
1656 st.st_size, 0, pci_map_option_rom);
1657 close_rom:
1658 /* Write "0" to disable ROM */
1659 fseek(fp, 0, SEEK_SET);
1660 val = 0;
1661 if (!fwrite(&val, 1, 1, fp)) {
1662 DEBUG("%s\n", "Failed to disable pci-sysfs rom file");
1664 fclose(fp);