4 * Copyright 2016 - 2018 Red Hat, Inc.
7 * Fam Zheng <famz@redhat.com>
9 * This work is licensed under the terms of the GNU GPL, version 2 or later.
10 * See the COPYING file in the top-level directory.
13 #include "qemu/osdep.h"
14 #include <sys/ioctl.h>
15 #include <linux/vfio.h>
16 #include "qapi/error.h"
17 #include "exec/ramlist.h"
18 #include "exec/cpu-common.h"
20 #include "qemu/error-report.h"
21 #include "standard-headers/linux/pci_regs.h"
22 #include "qemu/event_notifier.h"
23 #include "qemu/vfio-helpers.h"
26 #define QEMU_VFIO_DEBUG 0
28 #define QEMU_VFIO_IOVA_MIN 0x10000ULL
29 /* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
30 * we can use a runtime limit; alternatively it's also possible to do platform
31 * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
33 #define QEMU_VFIO_IOVA_MAX (1ULL << 39)
36 /* Page aligned addr. */
42 struct QEMUVFIOState
{
45 /* These fields are protected by BQL */
49 RAMBlockNotifier ram_notifier
;
50 struct vfio_region_info config_region_info
, bar_region_info
[6];
52 /* These fields are protected by @lock */
53 /* VFIO's IO virtual address space is managed by splitting into a few
56 * --------------- <= 0
58 * |-------------| <= QEMU_VFIO_IOVA_MIN
62 * |-------------| <= low_water_mark
66 * |-------------| <= high_water_mark
70 * |-------------| <= QEMU_VFIO_IOVA_MAX
75 * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
77 * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
78 * [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be
79 * reclaimed - low_water_mark never shrinks;
81 * - IOVAs in range [low_water_mark, high_water_mark) are free;
83 * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
84 * mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
85 * is recycled. The caller should make sure I/O's depending on these
86 * mappings are completed before calling.
88 uint64_t low_water_mark
;
89 uint64_t high_water_mark
;
90 IOVAMapping
*mappings
;
95 * Find group file by PCI device address as specified by @device, and return the
96 * path. The returned string is owned by caller and should be g_free'ed later.
98 static char *sysfs_find_group_file(const char *device
, Error
**errp
)
105 sysfs_link
= g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device
);
106 sysfs_group
= g_malloc0(PATH_MAX
);
107 if (readlink(sysfs_link
, sysfs_group
, PATH_MAX
- 1) == -1) {
108 error_setg_errno(errp
, errno
, "Failed to find iommu group sysfs path");
111 p
= strrchr(sysfs_group
, '/');
113 error_setg(errp
, "Failed to find iommu group number");
117 path
= g_strdup_printf("/dev/vfio/%s", p
+ 1);
124 static inline void assert_bar_index_valid(QEMUVFIOState
*s
, int index
)
126 assert(index
>= 0 && index
< ARRAY_SIZE(s
->bar_region_info
));
129 static int qemu_vfio_pci_init_bar(QEMUVFIOState
*s
, int index
, Error
**errp
)
131 assert_bar_index_valid(s
, index
);
132 s
->bar_region_info
[index
] = (struct vfio_region_info
) {
133 .index
= VFIO_PCI_BAR0_REGION_INDEX
+ index
,
134 .argsz
= sizeof(struct vfio_region_info
),
136 if (ioctl(s
->device
, VFIO_DEVICE_GET_REGION_INFO
, &s
->bar_region_info
[index
])) {
137 error_setg_errno(errp
, errno
, "Failed to get BAR region info");
145 * Map a PCI bar area.
147 void *qemu_vfio_pci_map_bar(QEMUVFIOState
*s
, int index
,
148 uint64_t offset
, uint64_t size
,
152 assert_bar_index_valid(s
, index
);
153 p
= mmap(NULL
, MIN(size
, s
->bar_region_info
[index
].size
- offset
),
154 PROT_READ
| PROT_WRITE
, MAP_SHARED
,
155 s
->device
, s
->bar_region_info
[index
].offset
+ offset
);
156 if (p
== MAP_FAILED
) {
157 error_setg_errno(errp
, errno
, "Failed to map BAR region");
164 * Unmap a PCI bar area.
166 void qemu_vfio_pci_unmap_bar(QEMUVFIOState
*s
, int index
, void *bar
,
167 uint64_t offset
, uint64_t size
)
170 munmap(bar
, MIN(size
, s
->bar_region_info
[index
].size
- offset
));
175 * Initialize device IRQ with @irq_type and register an event notifier.
177 int qemu_vfio_pci_init_irq(QEMUVFIOState
*s
, EventNotifier
*e
,
178 int irq_type
, Error
**errp
)
181 struct vfio_irq_set
*irq_set
;
183 struct vfio_irq_info irq_info
= { .argsz
= sizeof(irq_info
) };
185 irq_info
.index
= irq_type
;
186 if (ioctl(s
->device
, VFIO_DEVICE_GET_IRQ_INFO
, &irq_info
)) {
187 error_setg_errno(errp
, errno
, "Failed to get device interrupt info");
190 if (!(irq_info
.flags
& VFIO_IRQ_INFO_EVENTFD
)) {
191 error_setg(errp
, "Device interrupt doesn't support eventfd");
195 irq_set_size
= sizeof(*irq_set
) + sizeof(int);
196 irq_set
= g_malloc0(irq_set_size
);
198 /* Get to a known IRQ state */
199 *irq_set
= (struct vfio_irq_set
) {
200 .argsz
= irq_set_size
,
201 .flags
= VFIO_IRQ_SET_DATA_EVENTFD
| VFIO_IRQ_SET_ACTION_TRIGGER
,
202 .index
= irq_info
.index
,
207 *(int *)&irq_set
->data
= event_notifier_get_fd(e
);
208 r
= ioctl(s
->device
, VFIO_DEVICE_SET_IRQS
, irq_set
);
211 error_setg_errno(errp
, errno
, "Failed to setup device interrupt");
217 static int qemu_vfio_pci_read_config(QEMUVFIOState
*s
, void *buf
,
223 ret
= pread(s
->device
, buf
, size
, s
->config_region_info
.offset
+ ofs
);
224 } while (ret
== -1 && errno
== EINTR
);
225 return ret
== size
? 0 : -errno
;
228 static int qemu_vfio_pci_write_config(QEMUVFIOState
*s
, void *buf
, int size
, int ofs
)
233 ret
= pwrite(s
->device
, buf
, size
, s
->config_region_info
.offset
+ ofs
);
234 } while (ret
== -1 && errno
== EINTR
);
235 return ret
== size
? 0 : -errno
;
238 static int qemu_vfio_init_pci(QEMUVFIOState
*s
, const char *device
,
244 struct vfio_group_status group_status
= { .argsz
= sizeof(group_status
) };
245 struct vfio_iommu_type1_info iommu_info
= { .argsz
= sizeof(iommu_info
) };
246 struct vfio_device_info device_info
= { .argsz
= sizeof(device_info
) };
247 char *group_file
= NULL
;
249 /* Create a new container */
250 s
->container
= open("/dev/vfio/vfio", O_RDWR
);
252 if (s
->container
== -1) {
253 error_setg_errno(errp
, errno
, "Failed to open /dev/vfio/vfio");
256 if (ioctl(s
->container
, VFIO_GET_API_VERSION
) != VFIO_API_VERSION
) {
257 error_setg(errp
, "Invalid VFIO version");
262 if (!ioctl(s
->container
, VFIO_CHECK_EXTENSION
, VFIO_TYPE1_IOMMU
)) {
263 error_setg_errno(errp
, errno
, "VFIO IOMMU check failed");
269 group_file
= sysfs_find_group_file(device
, errp
);
275 s
->group
= open(group_file
, O_RDWR
);
276 if (s
->group
== -1) {
277 error_setg_errno(errp
, errno
, "Failed to open VFIO group file: %s",
285 /* Test the group is viable and available */
286 if (ioctl(s
->group
, VFIO_GROUP_GET_STATUS
, &group_status
)) {
287 error_setg_errno(errp
, errno
, "Failed to get VFIO group status");
292 if (!(group_status
.flags
& VFIO_GROUP_FLAGS_VIABLE
)) {
293 error_setg(errp
, "VFIO group is not viable");
298 /* Add the group to the container */
299 if (ioctl(s
->group
, VFIO_GROUP_SET_CONTAINER
, &s
->container
)) {
300 error_setg_errno(errp
, errno
, "Failed to add group to VFIO container");
305 /* Enable the IOMMU model we want */
306 if (ioctl(s
->container
, VFIO_SET_IOMMU
, VFIO_TYPE1_IOMMU
)) {
307 error_setg_errno(errp
, errno
, "Failed to set VFIO IOMMU type");
312 /* Get additional IOMMU info */
313 if (ioctl(s
->container
, VFIO_IOMMU_GET_INFO
, &iommu_info
)) {
314 error_setg_errno(errp
, errno
, "Failed to get IOMMU info");
319 s
->device
= ioctl(s
->group
, VFIO_GROUP_GET_DEVICE_FD
, device
);
322 error_setg_errno(errp
, errno
, "Failed to get device fd");
327 /* Test and setup the device */
328 if (ioctl(s
->device
, VFIO_DEVICE_GET_INFO
, &device_info
)) {
329 error_setg_errno(errp
, errno
, "Failed to get device info");
334 if (device_info
.num_regions
< VFIO_PCI_CONFIG_REGION_INDEX
) {
335 error_setg(errp
, "Invalid device regions");
340 s
->config_region_info
= (struct vfio_region_info
) {
341 .index
= VFIO_PCI_CONFIG_REGION_INDEX
,
342 .argsz
= sizeof(struct vfio_region_info
),
344 if (ioctl(s
->device
, VFIO_DEVICE_GET_REGION_INFO
, &s
->config_region_info
)) {
345 error_setg_errno(errp
, errno
, "Failed to get config region info");
350 for (i
= 0; i
< ARRAY_SIZE(s
->bar_region_info
); i
++) {
351 ret
= qemu_vfio_pci_init_bar(s
, i
, errp
);
357 /* Enable bus master */
358 ret
= qemu_vfio_pci_read_config(s
, &pci_cmd
, sizeof(pci_cmd
), PCI_COMMAND
);
362 pci_cmd
|= PCI_COMMAND_MASTER
;
363 ret
= qemu_vfio_pci_write_config(s
, &pci_cmd
, sizeof(pci_cmd
), PCI_COMMAND
);
375 static void qemu_vfio_ram_block_added(RAMBlockNotifier
*n
,
376 void *host
, size_t size
)
378 QEMUVFIOState
*s
= container_of(n
, QEMUVFIOState
, ram_notifier
);
379 trace_qemu_vfio_ram_block_added(s
, host
, size
);
380 qemu_vfio_dma_map(s
, host
, size
, false, NULL
);
383 static void qemu_vfio_ram_block_removed(RAMBlockNotifier
*n
,
384 void *host
, size_t size
)
386 QEMUVFIOState
*s
= container_of(n
, QEMUVFIOState
, ram_notifier
);
388 trace_qemu_vfio_ram_block_removed(s
, host
, size
);
389 qemu_vfio_dma_unmap(s
, host
);
393 static int qemu_vfio_init_ramblock(RAMBlock
*rb
, void *opaque
)
395 void *host_addr
= qemu_ram_get_host_addr(rb
);
396 ram_addr_t length
= qemu_ram_get_used_length(rb
);
398 QEMUVFIOState
*s
= opaque
;
403 ret
= qemu_vfio_dma_map(s
, host_addr
, length
, false, NULL
);
405 fprintf(stderr
, "qemu_vfio_init_ramblock: failed %p %" PRId64
"\n",
406 host_addr
, (uint64_t)length
);
411 static void qemu_vfio_open_common(QEMUVFIOState
*s
)
413 qemu_mutex_init(&s
->lock
);
414 s
->ram_notifier
.ram_block_added
= qemu_vfio_ram_block_added
;
415 s
->ram_notifier
.ram_block_removed
= qemu_vfio_ram_block_removed
;
416 ram_block_notifier_add(&s
->ram_notifier
);
417 s
->low_water_mark
= QEMU_VFIO_IOVA_MIN
;
418 s
->high_water_mark
= QEMU_VFIO_IOVA_MAX
;
419 qemu_ram_foreach_block(qemu_vfio_init_ramblock
, s
);
423 * Open a PCI device, e.g. "0000:00:01.0".
425 QEMUVFIOState
*qemu_vfio_open_pci(const char *device
, Error
**errp
)
428 QEMUVFIOState
*s
= g_new0(QEMUVFIOState
, 1);
430 r
= qemu_vfio_init_pci(s
, device
, errp
);
435 qemu_vfio_open_common(s
);
439 static void qemu_vfio_dump_mapping(IOVAMapping
*m
)
441 if (QEMU_VFIO_DEBUG
) {
442 printf(" vfio mapping %p %" PRIx64
" to %" PRIx64
"\n", m
->host
,
443 (uint64_t)m
->size
, (uint64_t)m
->iova
);
447 static void qemu_vfio_dump_mappings(QEMUVFIOState
*s
)
451 if (QEMU_VFIO_DEBUG
) {
452 printf("vfio mappings\n");
453 for (i
= 0; i
< s
->nr_mappings
; ++i
) {
454 qemu_vfio_dump_mapping(&s
->mappings
[i
]);
460 * Find the mapping entry that contains [host, host + size) and set @index to
461 * the position. If no entry contains it, @index is the position _after_ which
462 * to insert the new mapping. IOW, it is the index of the largest element that
463 * is smaller than @host, or -1 if no entry is.
465 static IOVAMapping
*qemu_vfio_find_mapping(QEMUVFIOState
*s
, void *host
,
468 IOVAMapping
*p
= s
->mappings
;
469 IOVAMapping
*q
= p
? p
+ s
->nr_mappings
- 1 : NULL
;
471 trace_qemu_vfio_find_mapping(s
, host
);
477 mid
= p
+ (q
- p
) / 2;
481 if (mid
->host
> host
) {
483 } else if (mid
->host
< host
) {
489 if (mid
->host
> host
) {
491 } else if (mid
< &s
->mappings
[s
->nr_mappings
- 1]
492 && (mid
+ 1)->host
<= host
) {
495 *index
= mid
- &s
->mappings
[0];
496 if (mid
>= &s
->mappings
[0] &&
497 mid
->host
<= host
&& mid
->host
+ mid
->size
> host
) {
498 assert(mid
< &s
->mappings
[s
->nr_mappings
]);
501 /* At this point *index + 1 is the right position to insert the new
507 * Allocate IOVA and create a new mapping record and insert it in @s.
509 static IOVAMapping
*qemu_vfio_add_mapping(QEMUVFIOState
*s
,
510 void *host
, size_t size
,
511 int index
, uint64_t iova
)
514 IOVAMapping m
= {.host
= host
, .size
= size
, .iova
= iova
};
517 assert(QEMU_IS_ALIGNED(size
, getpagesize()));
518 assert(QEMU_IS_ALIGNED(s
->low_water_mark
, getpagesize()));
519 assert(QEMU_IS_ALIGNED(s
->high_water_mark
, getpagesize()));
520 trace_qemu_vfio_new_mapping(s
, host
, size
, index
, iova
);
524 s
->mappings
= g_renew(IOVAMapping
, s
->mappings
, s
->nr_mappings
);
525 insert
= &s
->mappings
[index
];
526 shift
= s
->nr_mappings
- index
- 1;
528 memmove(insert
+ 1, insert
, shift
* sizeof(s
->mappings
[0]));
534 /* Do the DMA mapping with VFIO. */
535 static int qemu_vfio_do_mapping(QEMUVFIOState
*s
, void *host
, size_t size
,
538 struct vfio_iommu_type1_dma_map dma_map
= {
539 .argsz
= sizeof(dma_map
),
540 .flags
= VFIO_DMA_MAP_FLAG_READ
| VFIO_DMA_MAP_FLAG_WRITE
,
542 .vaddr
= (uintptr_t)host
,
545 trace_qemu_vfio_do_mapping(s
, host
, size
, iova
);
547 if (ioctl(s
->container
, VFIO_IOMMU_MAP_DMA
, &dma_map
)) {
548 error_report("VFIO_MAP_DMA: %d", -errno
);
555 * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
557 static void qemu_vfio_undo_mapping(QEMUVFIOState
*s
, IOVAMapping
*mapping
,
561 struct vfio_iommu_type1_dma_unmap unmap
= {
562 .argsz
= sizeof(unmap
),
564 .iova
= mapping
->iova
,
565 .size
= mapping
->size
,
568 index
= mapping
- s
->mappings
;
569 assert(mapping
->size
> 0);
570 assert(QEMU_IS_ALIGNED(mapping
->size
, getpagesize()));
571 assert(index
>= 0 && index
< s
->nr_mappings
);
572 if (ioctl(s
->container
, VFIO_IOMMU_UNMAP_DMA
, &unmap
)) {
573 error_setg(errp
, "VFIO_UNMAP_DMA failed: %d", -errno
);
575 memmove(mapping
, &s
->mappings
[index
+ 1],
576 sizeof(s
->mappings
[0]) * (s
->nr_mappings
- index
- 1));
578 s
->mappings
= g_renew(IOVAMapping
, s
->mappings
, s
->nr_mappings
);
581 /* Check if the mapping list is (ascending) ordered. */
582 static bool qemu_vfio_verify_mappings(QEMUVFIOState
*s
)
585 if (QEMU_VFIO_DEBUG
) {
586 for (i
= 0; i
< s
->nr_mappings
- 1; ++i
) {
587 if (!(s
->mappings
[i
].host
< s
->mappings
[i
+ 1].host
)) {
588 fprintf(stderr
, "item %d not sorted!\n", i
);
589 qemu_vfio_dump_mappings(s
);
592 if (!(s
->mappings
[i
].host
+ s
->mappings
[i
].size
<=
593 s
->mappings
[i
+ 1].host
)) {
594 fprintf(stderr
, "item %d overlap with next!\n", i
);
595 qemu_vfio_dump_mappings(s
);
603 /* Map [host, host + size) area into a contiguous IOVA address space, and store
604 * the result in @iova if not NULL. The caller needs to make sure the area is
605 * aligned to page size, and mustn't overlap with existing mapping areas (split
606 * mapping status within this area is not allowed).
608 int qemu_vfio_dma_map(QEMUVFIOState
*s
, void *host
, size_t size
,
609 bool temporary
, uint64_t *iova
)
613 IOVAMapping
*mapping
;
616 assert(QEMU_PTR_IS_ALIGNED(host
, getpagesize()));
617 assert(QEMU_IS_ALIGNED(size
, getpagesize()));
618 trace_qemu_vfio_dma_map(s
, host
, size
, temporary
, iova
);
619 qemu_mutex_lock(&s
->lock
);
620 mapping
= qemu_vfio_find_mapping(s
, host
, &index
);
622 iova0
= mapping
->iova
+ ((uint8_t *)host
- (uint8_t *)mapping
->host
);
624 if (s
->high_water_mark
- s
->low_water_mark
+ 1 < size
) {
629 iova0
= s
->low_water_mark
;
630 mapping
= qemu_vfio_add_mapping(s
, host
, size
, index
+ 1, iova0
);
635 assert(qemu_vfio_verify_mappings(s
));
636 ret
= qemu_vfio_do_mapping(s
, host
, size
, iova0
);
638 qemu_vfio_undo_mapping(s
, mapping
, NULL
);
641 s
->low_water_mark
+= size
;
642 qemu_vfio_dump_mappings(s
);
644 iova0
= s
->high_water_mark
- size
;
645 ret
= qemu_vfio_do_mapping(s
, host
, size
, iova0
);
649 s
->high_water_mark
-= size
;
656 qemu_mutex_unlock(&s
->lock
);
660 /* Reset the high watermark and free all "temporary" mappings. */
661 int qemu_vfio_dma_reset_temporary(QEMUVFIOState
*s
)
663 struct vfio_iommu_type1_dma_unmap unmap
= {
664 .argsz
= sizeof(unmap
),
666 .iova
= s
->high_water_mark
,
667 .size
= QEMU_VFIO_IOVA_MAX
- s
->high_water_mark
,
669 trace_qemu_vfio_dma_reset_temporary(s
);
670 qemu_mutex_lock(&s
->lock
);
671 if (ioctl(s
->container
, VFIO_IOMMU_UNMAP_DMA
, &unmap
)) {
672 error_report("VFIO_UNMAP_DMA: %d", -errno
);
673 qemu_mutex_unlock(&s
->lock
);
676 s
->high_water_mark
= QEMU_VFIO_IOVA_MAX
;
677 qemu_mutex_unlock(&s
->lock
);
681 /* Unmapping the whole area that was previously mapped with
682 * qemu_vfio_dma_map(). */
683 void qemu_vfio_dma_unmap(QEMUVFIOState
*s
, void *host
)
692 trace_qemu_vfio_dma_unmap(s
, host
);
693 qemu_mutex_lock(&s
->lock
);
694 m
= qemu_vfio_find_mapping(s
, host
, &index
);
698 qemu_vfio_undo_mapping(s
, m
, NULL
);
700 qemu_mutex_unlock(&s
->lock
);
703 static void qemu_vfio_reset(QEMUVFIOState
*s
)
705 ioctl(s
->device
, VFIO_DEVICE_RESET
);
708 /* Close and free the VFIO resources. */
709 void qemu_vfio_close(QEMUVFIOState
*s
)
716 for (i
= 0; i
< s
->nr_mappings
; ++i
) {
717 qemu_vfio_undo_mapping(s
, &s
->mappings
[i
], NULL
);
719 ram_block_notifier_remove(&s
->ram_notifier
);