4 * Copyright 2016 - 2018 Red Hat, Inc.
7 * Fam Zheng <famz@redhat.com>
9 * This work is licensed under the terms of the GNU GPL, version 2 or later.
10 * See the COPYING file in the top-level directory.
13 #include "qemu/osdep.h"
14 #include <sys/ioctl.h>
15 #include <linux/vfio.h>
16 #include "qapi/error.h"
17 #include "exec/ramlist.h"
18 #include "exec/cpu-common.h"
20 #include "qemu/queue.h"
21 #include "qemu/error-report.h"
22 #include "standard-headers/linux/pci_regs.h"
23 #include "qemu/event_notifier.h"
24 #include "qemu/vfio-helpers.h"
27 #define QEMU_VFIO_DEBUG 0
29 #define QEMU_VFIO_IOVA_MIN 0x10000ULL
30 /* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
31 * we can use a runtime limit; alternatively it's also possible to do platform
32 * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
34 #define QEMU_VFIO_IOVA_MAX (1ULL << 39)
37 /* Page aligned addr. */
43 struct QEMUVFIOState
{
46 /* These fields are protected by BQL */
50 RAMBlockNotifier ram_notifier
;
51 struct vfio_region_info config_region_info
, bar_region_info
[6];
53 /* These fields are protected by @lock */
54 /* VFIO's IO virtual address space is managed by splitting into a few
57 * --------------- <= 0
59 * |-------------| <= QEMU_VFIO_IOVA_MIN
63 * |-------------| <= low_water_mark
67 * |-------------| <= high_water_mark
71 * |-------------| <= QEMU_VFIO_IOVA_MAX
76 * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
78 * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
79 * [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be
80 * reclaimed - low_water_mark never shrinks;
82 * - IOVAs in range [low_water_mark, high_water_mark) are free;
84 * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
85 * mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
86 * is recycled. The caller should make sure I/O's depending on these
87 * mappings are completed before calling.
89 uint64_t low_water_mark
;
90 uint64_t high_water_mark
;
91 IOVAMapping
*mappings
;
96 * Find group file by PCI device address as specified @device, and return the
97 * path. The returned string is owned by caller and should be g_free'ed later.
99 static char *sysfs_find_group_file(const char *device
, Error
**errp
)
106 sysfs_link
= g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device
);
107 sysfs_group
= g_malloc0(PATH_MAX
);
108 if (readlink(sysfs_link
, sysfs_group
, PATH_MAX
- 1) == -1) {
109 error_setg_errno(errp
, errno
, "Failed to find iommu group sysfs path");
112 p
= strrchr(sysfs_group
, '/');
114 error_setg(errp
, "Failed to find iommu group number");
118 path
= g_strdup_printf("/dev/vfio/%s", p
+ 1);
125 static inline void assert_bar_index_valid(QEMUVFIOState
*s
, int index
)
127 assert(index
>= 0 && index
< ARRAY_SIZE(s
->bar_region_info
));
130 static int qemu_vfio_pci_init_bar(QEMUVFIOState
*s
, int index
, Error
**errp
)
132 assert_bar_index_valid(s
, index
);
133 s
->bar_region_info
[index
] = (struct vfio_region_info
) {
134 .index
= VFIO_PCI_BAR0_REGION_INDEX
+ index
,
135 .argsz
= sizeof(struct vfio_region_info
),
137 if (ioctl(s
->device
, VFIO_DEVICE_GET_REGION_INFO
, &s
->bar_region_info
[index
])) {
138 error_setg_errno(errp
, errno
, "Failed to get BAR region info");
146 * Map a PCI bar area.
148 void *qemu_vfio_pci_map_bar(QEMUVFIOState
*s
, int index
,
149 uint64_t offset
, uint64_t size
,
153 assert_bar_index_valid(s
, index
);
154 p
= mmap(NULL
, MIN(size
, s
->bar_region_info
[index
].size
- offset
),
155 PROT_READ
| PROT_WRITE
, MAP_SHARED
,
156 s
->device
, s
->bar_region_info
[index
].offset
+ offset
);
157 if (p
== MAP_FAILED
) {
158 error_setg_errno(errp
, errno
, "Failed to map BAR region");
165 * Unmap a PCI bar area.
167 void qemu_vfio_pci_unmap_bar(QEMUVFIOState
*s
, int index
, void *bar
,
168 uint64_t offset
, uint64_t size
)
171 munmap(bar
, MIN(size
, s
->bar_region_info
[index
].size
- offset
));
176 * Initialize device IRQ with @irq_type and register an event notifier.
178 int qemu_vfio_pci_init_irq(QEMUVFIOState
*s
, EventNotifier
*e
,
179 int irq_type
, Error
**errp
)
182 struct vfio_irq_set
*irq_set
;
184 struct vfio_irq_info irq_info
= { .argsz
= sizeof(irq_info
) };
186 irq_info
.index
= irq_type
;
187 if (ioctl(s
->device
, VFIO_DEVICE_GET_IRQ_INFO
, &irq_info
)) {
188 error_setg_errno(errp
, errno
, "Failed to get device interrupt info");
191 if (!(irq_info
.flags
& VFIO_IRQ_INFO_EVENTFD
)) {
192 error_setg(errp
, "Device interrupt doesn't support eventfd");
196 irq_set_size
= sizeof(*irq_set
) + sizeof(int);
197 irq_set
= g_malloc0(irq_set_size
);
199 /* Get to a known IRQ state */
200 *irq_set
= (struct vfio_irq_set
) {
201 .argsz
= irq_set_size
,
202 .flags
= VFIO_IRQ_SET_DATA_EVENTFD
| VFIO_IRQ_SET_ACTION_TRIGGER
,
203 .index
= irq_info
.index
,
208 *(int *)&irq_set
->data
= event_notifier_get_fd(e
);
209 r
= ioctl(s
->device
, VFIO_DEVICE_SET_IRQS
, irq_set
);
212 error_setg_errno(errp
, errno
, "Failed to setup device interrupt");
218 static int qemu_vfio_pci_read_config(QEMUVFIOState
*s
, void *buf
,
224 ret
= pread(s
->device
, buf
, size
, s
->config_region_info
.offset
+ ofs
);
225 } while (ret
== -1 && errno
== EINTR
);
226 return ret
== size
? 0 : -errno
;
229 static int qemu_vfio_pci_write_config(QEMUVFIOState
*s
, void *buf
, int size
, int ofs
)
234 ret
= pwrite(s
->device
, buf
, size
, s
->config_region_info
.offset
+ ofs
);
235 } while (ret
== -1 && errno
== EINTR
);
236 return ret
== size
? 0 : -errno
;
239 static int qemu_vfio_init_pci(QEMUVFIOState
*s
, const char *device
,
245 struct vfio_group_status group_status
= { .argsz
= sizeof(group_status
) };
246 struct vfio_iommu_type1_info iommu_info
= { .argsz
= sizeof(iommu_info
) };
247 struct vfio_device_info device_info
= { .argsz
= sizeof(device_info
) };
248 char *group_file
= NULL
;
250 /* Create a new container */
251 s
->container
= open("/dev/vfio/vfio", O_RDWR
);
253 if (s
->container
== -1) {
254 error_setg_errno(errp
, errno
, "Failed to open /dev/vfio/vfio");
257 if (ioctl(s
->container
, VFIO_GET_API_VERSION
) != VFIO_API_VERSION
) {
258 error_setg(errp
, "Invalid VFIO version");
263 if (!ioctl(s
->container
, VFIO_CHECK_EXTENSION
, VFIO_TYPE1_IOMMU
)) {
264 error_setg_errno(errp
, errno
, "VFIO IOMMU check failed");
270 group_file
= sysfs_find_group_file(device
, errp
);
276 s
->group
= open(group_file
, O_RDWR
);
277 if (s
->group
== -1) {
278 error_setg_errno(errp
, errno
, "Failed to open VFIO group file: %s",
286 /* Test the group is viable and available */
287 if (ioctl(s
->group
, VFIO_GROUP_GET_STATUS
, &group_status
)) {
288 error_setg_errno(errp
, errno
, "Failed to get VFIO group status");
293 if (!(group_status
.flags
& VFIO_GROUP_FLAGS_VIABLE
)) {
294 error_setg(errp
, "VFIO group is not viable");
299 /* Add the group to the container */
300 if (ioctl(s
->group
, VFIO_GROUP_SET_CONTAINER
, &s
->container
)) {
301 error_setg_errno(errp
, errno
, "Failed to add group to VFIO container");
306 /* Enable the IOMMU model we want */
307 if (ioctl(s
->container
, VFIO_SET_IOMMU
, VFIO_TYPE1_IOMMU
)) {
308 error_setg_errno(errp
, errno
, "Failed to set VFIO IOMMU type");
313 /* Get additional IOMMU info */
314 if (ioctl(s
->container
, VFIO_IOMMU_GET_INFO
, &iommu_info
)) {
315 error_setg_errno(errp
, errno
, "Failed to get IOMMU info");
320 s
->device
= ioctl(s
->group
, VFIO_GROUP_GET_DEVICE_FD
, device
);
323 error_setg_errno(errp
, errno
, "Failed to get device fd");
328 /* Test and setup the device */
329 if (ioctl(s
->device
, VFIO_DEVICE_GET_INFO
, &device_info
)) {
330 error_setg_errno(errp
, errno
, "Failed to get device info");
335 if (device_info
.num_regions
< VFIO_PCI_CONFIG_REGION_INDEX
) {
336 error_setg(errp
, "Invalid device regions");
341 s
->config_region_info
= (struct vfio_region_info
) {
342 .index
= VFIO_PCI_CONFIG_REGION_INDEX
,
343 .argsz
= sizeof(struct vfio_region_info
),
345 if (ioctl(s
->device
, VFIO_DEVICE_GET_REGION_INFO
, &s
->config_region_info
)) {
346 error_setg_errno(errp
, errno
, "Failed to get config region info");
351 for (i
= 0; i
< ARRAY_SIZE(s
->bar_region_info
); i
++) {
352 ret
= qemu_vfio_pci_init_bar(s
, i
, errp
);
358 /* Enable bus master */
359 ret
= qemu_vfio_pci_read_config(s
, &pci_cmd
, sizeof(pci_cmd
), PCI_COMMAND
);
363 pci_cmd
|= PCI_COMMAND_MASTER
;
364 ret
= qemu_vfio_pci_write_config(s
, &pci_cmd
, sizeof(pci_cmd
), PCI_COMMAND
);
376 static void qemu_vfio_ram_block_added(RAMBlockNotifier
*n
,
377 void *host
, size_t size
)
379 QEMUVFIOState
*s
= container_of(n
, QEMUVFIOState
, ram_notifier
);
380 trace_qemu_vfio_ram_block_added(s
, host
, size
);
381 qemu_vfio_dma_map(s
, host
, size
, false, NULL
);
384 static void qemu_vfio_ram_block_removed(RAMBlockNotifier
*n
,
385 void *host
, size_t size
)
387 QEMUVFIOState
*s
= container_of(n
, QEMUVFIOState
, ram_notifier
);
389 trace_qemu_vfio_ram_block_removed(s
, host
, size
);
390 qemu_vfio_dma_unmap(s
, host
);
394 static int qemu_vfio_init_ramblock(RAMBlock
*rb
, void *opaque
)
396 void *host_addr
= qemu_ram_get_host_addr(rb
);
397 ram_addr_t length
= qemu_ram_get_used_length(rb
);
399 QEMUVFIOState
*s
= opaque
;
404 ret
= qemu_vfio_dma_map(s
, host_addr
, length
, false, NULL
);
406 fprintf(stderr
, "qemu_vfio_init_ramblock: failed %p %" PRId64
"\n",
407 host_addr
, (uint64_t)length
);
412 static void qemu_vfio_open_common(QEMUVFIOState
*s
)
414 qemu_mutex_init(&s
->lock
);
415 s
->ram_notifier
.ram_block_added
= qemu_vfio_ram_block_added
;
416 s
->ram_notifier
.ram_block_removed
= qemu_vfio_ram_block_removed
;
417 ram_block_notifier_add(&s
->ram_notifier
);
418 s
->low_water_mark
= QEMU_VFIO_IOVA_MIN
;
419 s
->high_water_mark
= QEMU_VFIO_IOVA_MAX
;
420 qemu_ram_foreach_block(qemu_vfio_init_ramblock
, s
);
424 * Open a PCI device, e.g. "0000:00:01.0".
426 QEMUVFIOState
*qemu_vfio_open_pci(const char *device
, Error
**errp
)
429 QEMUVFIOState
*s
= g_new0(QEMUVFIOState
, 1);
431 r
= qemu_vfio_init_pci(s
, device
, errp
);
436 qemu_vfio_open_common(s
);
440 static void qemu_vfio_dump_mapping(IOVAMapping
*m
)
442 if (QEMU_VFIO_DEBUG
) {
443 printf(" vfio mapping %p %" PRIx64
" to %" PRIx64
"\n", m
->host
,
444 (uint64_t)m
->size
, (uint64_t)m
->iova
);
448 static void qemu_vfio_dump_mappings(QEMUVFIOState
*s
)
452 if (QEMU_VFIO_DEBUG
) {
453 printf("vfio mappings\n");
454 for (i
= 0; i
< s
->nr_mappings
; ++i
) {
455 qemu_vfio_dump_mapping(&s
->mappings
[i
]);
461 * Find the mapping entry that contains [host, host + size) and set @index to
462 * the position. If no entry contains it, @index is the position _after_ which
463 * to insert the new mapping. IOW, it is the index of the largest element that
464 * is smaller than @host, or -1 if no entry is.
466 static IOVAMapping
*qemu_vfio_find_mapping(QEMUVFIOState
*s
, void *host
,
469 IOVAMapping
*p
= s
->mappings
;
470 IOVAMapping
*q
= p
? p
+ s
->nr_mappings
- 1 : NULL
;
472 trace_qemu_vfio_find_mapping(s
, host
);
478 mid
= p
+ (q
- p
) / 2;
482 if (mid
->host
> host
) {
484 } else if (mid
->host
< host
) {
490 if (mid
->host
> host
) {
492 } else if (mid
< &s
->mappings
[s
->nr_mappings
- 1]
493 && (mid
+ 1)->host
<= host
) {
496 *index
= mid
- &s
->mappings
[0];
497 if (mid
>= &s
->mappings
[0] &&
498 mid
->host
<= host
&& mid
->host
+ mid
->size
> host
) {
499 assert(mid
< &s
->mappings
[s
->nr_mappings
]);
502 /* At this point *index + 1 is the right position to insert the new
508 * Allocate IOVA and create a new mapping record and insert it in @s.
510 static IOVAMapping
*qemu_vfio_add_mapping(QEMUVFIOState
*s
,
511 void *host
, size_t size
,
512 int index
, uint64_t iova
)
515 IOVAMapping m
= {.host
= host
, .size
= size
, .iova
= iova
};
518 assert(QEMU_IS_ALIGNED(size
, getpagesize()));
519 assert(QEMU_IS_ALIGNED(s
->low_water_mark
, getpagesize()));
520 assert(QEMU_IS_ALIGNED(s
->high_water_mark
, getpagesize()));
521 trace_qemu_vfio_new_mapping(s
, host
, size
, index
, iova
);
525 s
->mappings
= g_renew(IOVAMapping
, s
->mappings
, s
->nr_mappings
);
526 insert
= &s
->mappings
[index
];
527 shift
= s
->nr_mappings
- index
- 1;
529 memmove(insert
+ 1, insert
, shift
* sizeof(s
->mappings
[0]));
535 /* Do the DMA mapping with VFIO. */
536 static int qemu_vfio_do_mapping(QEMUVFIOState
*s
, void *host
, size_t size
,
539 struct vfio_iommu_type1_dma_map dma_map
= {
540 .argsz
= sizeof(dma_map
),
541 .flags
= VFIO_DMA_MAP_FLAG_READ
| VFIO_DMA_MAP_FLAG_WRITE
,
543 .vaddr
= (uintptr_t)host
,
546 trace_qemu_vfio_do_mapping(s
, host
, size
, iova
);
548 if (ioctl(s
->container
, VFIO_IOMMU_MAP_DMA
, &dma_map
)) {
549 error_report("VFIO_MAP_DMA: %d", -errno
);
556 * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
558 static void qemu_vfio_undo_mapping(QEMUVFIOState
*s
, IOVAMapping
*mapping
,
562 struct vfio_iommu_type1_dma_unmap unmap
= {
563 .argsz
= sizeof(unmap
),
565 .iova
= mapping
->iova
,
566 .size
= mapping
->size
,
569 index
= mapping
- s
->mappings
;
570 assert(mapping
->size
> 0);
571 assert(QEMU_IS_ALIGNED(mapping
->size
, getpagesize()));
572 assert(index
>= 0 && index
< s
->nr_mappings
);
573 if (ioctl(s
->container
, VFIO_IOMMU_UNMAP_DMA
, &unmap
)) {
574 error_setg(errp
, "VFIO_UNMAP_DMA failed: %d", -errno
);
576 memmove(mapping
, &s
->mappings
[index
+ 1],
577 sizeof(s
->mappings
[0]) * (s
->nr_mappings
- index
- 1));
579 s
->mappings
= g_renew(IOVAMapping
, s
->mappings
, s
->nr_mappings
);
582 /* Check if the mapping list is (ascending) ordered. */
583 static bool qemu_vfio_verify_mappings(QEMUVFIOState
*s
)
586 if (QEMU_VFIO_DEBUG
) {
587 for (i
= 0; i
< s
->nr_mappings
- 1; ++i
) {
588 if (!(s
->mappings
[i
].host
< s
->mappings
[i
+ 1].host
)) {
589 fprintf(stderr
, "item %d not sorted!\n", i
);
590 qemu_vfio_dump_mappings(s
);
593 if (!(s
->mappings
[i
].host
+ s
->mappings
[i
].size
<=
594 s
->mappings
[i
+ 1].host
)) {
595 fprintf(stderr
, "item %d overlap with next!\n", i
);
596 qemu_vfio_dump_mappings(s
);
604 /* Map [host, host + size) area into a contiguous IOVA address space, and store
605 * the result in @iova if not NULL. The caller needs to make sure the area is
606 * aligned to page size, and mustn't overlap with existing mapping areas (split
607 * mapping status within this area is not allowed).
609 int qemu_vfio_dma_map(QEMUVFIOState
*s
, void *host
, size_t size
,
610 bool temporary
, uint64_t *iova
)
614 IOVAMapping
*mapping
;
617 assert(QEMU_PTR_IS_ALIGNED(host
, getpagesize()));
618 assert(QEMU_IS_ALIGNED(size
, getpagesize()));
619 trace_qemu_vfio_dma_map(s
, host
, size
, temporary
, iova
);
620 qemu_mutex_lock(&s
->lock
);
621 mapping
= qemu_vfio_find_mapping(s
, host
, &index
);
623 iova0
= mapping
->iova
+ ((uint8_t *)host
- (uint8_t *)mapping
->host
);
625 if (s
->high_water_mark
- s
->low_water_mark
+ 1 < size
) {
630 iova0
= s
->low_water_mark
;
631 mapping
= qemu_vfio_add_mapping(s
, host
, size
, index
+ 1, iova0
);
636 assert(qemu_vfio_verify_mappings(s
));
637 ret
= qemu_vfio_do_mapping(s
, host
, size
, iova0
);
639 qemu_vfio_undo_mapping(s
, mapping
, NULL
);
642 s
->low_water_mark
+= size
;
643 qemu_vfio_dump_mappings(s
);
645 iova0
= s
->high_water_mark
- size
;
646 ret
= qemu_vfio_do_mapping(s
, host
, size
, iova0
);
650 s
->high_water_mark
-= size
;
657 qemu_mutex_unlock(&s
->lock
);
661 /* Reset the high watermark and free all "temporary" mappings. */
662 int qemu_vfio_dma_reset_temporary(QEMUVFIOState
*s
)
664 struct vfio_iommu_type1_dma_unmap unmap
= {
665 .argsz
= sizeof(unmap
),
667 .iova
= s
->high_water_mark
,
668 .size
= QEMU_VFIO_IOVA_MAX
- s
->high_water_mark
,
670 trace_qemu_vfio_dma_reset_temporary(s
);
671 qemu_mutex_lock(&s
->lock
);
672 if (ioctl(s
->container
, VFIO_IOMMU_UNMAP_DMA
, &unmap
)) {
673 error_report("VFIO_UNMAP_DMA: %d", -errno
);
674 qemu_mutex_unlock(&s
->lock
);
677 s
->high_water_mark
= QEMU_VFIO_IOVA_MAX
;
678 qemu_mutex_unlock(&s
->lock
);
682 /* Unmap the whole area that was previously mapped with
683 * qemu_vfio_dma_map(). */
684 void qemu_vfio_dma_unmap(QEMUVFIOState
*s
, void *host
)
693 trace_qemu_vfio_dma_unmap(s
, host
);
694 qemu_mutex_lock(&s
->lock
);
695 m
= qemu_vfio_find_mapping(s
, host
, &index
);
699 qemu_vfio_undo_mapping(s
, m
, NULL
);
701 qemu_mutex_unlock(&s
->lock
);
704 static void qemu_vfio_reset(QEMUVFIOState
*s
)
706 ioctl(s
->device
, VFIO_DEVICE_RESET
);
709 /* Close and free the VFIO resources. */
710 void qemu_vfio_close(QEMUVFIOState
*s
)
717 for (i
= 0; i
< s
->nr_mappings
; ++i
) {
718 qemu_vfio_undo_mapping(s
, &s
->mappings
[i
], NULL
);
720 ram_block_notifier_remove(&s
->ram_notifier
);