/*
 * VFIO utility
 *
 * Copyright 2016 - 2018 Red Hat, Inc.
 *
 * Authors:
 *   Fam Zheng <famz@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "qapi/error.h"
#include "exec/ramlist.h"
#include "exec/cpu-common.h"
#include "exec/memory.h"
#include "trace.h"
#include "qemu/error-report.h"
#include "standard-headers/linux/pci_regs.h"
#include "qemu/event_notifier.h"
#include "qemu/vfio-helpers.h"
#include "qemu/lockable.h"

#define QEMU_VFIO_DEBUG 0

#define QEMU_VFIO_IOVA_MIN 0x10000ULL
/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
 * we can use a runtime limit; alternatively it's also possible to do platform
 * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
 */
#define QEMU_VFIO_IOVA_MAX (1ULL << 39)

typedef struct {
    /* Page aligned addr. */
    void *host;
    size_t size;
    uint64_t iova;
} IOVAMapping;

struct IOVARange {
    uint64_t start;
    uint64_t end;
};

struct QEMUVFIOState {
    QemuMutex lock;

    /* These fields are protected by BQL */
    int container;
    int group;
    int device;
    RAMBlockNotifier ram_notifier;
    struct vfio_region_info config_region_info, bar_region_info[6];
    struct IOVARange *usable_iova_ranges;
    uint8_t nb_iova_ranges;

    /* These fields are protected by @lock */
    /* VFIO's IO virtual address space is managed by splitting into a few
     * sections:
     *
     * ---------------       <= 0
     * |xxxxxxxxxxxxx|
     * |-------------|       <= QEMU_VFIO_IOVA_MIN
     * |             |
     * |    Fixed    |
     * |             |
     * |-------------|       <= low_water_mark
     * |             |
     * |    Free     |
     * |             |
     * |-------------|       <= high_water_mark
     * |             |
     * |    Temp     |
     * |             |
     * |-------------|       <= QEMU_VFIO_IOVA_MAX
     * |xxxxxxxxxxxxx|
     * |xxxxxxxxxxxxx|
     * ---------------
     *
     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
     *
     * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
     *   [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be
     *   reclaimed - low_water_mark never shrinks;
     *
     * - IOVAs in range [low_water_mark, high_water_mark) are free;
     *
     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
     *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
     *   is recycled. The caller should make sure I/O's depending on these
     *   mappings are completed before calling.
     */
    uint64_t low_water_mark;
    uint64_t high_water_mark;
    IOVAMapping *mappings;
    int nr_mappings;
};

/**
 * Find the VFIO group file for the PCI device at address @device, and return
 * the path. The returned string is owned by the caller and should be
 * g_free'd later.
 */
static char *sysfs_find_group_file(const char *device, Error **errp)
{
    char *sysfs_link;
    char *sysfs_group;
    char *p;
    char *path = NULL;

    sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
    sysfs_group = g_malloc0(PATH_MAX);
    if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
        error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
        goto out;
    }
    p = strrchr(sysfs_group, '/');
    if (!p) {
        error_setg(errp, "Failed to find iommu group number");
        goto out;
    }

    path = g_strdup_printf("/dev/vfio/%s", p + 1);
out:
    g_free(sysfs_link);
    g_free(sysfs_group);
    return path;
}

static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
{
    assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
}

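/* Query the region info of BAR @index and cache it in @s. */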
static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
{
    assert_bar_index_valid(s, index);
    s->bar_region_info[index] = (struct vfio_region_info) {
        .index = VFIO_PCI_BAR0_REGION_INDEX + index,
        .argsz = sizeof(struct vfio_region_info),
    };
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
        error_setg_errno(errp, errno, "Failed to get BAR region info");
        return -errno;
    }

    return 0;
}

/**
 * Map a PCI bar area.
 */
void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
                            uint64_t offset, uint64_t size, int prot,
                            Error **errp)
{
    void *p;
    assert_bar_index_valid(s, index);
    p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
             prot, MAP_SHARED,
             s->device, s->bar_region_info[index].offset + offset);
    if (p == MAP_FAILED) {
        error_setg_errno(errp, errno, "Failed to map BAR region");
        p = NULL;
    }
    return p;
}

/**
 * Unmap a PCI bar area.
 */
void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
                             uint64_t offset, uint64_t size)
{
    if (bar) {
        munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
    }
}

/**
 * Initialize device IRQ with @irq_type and register an event notifier.
 */
int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
                           int irq_type, Error **errp)
{
    int r;
    struct vfio_irq_set *irq_set;
    size_t irq_set_size;
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };

    irq_info.index = irq_type;
    if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
        error_setg_errno(errp, errno, "Failed to get device interrupt info");
        return -errno;
    }
    if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
        error_setg(errp, "Device interrupt doesn't support eventfd");
        return -EINVAL;
    }

    irq_set_size = sizeof(*irq_set) + sizeof(int);
    irq_set = g_malloc0(irq_set_size);

    /* Get to a known IRQ state */
    *irq_set = (struct vfio_irq_set) {
        .argsz = irq_set_size,
        .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = irq_info.index,
        .start = 0,
        .count = 1,
    };

    *(int *)&irq_set->data = event_notifier_get_fd(e);
    r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (r) {
        error_setg_errno(errp, errno, "Failed to setup device interrupt");
        return -errno;
    }
    return 0;
}

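/* Read @size bytes at @ofs from the device's PCI config space into @buf,
 * retrying on EINTR; returns 0 on success, -errno on short or failed reads. */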
static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
                                     int size, int ofs)
{
    int ret;

    do {
        ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
    } while (ret == -1 && errno == EINTR);
    return ret == size ? 0 : -errno;
}

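/* Write @size bytes from @buf at @ofs into the device's PCI config space,
 * retrying on EINTR; returns 0 on success, -errno on short or failed writes. */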
static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
{
    int ret;

    do {
        ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
    } while (ret == -1 && errno == EINTR);
    return ret == size ? 0 : -errno;
}

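/* Walk the capability chain of the VFIO_IOMMU_GET_INFO result in @buf and, if
 * a VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE capability is present, copy the
 * kernel-reported usable IOVA ranges into @s. */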
static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
{
    struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf;
    struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset;
    struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
    int i;

    while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
        if (!cap->next) {
            return;
        }
        cap = (struct vfio_info_cap_header *)(buf + cap->next);
    }

    cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;

    s->nb_iova_ranges = cap_iova_range->nr_iovas;
    if (s->nb_iova_ranges > 1) {
        s->usable_iova_ranges =
            g_realloc(s->usable_iova_ranges,
                      s->nb_iova_ranges * sizeof(struct IOVARange));
    }

    for (i = 0; i < s->nb_iova_ranges; i++) {
        s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
        s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
    }
}

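/* Open the VFIO container, group and device for @device, set up the type 1
 * IOMMU, collect usable IOVA ranges and region info, and enable PCI bus
 * mastering. */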
static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
                              Error **errp)
{
    int ret;
    int i;
    uint16_t pci_cmd;
    struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
    struct vfio_iommu_type1_info *iommu_info = NULL;
    size_t iommu_info_size = sizeof(*iommu_info);
    struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
    char *group_file = NULL;

    s->usable_iova_ranges = NULL;

    /* Create a new container */
    s->container = open("/dev/vfio/vfio", O_RDWR);

    if (s->container == -1) {
        error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
        return -errno;
    }
    if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
        error_setg(errp, "Invalid VFIO version");
        ret = -EINVAL;
        goto fail_container;
    }

    if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
        error_setg_errno(errp, errno, "VFIO IOMMU check failed");
        ret = -EINVAL;
        goto fail_container;
    }

    /* Open the group */
    group_file = sysfs_find_group_file(device, errp);
    if (!group_file) {
        ret = -EINVAL;
        goto fail_container;
    }

    s->group = open(group_file, O_RDWR);
    if (s->group == -1) {
        error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
                         group_file);
        g_free(group_file);
        ret = -errno;
        goto fail_container;
    }
    g_free(group_file);

    /* Test the group is viable and available */
    if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
        error_setg_errno(errp, errno, "Failed to get VFIO group status");
        ret = -errno;
        goto fail;
    }

    if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "VFIO group is not viable");
        ret = -EINVAL;
        goto fail;
    }

    /* Add the group to the container */
    if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
        error_setg_errno(errp, errno, "Failed to add group to VFIO container");
        ret = -errno;
        goto fail;
    }

    /* Enable the IOMMU model we want */
    if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
        error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
        ret = -errno;
        goto fail;
    }

    iommu_info = g_malloc0(iommu_info_size);
    iommu_info->argsz = iommu_info_size;

    /* Get additional IOMMU info */
    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
        error_setg_errno(errp, errno, "Failed to get IOMMU info");
        ret = -errno;
        goto fail;
    }

    /*
     * If the kernel does not report usable IOVA regions, choose
     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX - 1] region
     */
    s->nb_iova_ranges = 1;
    s->usable_iova_ranges = g_new0(struct IOVARange, 1);
    s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
    s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;

    if (iommu_info->argsz > iommu_info_size) {
        iommu_info_size = iommu_info->argsz;
        iommu_info = g_realloc(iommu_info, iommu_info_size);
        if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
            ret = -errno;
            goto fail;
        }
        collect_usable_iova_ranges(s, iommu_info);
    }

    s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);

    if (s->device < 0) {
        error_setg_errno(errp, errno, "Failed to get device fd");
        ret = -errno;
        goto fail;
    }

    /* Test and setup the device */
    if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
        error_setg_errno(errp, errno, "Failed to get device info");
        ret = -errno;
        goto fail;
    }

    if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
        error_setg(errp, "Invalid device regions");
        ret = -EINVAL;
        goto fail;
    }

    s->config_region_info = (struct vfio_region_info) {
        .index = VFIO_PCI_CONFIG_REGION_INDEX,
        .argsz = sizeof(struct vfio_region_info),
    };
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
        error_setg_errno(errp, errno, "Failed to get config region info");
        ret = -errno;
        goto fail;
    }

    for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
        ret = qemu_vfio_pci_init_bar(s, i, errp);
        if (ret) {
            goto fail;
        }
    }

    /* Enable bus master */
    ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    if (ret) {
        goto fail;
    }
    pci_cmd |= PCI_COMMAND_MASTER;
    ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    if (ret) {
        goto fail;
    }
    g_free(iommu_info);
    return 0;
fail:
    g_free(s->usable_iova_ranges);
    s->usable_iova_ranges = NULL;
    s->nb_iova_ranges = 0;
    g_free(iommu_info);
    close(s->group);
fail_container:
    close(s->container);
    return ret;
}

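/* RAMBlockNotifier hook: create a permanent ("fixed") DMA mapping for a newly
 * added RAM block. */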
static void qemu_vfio_ram_block_added(RAMBlockNotifier *n,
                                      void *host, size_t size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    trace_qemu_vfio_ram_block_added(s, host, size);
    qemu_vfio_dma_map(s, host, size, false, NULL);
}

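/* RAMBlockNotifier hook: drop the DMA mapping of a RAM block that is going
 * away. */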
static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n,
                                        void *host, size_t size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    if (host) {
        trace_qemu_vfio_ram_block_removed(s, host, size);
        qemu_vfio_dma_unmap(s, host);
    }
}

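/* Callback for qemu_ram_foreach_block(): map each pre-existing RAM block for
 * DMA. Failures are logged but not fatal. */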
static int qemu_vfio_init_ramblock(RAMBlock *rb, void *opaque)
{
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    int ret;
    QEMUVFIOState *s = opaque;

    if (!host_addr) {
        return 0;
    }
    ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL);
    if (ret) {
        fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRIu64 "\n",
                host_addr, (uint64_t)length);
    }
    return 0;
}

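/* Initialize the common state: lock, RAM block notifier, IOVA watermarks, and
 * DMA mappings for all existing RAM blocks. */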
static void qemu_vfio_open_common(QEMUVFIOState *s)
{
    qemu_mutex_init(&s->lock);
    s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
    s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
    ram_block_notifier_add(&s->ram_notifier);
    s->low_water_mark = QEMU_VFIO_IOVA_MIN;
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    qemu_ram_foreach_block(qemu_vfio_init_ramblock, s);
}

/**
 * Open a PCI device, e.g. "0000:00:01.0".
 */
QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
{
    int r;
    QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);

    /*
     * VFIO may pin all memory inside mappings, effectively pinning
     * all memory inside RAM blocks unconditionally.
     */
    r = ram_block_discard_disable(true);
    if (r) {
        error_setg_errno(errp, -r, "Cannot set discarding of RAM broken");
        g_free(s);
        return NULL;
    }

    r = qemu_vfio_init_pci(s, device, errp);
    if (r) {
        ram_block_discard_disable(false);
        g_free(s);
        return NULL;
    }
    qemu_vfio_open_common(s);
    return s;
}

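/*
 * A minimal usage sketch (hypothetical device address, error handling and
 * MMIO details elided):
 *
 *     Error *err = NULL;
 *     QEMUVFIOState *vfio = qemu_vfio_open_pci("0000:00:01.0", &err);
 *     void *bar0 = qemu_vfio_pci_map_bar(vfio, 0, 0, 4096,
 *                                        PROT_READ | PROT_WRITE, &err);
 *     ... device-specific MMIO via bar0 ...
 *     qemu_vfio_pci_unmap_bar(vfio, 0, bar0, 0, 4096);
 *     qemu_vfio_close(vfio);
 */
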
static void qemu_vfio_dump_mapping(IOVAMapping *m)
{
    if (QEMU_VFIO_DEBUG) {
        printf("  vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
               (uint64_t)m->size, (uint64_t)m->iova);
    }
}

static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
{
    int i;

    if (QEMU_VFIO_DEBUG) {
        printf("vfio mappings\n");
        for (i = 0; i < s->nr_mappings; ++i) {
            qemu_vfio_dump_mapping(&s->mappings[i]);
        }
    }
}

/**
 * Find the mapping entry that contains [host, host + size) and set @index to
 * the position. If no entry contains it, @index is the position _after_ which
 * to insert the new mapping. IOW, it is the index of the largest element that
 * is smaller than @host, or -1 if there is none.
 */
static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
                                           int *index)
{
    IOVAMapping *p = s->mappings;
    IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
    IOVAMapping *mid;
    trace_qemu_vfio_find_mapping(s, host);
    if (!p) {
        *index = -1;
        return NULL;
    }
    while (true) {
        mid = p + (q - p) / 2;
        if (mid == p) {
            break;
        }
        if (mid->host > host) {
            q = mid;
        } else if (mid->host < host) {
            p = mid;
        } else {
            break;
        }
    }
    if (mid->host > host) {
        mid--;
    } else if (mid < &s->mappings[s->nr_mappings - 1]
               && (mid + 1)->host <= host) {
        mid++;
    }
    *index = mid - &s->mappings[0];
    if (mid >= &s->mappings[0] &&
        mid->host <= host && mid->host + mid->size > host) {
        assert(mid < &s->mappings[s->nr_mappings]);
        return mid;
    }
    /* At this point *index + 1 is the right position to insert the new
     * mapping. */
    return NULL;
}

/**
 * Allocate IOVA and create a new mapping record and insert it in @s.
 */
static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
                                          void *host, size_t size,
                                          int index, uint64_t iova)
{
    int shift;
    IOVAMapping m = {.host = host, .size = size, .iova = iova};
    IOVAMapping *insert;

    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
    assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size));
    assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size));
    trace_qemu_vfio_new_mapping(s, host, size, index, iova);

    assert(index >= 0);
    s->nr_mappings++;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
    insert = &s->mappings[index];
    shift = s->nr_mappings - index - 1;
    if (shift) {
        memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
    }
    *insert = m;
    return insert;
}

/* Do the DMA mapping with VFIO. */
static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
                                uint64_t iova)
{
    struct vfio_iommu_type1_dma_map dma_map = {
        .argsz = sizeof(dma_map),
        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
        .iova = iova,
        .vaddr = (uintptr_t)host,
        .size = size,
    };
    trace_qemu_vfio_do_mapping(s, host, size, iova);

    if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
        error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
        return -errno;
    }
    return 0;
}

/**
 * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
 */
static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
                                   Error **errp)
{
    int index;
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = mapping->iova,
        .size = mapping->size,
    };

    index = mapping - s->mappings;
    assert(mapping->size > 0);
    assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size));
    assert(index >= 0 && index < s->nr_mappings);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
    }
    memmove(mapping, &s->mappings[index + 1],
            sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
    s->nr_mappings--;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
}

/* Check if the mapping list is (ascending) ordered. */
static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
{
    int i;
    if (QEMU_VFIO_DEBUG) {
        for (i = 0; i < s->nr_mappings - 1; ++i) {
            if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
                fprintf(stderr, "item %d not sorted!\n", i);
                qemu_vfio_dump_mappings(s);
                return false;
            }
            if (!(s->mappings[i].host + s->mappings[i].size <=
                  s->mappings[i + 1].host)) {
                fprintf(stderr, "item %d overlaps with next!\n", i);
                qemu_vfio_dump_mappings(s);
                return false;
            }
        }
    }
    return true;
}

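/* Allocate a fixed IOVA of @size bytes: scan the usable IOVA ranges upwards
 * from low_water_mark and advance it past the new allocation. */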
static int
qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
{
    int i;

    for (i = 0; i < s->nb_iova_ranges; i++) {
        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
            continue;
        }
        s->low_water_mark =
            MAX(s->low_water_mark, s->usable_iova_ranges[i].start);

        if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
            s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
            *iova = s->low_water_mark;
            s->low_water_mark += size;
            return 0;
        }
    }
    return -ENOMEM;
}

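/* Allocate a temporary IOVA of @size bytes: scan the usable IOVA ranges
 * downwards from high_water_mark and lower it to the new allocation. */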
static int
qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
{
    int i;

    for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
        if (s->usable_iova_ranges[i].start > s->high_water_mark) {
            continue;
        }
        s->high_water_mark =
            MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);

        if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
            s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
            *iova = s->high_water_mark - size;
            s->high_water_mark = *iova;
            return 0;
        }
    }
    return -ENOMEM;
}

/* Map [host, host + size) area into a contiguous IOVA address space, and store
 * the result in @iova if not NULL. The caller needs to make sure the area is
 * aligned to page size, and mustn't overlap with existing mapping areas (split
 * mapping status within this area is not allowed).
 */
int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
                      bool temporary, uint64_t *iova)
{
    int ret = 0;
    int index;
    IOVAMapping *mapping;
    uint64_t iova0;

    assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size));
    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
    trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
    qemu_mutex_lock(&s->lock);
    mapping = qemu_vfio_find_mapping(s, host, &index);
    if (mapping) {
        iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
    } else {
        if (s->high_water_mark - s->low_water_mark + 1 < size) {
            ret = -ENOMEM;
            goto out;
        }
        if (!temporary) {
            if (qemu_vfio_find_fixed_iova(s, size, &iova0)) {
                ret = -ENOMEM;
                goto out;
            }

            mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
            if (!mapping) {
                ret = -ENOMEM;
                goto out;
            }
            assert(qemu_vfio_verify_mappings(s));
            ret = qemu_vfio_do_mapping(s, host, size, iova0);
            if (ret) {
                qemu_vfio_undo_mapping(s, mapping, NULL);
                goto out;
            }
            qemu_vfio_dump_mappings(s);
        } else {
            if (qemu_vfio_find_temp_iova(s, size, &iova0)) {
                ret = -ENOMEM;
                goto out;
            }
            ret = qemu_vfio_do_mapping(s, host, size, iova0);
            if (ret) {
                goto out;
            }
        }
    }
    if (iova) {
        *iova = iova0;
    }
out:
    qemu_mutex_unlock(&s->lock);
    return ret;
}

/* Reset the high watermark and free all "temporary" mappings. */
int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = s->high_water_mark,
        .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
    };
    trace_qemu_vfio_dma_reset_temporary(s);
    QEMU_LOCK_GUARD(&s->lock);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
        return -errno;
    }
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    return 0;
}

/* Unmap the whole area that was previously mapped with
 * qemu_vfio_dma_map(). */
void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
{
    int index = 0;
    IOVAMapping *m;

    if (!host) {
        return;
    }

    trace_qemu_vfio_dma_unmap(s, host);
    qemu_mutex_lock(&s->lock);
    m = qemu_vfio_find_mapping(s, host, &index);
    if (!m) {
        goto out;
    }
    qemu_vfio_undo_mapping(s, m, NULL);
out:
    qemu_mutex_unlock(&s->lock);
}

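/* Issue a best-effort VFIO device reset; any error is ignored. */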
static void qemu_vfio_reset(QEMUVFIOState *s)
{
    ioctl(s->device, VFIO_DEVICE_RESET);
}

/* Close and free the VFIO resources. */
void qemu_vfio_close(QEMUVFIOState *s)
{
    if (!s) {
        return;
    }
    /* qemu_vfio_undo_mapping() compacts the array, so drain it from the back
     * instead of indexing forward over a shrinking list. */
    while (s->nr_mappings) {
        qemu_vfio_undo_mapping(s, &s->mappings[s->nr_mappings - 1], NULL);
    }
    ram_block_notifier_remove(&s->ram_notifier);
    g_free(s->usable_iova_ranges);
    s->nb_iova_ranges = 0;
    qemu_vfio_reset(s);
    close(s->device);
    close(s->group);
    close(s->container);
    ram_block_discard_disable(false);
}