hw/virtio/vhost-vdpa.c
/*
 * vhost-vdpa
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <linux/vhost.h>
#include <linux/vfio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-backend.h"
#include "hw/virtio/virtio-net.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"
#include "hw/virtio/vhost-vdpa.h"
#include "exec/address-spaces.h"
#include "migration/blocker.h"
#include "qemu/cutils.h"
#include "qemu/main-loop.h"
#include "cpu.h"
#include "trace.h"
#include "qapi/error.h"
/*
 * Return one past the end of the section. Be careful with uint64_t
 * conversions!
 */
static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
{
    Int128 llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));

    return llend;
}
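
/*
 * Return true if the listener must ignore this section: it is not backed by
 * RAM or IOMMU memory, it is protected or a RAM device, or it falls outside
 * the device's usable [iova_min, iova_max] window.
 */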
static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
                                                uint64_t iova_min,
                                                uint64_t iova_max)
{
    Int128 llend;

    if ((!memory_region_is_ram(section->mr) &&
         !memory_region_is_iommu(section->mr)) ||
        memory_region_is_protected(section->mr) ||
        /* vhost-vDPA doesn't allow MMIO to be mapped */
        memory_region_is_ram_device(section->mr)) {
        return true;
    }

    if (section->offset_within_address_space < iova_min) {
        error_report("RAM section out of device range (min=0x%" PRIx64
                     ", addr=0x%" HWADDR_PRIx ")",
                     iova_min, section->offset_within_address_space);
        return true;
    }

    llend = vhost_vdpa_section_end(section);
    if (int128_gt(llend, int128_make64(iova_max))) {
        error_report("RAM section out of device range (max=0x%" PRIx64
                     ", end addr=0x%" PRIx64 ")",
                     iova_max, int128_get64(llend));
        return true;
    }

    return false;
}
/*
 * The caller must set asid = 0 if the device does not support asid.
 * This is not an ABI break since it is set to 0 by the initializer anyway.
 */
int vhost_vdpa_dma_map(struct vhost_vdpa *v, uint32_t asid, hwaddr iova,
                       hwaddr size, void *vaddr, bool readonly)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.asid = asid;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
    msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
    msg.iotlb.type = VHOST_IOTLB_UPDATE;

    trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.asid, msg.iotlb.iova,
                             msg.iotlb.size, msg.iotlb.uaddr, msg.iotlb.perm,
                             msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}

/*
 * The caller must set asid = 0 if the device does not support asid.
 * This is not an ABI break since it is set to 0 by the initializer anyway.
 */
int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, uint32_t asid, hwaddr iova,
                         hwaddr size)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.asid = asid;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.type = VHOST_IOTLB_INVALIDATE;

    trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.asid, msg.iotlb.iova,
                               msg.iotlb.size, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}
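
/*
 * Send VHOST_IOTLB_BATCH_BEGIN so the kernel coalesces the following mapping
 * updates until the matching VHOST_IOTLB_BATCH_END is written.
 */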
static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
{
    int fd = v->device_fd;
    struct vhost_msg_v2 msg = {
        .type = v->msg_type,
        .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
    };

    trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }
}
static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
{
    if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
        !v->iotlb_batch_begin_sent) {
        vhost_vdpa_listener_begin_batch(v);
    }

    v->iotlb_batch_begin_sent = true;
}
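
/*
 * Memory listener commit hook: if a batch was started, close it with
 * VHOST_IOTLB_BATCH_END so the device applies the queued updates.
 */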
static void vhost_vdpa_listener_commit(MemoryListener *listener)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    struct vhost_dev *dev = v->dev;
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;

    if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
        return;
    }

    if (!v->iotlb_batch_begin_sent) {
        return;
    }

    msg.type = v->msg_type;
    msg.iotlb.type = VHOST_IOTLB_BATCH_END;

    trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }

    v->iotlb_batch_begin_sent = false;
}
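
/*
 * Map a RAM section into the device: compute the page-aligned IOVA range,
 * allocate a translated IOVA from the IOVA tree when shadow data is in use,
 * and install the mapping with vhost_vdpa_dma_map().
 */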
static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    DMAMap mem_region = {};
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    void *vaddr;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);
    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    memory_region_ref(section->mr);

    /* Here we assume that memory_region_is_ram(section->mr) == true */

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
                                         vaddr, section->readonly);

    llsize = int128_sub(llend, int128_make64(iova));
    if (v->shadow_data) {
        int r;

        mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr,
        mem_region.size = int128_get64(llsize) - 1,
        mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly),

        r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
        if (unlikely(r != IOVA_OK)) {
            error_report("Can't allocate a mapping (%d)", r);
            goto fail;
        }

        iova = mem_region.iova;
    }

    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_map(v, VHOST_VDPA_GUEST_PA_ASID, iova,
                             int128_get64(llsize), vaddr, section->readonly);
    if (ret) {
        error_report("vhost vdpa map fail!");
        goto fail_map;
    }

    return;

fail_map:
    if (v->shadow_data) {
        vhost_iova_tree_remove(v->iova_tree, mem_region);
    }

fail:
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail. At runtime, there's not much we can do other
     * than throw a hardware error.
     */
    error_report("vhost-vdpa: DMA mapping failed, unable to continue");
    return;
}
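
/*
 * Tear down the mapping created by vhost_vdpa_listener_region_add(): look up
 * the IOVA (through the IOVA tree when shadow data is in use), remove it from
 * the tree and unmap the range from the device.
 */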
static void vhost_vdpa_listener_region_del(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);

    trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));

    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    if (v->shadow_data) {
        const DMAMap *result;
        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
                            section->offset_within_region +
                            (iova - section->offset_within_address_space);
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
        };

        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
        if (!result) {
            /* The memory listener map wasn't mapped */
            return;
        }
        iova = result->iova;
        vhost_iova_tree_remove(v->iova_tree, *result);
    }
    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_unmap(v, VHOST_VDPA_GUEST_PA_ASID, iova,
                               int128_get64(llsize));
    if (ret) {
        error_report("vhost_vdpa dma unmap error!");
    }

    memory_region_unref(section->mr);
}
/*
 * The IOTLB API used by vhost-vdpa requires incremental updating of the
 * mappings, so we cannot use the generic vhost memory listener, which
 * depends on addnop().
 */
static const MemoryListener vhost_vdpa_memory_listener = {
    .name = "vhost-vdpa",
    .commit = vhost_vdpa_listener_commit,
    .region_add = vhost_vdpa_listener_region_add,
    .region_del = vhost_vdpa_listener_region_del,
};
static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
                           void *arg)
{
    struct vhost_vdpa *v = dev->opaque;
    int fd = v->device_fd;
    int ret;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);

    ret = ioctl(fd, request, arg);
    return ret < 0 ? -errno : ret;
}
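
/*
 * Read-modify-write the device status: OR in the requested bits, write the
 * result back, and re-read the status to verify the device accepted them.
 */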
static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
{
    uint8_t s;
    int ret;

    trace_vhost_vdpa_add_status(dev, status);
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    s |= status;

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    if (!(s & status)) {
        return -EIO;
    }

    return 0;
}
int vhost_vdpa_get_iova_range(int fd, struct vhost_vdpa_iova_range *iova_range)
{
    int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range);

    return ret < 0 ? -errno : 0;
}
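
/*
 * Minimal usage sketch for vhost_vdpa_get_iova_range() (hypothetical caller;
 * the device path is chosen for illustration only):
 *
 *     struct vhost_vdpa_iova_range range;
 *     int fd = open("/dev/vhost-vdpa-0", O_RDWR);
 *
 *     if (fd >= 0 && vhost_vdpa_get_iova_range(fd, &range) == 0) {
 *         // every mapping must satisfy range.first <= iova <= range.last
 *     }
 */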
/*
 * This function is for requests that only need to be applied once. Typically
 * such a request occurs at the beginning of operation, before the queues are
 * set up. It should not be used for requests that must only be performed once
 * all queues are set, which would need to check dev->vq_index_end instead.
 */
static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    return v->index == 0;
}
static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
                                       uint64_t *features)
{
    int ret;

    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
    trace_vhost_vdpa_get_features(dev, *features);
    return ret;
}

static void vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v)
{
    g_autoptr(GPtrArray) shadow_vqs = NULL;

    shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
    for (unsigned n = 0; n < hdev->nvqs; ++n) {
        VhostShadowVirtqueue *svq;

        svq = vhost_svq_new(v->shadow_vq_ops, v->shadow_vq_ops_opaque);
        g_ptr_array_add(shadow_vqs, svq);
    }

    v->shadow_vqs = g_steal_pointer(&shadow_vqs);
}

static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
{
    struct vhost_vdpa *v;
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    trace_vhost_vdpa_init(dev, opaque);
    int ret;

    /*
     * Similar to VFIO, we end up pinning all guest memory and have to
     * disable discarding of RAM.
     */
    ret = ram_block_discard_disable(true);
    if (ret) {
        error_report("Cannot set discarding of RAM broken");
        return ret;
    }

    v = opaque;
    v->dev = dev;
    dev->opaque = opaque;
    v->listener = vhost_vdpa_memory_listener;
    v->msg_type = VHOST_IOTLB_MSG_V2;
    vhost_vdpa_init_svq(dev, v);

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                               VIRTIO_CONFIG_S_DRIVER);

    return 0;
}
static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
                                            int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;

    n = &v->notifier[queue_index];

    if (n->addr) {
        virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
        object_unparent(OBJECT(&n->mr));
        munmap(n->addr, page_size);
        n->addr = NULL;
    }
}
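
/*
 * mmap() the per-queue notification page exported by the vhost-vdpa device
 * and expose it to the guest as a host notifier memory region, so virtqueue
 * kicks can reach the device doorbell without bouncing through QEMU.
 */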
static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;
    int fd = v->device_fd;
    void *addr;
    char *name;

    vhost_vdpa_host_notifier_uninit(dev, queue_index);

    n = &v->notifier[queue_index];

    addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
                queue_index * page_size);
    if (addr == MAP_FAILED) {
        goto err;
    }

    name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
                           v, queue_index);
    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
                                      page_size, addr);
    g_free(name);

    if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
        object_unparent(OBJECT(&n->mr));
        munmap(addr, page_size);
        goto err;
    }
    n->addr = addr;

    return 0;

err:
    return -1;
}
static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
{
    int i;

    /*
     * Pack all the changes to the memory regions in a single
     * transaction to avoid repeated updates of the address space
     * topology.
     */
    memory_region_transaction_begin();

    for (i = dev->vq_index; i < dev->vq_index + n; i++) {
        vhost_vdpa_host_notifier_uninit(dev, i);
    }

    memory_region_transaction_commit();
}

static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int i;

    if (v->shadow_vqs_enabled) {
        /* FIXME SVQ is not compatible with host notifiers mr */
        return;
    }

    /*
     * Pack all the changes to the memory regions in a single
     * transaction to avoid repeated updates of the address space
     * topology.
     */
    memory_region_transaction_begin();

    for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
        if (vhost_vdpa_host_notifier_init(dev, i)) {
            vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
            break;
        }
    }

    memory_region_transaction_commit();
}
static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    size_t idx;

    for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
    }
    g_ptr_array_free(v->shadow_vqs, true);
}

static int vhost_vdpa_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v;
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    v = dev->opaque;
    trace_vhost_vdpa_cleanup(dev, v);
    vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    memory_listener_unregister(&v->listener);
    vhost_vdpa_svq_cleanup(dev);

    dev->opaque = NULL;
    ram_block_discard_disable(false);

    return 0;
}

static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
{
    trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
    return INT_MAX;
}

static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
                                    struct vhost_memory *mem)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
        int i;
        for (i = 0; i < mem->nregions; i++) {
            trace_vhost_vdpa_dump_regions(dev, i,
                                          mem->regions[i].guest_phys_addr,
                                          mem->regions[i].memory_size,
                                          mem->regions[i].userspace_addr,
                                          mem->regions[i].flags_padding);
        }
    }
    if (mem->padding) {
        return -EINVAL;
    }

    return 0;
}
static int vhost_vdpa_set_features(struct vhost_dev *dev,
                                   uint64_t features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    if (v->shadow_vqs_enabled) {
        if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
            /*
             * QEMU is just trying to enable or disable logging. SVQ handles
             * this separately, so no need to forward this.
             */
            v->acked_features = features;
            return 0;
        }

        v->acked_features = features;

        /* We must not ack _F_LOG if SVQ is enabled */
        features &= ~BIT_ULL(VHOST_F_LOG_ALL);
    }

    trace_vhost_vdpa_set_features(dev, features);
    ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
    if (ret) {
        return ret;
    }

    return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
}
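
/*
 * Negotiate backend (IOTLB) features: request MSG_V2, BATCH and ASID, keep
 * only what the kernel offers, and cache the result in dev->backend_cap.
 */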
static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
{
    uint64_t features;
    uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
        0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH |
        0x1ULL << VHOST_BACKEND_F_IOTLB_ASID;
    int r;

    if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
        return -EFAULT;
    }

    features &= f;

    if (vhost_vdpa_first_dev(dev)) {
        r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
        if (r) {
            return -EFAULT;
        }
    }

    dev->backend_cap = features;

    return 0;
}
static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
                                    uint32_t *device_id)
{
    int ret;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
    trace_vhost_vdpa_get_device_id(dev, *device_id);
    return ret;
}

static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
{
    if (!v->shadow_vqs_enabled) {
        return;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        vhost_svq_stop(svq);
    }
}

static int vhost_vdpa_reset_device(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;
    uint8_t status = 0;

    vhost_vdpa_reset_svq(v);

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
    trace_vhost_vdpa_reset_device(dev, status);
    return ret;
}

static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
{
    assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);

    trace_vhost_vdpa_get_vq_index(dev, idx, idx);
    return idx;
}
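
/* Enable every virtqueue owned by this vhost device (num = 1 means ready). */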
static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
{
    int i;
    trace_vhost_vdpa_set_vring_ready(dev);
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_vring_state state = {
            .index = dev->vq_index + i,
            .num = 1,
        };
        vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
    }
    return 0;
}

static int vhost_vdpa_set_config_call(struct vhost_dev *dev,
                                      int fd)
{
    trace_vhost_vdpa_set_config_call(dev, fd);
    return vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG_CALL, &fd);
}

static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
                                   uint32_t config_len)
{
    int b, len;
    char line[QEMU_HEXDUMP_LINE_LEN];

    for (b = 0; b < config_len; b += 16) {
        len = config_len - b;
        qemu_hexdump_line(line, b, config, len, false);
        trace_vhost_vdpa_dump_config(dev, line);
    }
}
static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
                                 uint32_t offset, uint32_t size,
                                 uint32_t flags)
{
    struct vhost_vdpa_config *config;
    int ret;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);

    trace_vhost_vdpa_set_config(dev, offset, size, flags);
    config = g_malloc(size + config_size);
    config->off = offset;
    config->len = size;
    memcpy(config->buf, data, size);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, data, size);
    }
    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
    g_free(config);
    return ret;
}

static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
                                 uint32_t config_len, Error **errp)
{
    struct vhost_vdpa_config *v_config;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    int ret;

    trace_vhost_vdpa_get_config(dev, config, config_len);
    v_config = g_malloc(config_len + config_size);
    v_config->len = config_len;
    v_config->off = 0;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
    memcpy(config, v_config->buf, config_len);
    g_free(v_config);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, config, config_len);
    }
    return ret;
}
static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
                                         struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
}

static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
}

static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
}

static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
                                         struct vhost_vring_addr *addr)
{
    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
                                    addr->desc_user_addr, addr->used_user_addr,
                                    addr->avail_user_addr,
                                    addr->log_guest_addr);

    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
}
/**
 * Set the shadow virtqueue descriptors to the device
 *
 * @dev: The vhost device model
 * @svq: The shadow virtqueue
 * @idx: The index of the virtqueue in the vhost device
 * @errp: Error
 *
 * Note that this function does not rewind the kick file descriptor if it
 * cannot set the call one.
 */
static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
                                  VhostShadowVirtqueue *svq, unsigned idx,
                                  Error **errp)
{
    struct vhost_vring_file file = {
        .index = dev->vq_index + idx,
    };
    const EventNotifier *event_notifier = &svq->hdev_kick;
    int r;

    r = event_notifier_init(&svq->hdev_kick, 0);
    if (r != 0) {
        error_setg_errno(errp, -r, "Couldn't create kick event notifier");
        goto err_init_hdev_kick;
    }

    r = event_notifier_init(&svq->hdev_call, 0);
    if (r != 0) {
        error_setg_errno(errp, -r, "Couldn't create call event notifier");
        goto err_init_hdev_call;
    }

    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_kick(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device kick fd");
        goto err_init_set_dev_fd;
    }

    event_notifier = &svq->hdev_call;
    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_call(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device call fd");
        goto err_init_set_dev_fd;
    }

    return 0;

err_init_set_dev_fd:
    event_notifier_set_handler(&svq->hdev_call, NULL);

err_init_hdev_call:
    event_notifier_cleanup(&svq->hdev_kick);

err_init_hdev_kick:
    return r;
}
/**
 * Unmap a SVQ area in the device
 */
static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr)
{
    const DMAMap needle = {
        .translated_addr = addr,
    };
    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, &needle);
    hwaddr size;
    int r;

    if (unlikely(!result)) {
        error_report("Unable to find SVQ address to unmap");
        return;
    }

    size = ROUND_UP(result->size, qemu_real_host_page_size());
    r = vhost_vdpa_dma_unmap(v, v->address_space_id, result->iova, size);
    if (unlikely(r < 0)) {
        error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r);
        return;
    }

    vhost_iova_tree_remove(v->iova_tree, *result);
}

static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                       const VhostShadowVirtqueue *svq)
{
    struct vhost_vdpa *v = dev->opaque;
    struct vhost_vring_addr svq_addr;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr);

    vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr);
}
/**
 * Map the SVQ area in the device
 *
 * @v: Vhost-vdpa device
 * @needle: The area for which to allocate an iova
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
                                    Error **errp)
{
    int r;

    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
    if (unlikely(r != IOVA_OK)) {
        error_setg(errp, "Cannot allocate iova (%d)", r);
        return false;
    }

    r = vhost_vdpa_dma_map(v, v->address_space_id, needle->iova,
                           needle->size + 1,
                           (void *)(uintptr_t)needle->translated_addr,
                           needle->perm == IOMMU_RO);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Cannot map region to device");
        vhost_iova_tree_remove(v->iova_tree, *needle);
    }

    return r == 0;
}
/**
 * Map the shadow virtqueue rings in the device
 *
 * @dev: The vhost device
 * @svq: The shadow virtqueue
 * @addr: Assigned IOVA addresses
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                     const VhostShadowVirtqueue *svq,
                                     struct vhost_vring_addr *addr,
                                     Error **errp)
{
    ERRP_GUARD();
    DMAMap device_region, driver_region;
    struct vhost_vring_addr svq_addr;
    struct vhost_vdpa *v = dev->opaque;
    size_t device_size = vhost_svq_device_area_size(svq);
    size_t driver_size = vhost_svq_driver_area_size(svq);
    size_t avail_offset;
    bool ok;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    driver_region = (DMAMap) {
        .translated_addr = svq_addr.desc_user_addr,
        .size = driver_size - 1,
        .perm = IOMMU_RO,
    };
    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq driver region: ");
        return false;
    }
    addr->desc_user_addr = driver_region.iova;
    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
    addr->avail_user_addr = driver_region.iova + avail_offset;

    device_region = (DMAMap) {
        .translated_addr = svq_addr.used_user_addr,
        .size = device_size - 1,
        .perm = IOMMU_RW,
    };
    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq device region: ");
        vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr);
    }
    addr->used_user_addr = device_region.iova;

    return ok;
}
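
/*
 * Start a shadow virtqueue on the device: reset its base index and wire the
 * device kick/call file descriptors to the SVQ event notifiers.
 */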
static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                 VhostShadowVirtqueue *svq, unsigned idx,
                                 Error **errp)
{
    uint16_t vq_index = dev->vq_index + idx;
    struct vhost_vring_state s = {
        .index = vq_index,
    };
    int r;

    r = vhost_vdpa_set_dev_vring_base(dev, &s);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set vring base");
        return false;
    }

    r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
    return r == 0;
}
static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    Error *err = NULL;
    unsigned i;

    if (!v->shadow_vqs_enabled) {
        return true;
    }

    for (i = 0; i < v->shadow_vqs->len; ++i) {
        VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        struct vhost_vring_addr addr = {
            .index = dev->vq_index + i,
        };
        int r;
        bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
        if (unlikely(!ok)) {
            goto err;
        }

        vhost_svq_start(svq, dev->vdev, vq, v->iova_tree);
        ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
        if (unlikely(!ok)) {
            goto err_map;
        }

        /* Override vring GPA set by vhost subsystem */
        r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
        if (unlikely(r != 0)) {
            error_setg_errno(&err, -r, "Cannot set device address");
            goto err_set_addr;
        }
    }

    return true;

err_set_addr:
    vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));

err_map:
    vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));

err:
    error_reportf_err(err, "Cannot setup SVQ %u: ", i);
    for (unsigned j = 0; j < i; ++j) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
        vhost_vdpa_svq_unmap_rings(dev, svq);
        vhost_svq_stop(svq);
    }

    return false;
}
static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    if (!v->shadow_vqs_enabled) {
        return;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        vhost_vdpa_svq_unmap_rings(dev, svq);

        event_notifier_cleanup(&svq->hdev_kick);
        event_notifier_cleanup(&svq->hdev_call);
    }
}
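
/*
 * vhost_dev_start callback: on start, install host notifiers and shadow
 * virtqueues and mark the rings ready; once the last queue of the device has
 * been processed, register the memory listener and raise DRIVER_OK.
 * On stop, the sequence is undone in reverse order.
 */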
static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
{
    struct vhost_vdpa *v = dev->opaque;
    bool ok;
    trace_vhost_vdpa_dev_start(dev, started);

    if (started) {
        vhost_vdpa_host_notifiers_init(dev);
        ok = vhost_vdpa_svqs_start(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_set_vring_ready(dev);
    } else {
        vhost_vdpa_svqs_stop(dev);
        vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    }

    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
        return 0;
    }

    if (started) {
        memory_listener_register(&v->listener, &address_space_memory);
        return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
    } else {
        vhost_vdpa_reset_device(dev);
        vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                                   VIRTIO_CONFIG_S_DRIVER);
        memory_listener_unregister(&v->listener);

        return 0;
    }
}
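
/*
 * The dirty log base is only forwarded to the device when SVQ is disabled;
 * with shadow virtqueues enabled, dirty tracking is handled by QEMU itself.
 */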
static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
                                   struct vhost_log *log)
{
    struct vhost_vdpa *v = dev->opaque;
    if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
                                  log->log);
    return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
}

static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
                                     struct vhost_vring_addr *addr)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /*
         * Device vring addr was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_vring_dev_addr(dev, addr);
}

static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
                                    struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
}
static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;
    VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index);

    /*
     * vhost-vdpa devices do not support in-flight requests. Set all of them
     * as available.
     *
     * TODO: This is ok for networking, but other kinds of devices might
     * have problems with these retransmissions.
     */
    while (virtqueue_rewind(vq, 1)) {
        continue;
    }
    if (v->shadow_vqs_enabled) {
        /*
         * Device vring base was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_dev_vring_base(dev, ring);
}
static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (v->shadow_vqs_enabled) {
        ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
        return 0;
    }

    ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
    trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
    return ret;
}

static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;
    int vdpa_idx = file->index - dev->vq_index;

    if (v->shadow_vqs_enabled) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
        vhost_svq_set_svq_kick_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_kick(dev, file);
    }
}

static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        int vdpa_idx = file->index - dev->vq_index;
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);

        vhost_svq_set_svq_call_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_call(dev, file);
    }
}

static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                   uint64_t *features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret = vhost_vdpa_get_dev_features(dev, features);

    if (ret == 0 && v->shadow_vqs_enabled) {
        /* Add SVQ logging capabilities */
        *features |= BIT_ULL(VHOST_F_LOG_ALL);
    }

    return ret;
}

static int vhost_vdpa_set_owner(struct vhost_dev *dev)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_owner(dev);
    return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
}
static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
                    struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
{
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
    addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
    addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
    trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
                                 addr->avail_user_addr, addr->used_user_addr);
    return 0;
}

static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
    return true;
}
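
/*
 * VhostOps table implementing the vhost backend interface on top of a
 * vhost-vdpa character device; the generic vhost layer dispatches through it.
 */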
const VhostOps vdpa_ops = {
        .backend_type = VHOST_BACKEND_TYPE_VDPA,
        .vhost_backend_init = vhost_vdpa_init,
        .vhost_backend_cleanup = vhost_vdpa_cleanup,
        .vhost_set_log_base = vhost_vdpa_set_log_base,
        .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
        .vhost_set_vring_num = vhost_vdpa_set_vring_num,
        .vhost_set_vring_base = vhost_vdpa_set_vring_base,
        .vhost_get_vring_base = vhost_vdpa_get_vring_base,
        .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
        .vhost_set_vring_call = vhost_vdpa_set_vring_call,
        .vhost_get_features = vhost_vdpa_get_features,
        .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
        .vhost_set_owner = vhost_vdpa_set_owner,
        .vhost_set_vring_endian = NULL,
        .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
        .vhost_set_mem_table = vhost_vdpa_set_mem_table,
        .vhost_set_features = vhost_vdpa_set_features,
        .vhost_reset_device = vhost_vdpa_reset_device,
        .vhost_get_vq_index = vhost_vdpa_get_vq_index,
        .vhost_get_config = vhost_vdpa_get_config,
        .vhost_set_config = vhost_vdpa_set_config,
        .vhost_requires_shm_log = NULL,
        .vhost_migration_done = NULL,
        .vhost_backend_can_merge = NULL,
        .vhost_net_set_mtu = NULL,
        .vhost_set_iotlb_callback = NULL,
        .vhost_send_device_iotlb_msg = NULL,
        .vhost_dev_start = vhost_vdpa_dev_start,
        .vhost_get_device_id = vhost_vdpa_get_device_id,
        .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
        .vhost_force_iommu = vhost_vdpa_force_iommu,
        .vhost_set_config_call = vhost_vdpa_set_config_call,
};