qemu.git: hw/virtio/vhost-vdpa.c
1 /*
2 * vhost-vdpa
4 * Copyright(c) 2017-2018 Intel Corporation.
5 * Copyright(c) 2020 Red Hat, Inc.
7 * This work is licensed under the terms of the GNU GPL, version 2 or later.
8 * See the COPYING file in the top-level directory.
12 #include "qemu/osdep.h"
13 #include <linux/vhost.h>
14 #include <linux/vfio.h>
15 #include <sys/eventfd.h>
16 #include <sys/ioctl.h>
17 #include "hw/virtio/vhost.h"
18 #include "hw/virtio/vhost-backend.h"
19 #include "hw/virtio/virtio-net.h"
20 #include "hw/virtio/vhost-shadow-virtqueue.h"
21 #include "hw/virtio/vhost-vdpa.h"
22 #include "exec/address-spaces.h"
23 #include "migration/blocker.h"
24 #include "qemu/cutils.h"
25 #include "qemu/main-loop.h"
26 #include "cpu.h"
27 #include "trace.h"
28 #include "qapi/error.h"
31  * Return one past the end of the section. Be careful with uint64_t
32 * conversions!
34 static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
36 Int128 llend = int128_make64(section->offset_within_address_space);
37 llend = int128_add(llend, section->size);
38 llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
40 return llend;
43 static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
44 uint64_t iova_min,
45 uint64_t iova_max)
47 Int128 llend;
49 if ((!memory_region_is_ram(section->mr) &&
50 !memory_region_is_iommu(section->mr)) ||
51 memory_region_is_protected(section->mr) ||
52 /* vhost-vDPA doesn't allow MMIO to be mapped */
53 memory_region_is_ram_device(section->mr)) {
54 return true;
57 if (section->offset_within_address_space < iova_min) {
58 error_report("RAM section out of device range (min=0x%" PRIx64
59 ", addr=0x%" HWADDR_PRIx ")",
60 iova_min, section->offset_within_address_space);
61 return true;
64 llend = vhost_vdpa_section_end(section);
65 if (int128_gt(llend, int128_make64(iova_max))) {
66 error_report("RAM section out of device range (max=0x%" PRIx64
67 ", end addr=0x%" PRIx64 ")",
68 iova_max, int128_get64(llend));
69 return true;
72 return false;
75 int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
76 void *vaddr, bool readonly)
78 struct vhost_msg_v2 msg = {};
79 int fd = v->device_fd;
80 int ret = 0;
82 msg.type = v->msg_type;
83 msg.iotlb.iova = iova;
84 msg.iotlb.size = size;
85 msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
86 msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
87 msg.iotlb.type = VHOST_IOTLB_UPDATE;
89 trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
90 msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);
92 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
93 error_report("failed to write, fd=%d, errno=%d (%s)",
94 fd, errno, strerror(errno));
95 return -EIO;
98 return ret;
101 int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size)
103 struct vhost_msg_v2 msg = {};
104 int fd = v->device_fd;
105 int ret = 0;
107 msg.type = v->msg_type;
108 msg.iotlb.iova = iova;
109 msg.iotlb.size = size;
110 msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
112 trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
113 msg.iotlb.size, msg.iotlb.type);
115 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
116 error_report("failed to write, fd=%d, errno=%d (%s)",
117 fd, errno, strerror(errno));
118 return -EIO;
121 return ret;
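/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * how code in this backend can pin one host buffer with the two helpers
 * above and release it again. The helper name "demo_map_one_page" and the
 * IOVA value are hypothetical; the real callers are the memory listener and
 * the shadow-virtqueue code further down.
 */
#if 0 /* example only, never compiled */
static int demo_map_one_page(struct vhost_vdpa *v, void *host_page)
{
    const hwaddr demo_iova = 0x100000;          /* assumed-free IOVA */
    size_t page = qemu_real_host_page_size();
    int r;

    /* VHOST_IOTLB_UPDATE: translate demo_iova to host_page, read/write */
    r = vhost_vdpa_dma_map(v, demo_iova, page, host_page, false);
    if (r) {
        return r;                               /* -EIO on a short write */
    }

    /* ... the device may now DMA through demo_iova ... */

    /* VHOST_IOTLB_INVALIDATE: drop the translation again */
    return vhost_vdpa_dma_unmap(v, demo_iova, page);
}
#endif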
124 static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
126 int fd = v->device_fd;
127 struct vhost_msg_v2 msg = {
128 .type = v->msg_type,
129 .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
132 trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
133 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
134 error_report("failed to write, fd=%d, errno=%d (%s)",
135 fd, errno, strerror(errno));
139 static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
141 if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
142 !v->iotlb_batch_begin_sent) {
143 vhost_vdpa_listener_begin_batch(v);
146 v->iotlb_batch_begin_sent = true;
149 static void vhost_vdpa_listener_commit(MemoryListener *listener)
151 struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
152 struct vhost_dev *dev = v->dev;
153 struct vhost_msg_v2 msg = {};
154 int fd = v->device_fd;
156 if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
157 return;
160 if (!v->iotlb_batch_begin_sent) {
161 return;
164 msg.type = v->msg_type;
165 msg.iotlb.type = VHOST_IOTLB_BATCH_END;
167 trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
168 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
169 error_report("failed to write, fd=%d, errno=%d (%s)",
170 fd, errno, strerror(errno));
173 v->iotlb_batch_begin_sent = false;
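/*
 * Illustrative sketch (editor's addition): how the batching protocol built
 * from the two functions above fits around the map/unmap helpers. When the
 * device advertises VHOST_BACKEND_F_IOTLB_BATCH, one VHOST_IOTLB_BATCH_BEGIN
 * message is sent lazily before the first update of a listener transaction,
 * and VHOST_IOTLB_BATCH_END is sent from the .commit callback so the kernel
 * can apply the whole set of updates at once. The wrapper name
 * "demo_batched_map" is hypothetical.
 */
#if 0 /* example only, never compiled */
static int demo_batched_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
                            void *vaddr)
{
    /* sends BATCH_BEGIN at most once per transaction, only if supported */
    vhost_vdpa_iotlb_batch_begin_once(v);
    return vhost_vdpa_dma_map(v, iova, size, vaddr, false);
    /* BATCH_END is emitted later by vhost_vdpa_listener_commit() */
}
#endif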
176 static void vhost_vdpa_listener_region_add(MemoryListener *listener,
177 MemoryRegionSection *section)
179 DMAMap mem_region = {};
180 struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
181 hwaddr iova;
182 Int128 llend, llsize;
183 void *vaddr;
184 int ret;
186 if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
187 v->iova_range.last)) {
188 return;
191 if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
192 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
193 error_report("%s received unaligned region", __func__);
194 return;
197 iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
198 llend = vhost_vdpa_section_end(section);
199 if (int128_ge(int128_make64(iova), llend)) {
200 return;
203 memory_region_ref(section->mr);
205 /* Here we assume that memory_region_is_ram(section->mr)==true */
207 vaddr = memory_region_get_ram_ptr(section->mr) +
208 section->offset_within_region +
209 (iova - section->offset_within_address_space);
211 trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
212 vaddr, section->readonly);
214 llsize = int128_sub(llend, int128_make64(iova));
215 if (v->shadow_vqs_enabled) {
216 int r;
218 mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr,
219 mem_region.size = int128_get64(llsize) - 1,
220 mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly),
222 r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
223 if (unlikely(r != IOVA_OK)) {
224 error_report("Can't allocate a mapping (%d)", r);
225 goto fail;
228 iova = mem_region.iova;
231 vhost_vdpa_iotlb_batch_begin_once(v);
232 ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
233 vaddr, section->readonly);
234 if (ret) {
235 error_report("vhost vdpa map fail!");
236 goto fail_map;
239 return;
241 fail_map:
242 if (v->shadow_vqs_enabled) {
243 vhost_iova_tree_remove(v->iova_tree, mem_region);
246 fail:
248  * On the initfn path, store the first error in the container so we
249  * can fail gracefully. At runtime, there's not much we can do other
250  * than throw a hardware error.
252 error_report("vhost-vdpa: DMA mapping failed, unable to continue");
253 return;
257 static void vhost_vdpa_listener_region_del(MemoryListener *listener,
258 MemoryRegionSection *section)
260 struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
261 hwaddr iova;
262 Int128 llend, llsize;
263 int ret;
265 if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
266 v->iova_range.last)) {
267 return;
270 if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
271 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
272 error_report("%s received unaligned region", __func__);
273 return;
276 iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
277 llend = vhost_vdpa_section_end(section);
279 trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));
281 if (int128_ge(int128_make64(iova), llend)) {
282 return;
285 llsize = int128_sub(llend, int128_make64(iova));
287 if (v->shadow_vqs_enabled) {
288 const DMAMap *result;
289 const void *vaddr = memory_region_get_ram_ptr(section->mr) +
290 section->offset_within_region +
291 (iova - section->offset_within_address_space);
292 DMAMap mem_region = {
293 .translated_addr = (hwaddr)(uintptr_t)vaddr,
294 .size = int128_get64(llsize) - 1,
297 result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
298 if (!result) {
299 /* The region was not mapped by the memory listener */
300 return;
302 iova = result->iova;
303 vhost_iova_tree_remove(v->iova_tree, *result);
305 vhost_vdpa_iotlb_batch_begin_once(v);
306 ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
307 if (ret) {
308 error_report("vhost_vdpa dma unmap error!");
311 memory_region_unref(section->mr);
314  * The IOTLB API used by vhost-vdpa requires incremental updates of the
315  * mapping, so we cannot use the generic vhost memory listener, which
316  * depends on addnop().
318 static const MemoryListener vhost_vdpa_memory_listener = {
319 .name = "vhost-vdpa",
320 .commit = vhost_vdpa_listener_commit,
321 .region_add = vhost_vdpa_listener_region_add,
322 .region_del = vhost_vdpa_listener_region_del,
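/*
 * Editor's note (addition): this listener is not registered here; it is
 * registered from vhost_vdpa_dev_start() below, and only when the last
 * virtqueue group of the device starts, roughly:
 */
#if 0 /* example only, never compiled */
    memory_listener_register(&v->listener, &address_space_memory);
    /* ... and on stop: */
    memory_listener_unregister(&v->listener);
#endif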
325 static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
326 void *arg)
328 struct vhost_vdpa *v = dev->opaque;
329 int fd = v->device_fd;
330 int ret;
332 assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
334 ret = ioctl(fd, request, arg);
335 return ret < 0 ? -errno : ret;
338 static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
340 uint8_t s;
341 int ret;
343 trace_vhost_vdpa_add_status(dev, status);
344 ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
345 if (ret < 0) {
346 return ret;
349 s |= status;
351 ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
352 if (ret < 0) {
353 return ret;
356 ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
357 if (ret < 0) {
358 return ret;
361 if (!(s & status)) {
362 return -EIO;
365 return 0;
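/*
 * Illustrative sketch (editor's addition): vhost_vdpa_add_status() above does
 * a read-modify-write of the vDPA status byte and reads it back to verify the
 * device accepted the new bit(s). The file uses it to walk the usual virtio
 * initialization sequence, as in vhost_vdpa_init(), vhost_vdpa_set_features()
 * and vhost_vdpa_dev_start() below:
 */
#if 0 /* example only, never compiled */
    /* after backend init: acknowledge the device and announce a driver */
    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                               VIRTIO_CONFIG_S_DRIVER);
    /* after feature negotiation */
    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
    /* once the last virtqueue group has started */
    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
#endif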
368 static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
370 int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
371 &v->iova_range);
372 if (ret != 0) {
373 v->iova_range.first = 0;
374 v->iova_range.last = UINT64_MAX;
377 trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
378 v->iova_range.last);
382  * This function is for requests that only need to be applied once.
383  * Typically such a request occurs at the beginning of operation,
384  * before the queues are set up. It should not be used for requests
385  * that must wait until all queues are set, which would need to check
386  * dev->vq_index_end instead.
388 static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
390 struct vhost_vdpa *v = dev->opaque;
392 return v->index == 0;
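/*
 * Illustrative sketch (editor's addition): the guard pattern described in the
 * comment above. One-shot backend requests are simply skipped for every
 * vhost_vdpa instance except the one with index 0, as done by
 * vhost_vdpa_set_owner(), vhost_vdpa_set_mem_table() and others below. The
 * helper name "demo_one_shot_request" is hypothetical.
 */
#if 0 /* example only, never compiled */
static int demo_one_shot_request(struct vhost_dev *dev)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;   /* already issued on behalf of the whole device */
    }
    return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
}
#endif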
395 static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
396 uint64_t *features)
398 int ret;
400 ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
401 trace_vhost_vdpa_get_features(dev, *features);
402 return ret;
405 static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
406 Error **errp)
408 g_autoptr(GPtrArray) shadow_vqs = NULL;
409 uint64_t dev_features, svq_features;
410 int r;
411 bool ok;
413 if (!v->shadow_vqs_enabled) {
414 return 0;
417 r = vhost_vdpa_get_dev_features(hdev, &dev_features);
418 if (r != 0) {
419 error_setg_errno(errp, -r, "Can't get vdpa device features");
420 return r;
423 svq_features = dev_features;
424 ok = vhost_svq_valid_features(svq_features, errp);
425 if (unlikely(!ok)) {
426 return -1;
429 shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
430 for (unsigned n = 0; n < hdev->nvqs; ++n) {
431 g_autoptr(VhostShadowVirtqueue) svq;
433 svq = vhost_svq_new(v->iova_tree, v->shadow_vq_ops,
434 v->shadow_vq_ops_opaque);
435 if (unlikely(!svq)) {
436 error_setg(errp, "Cannot create svq %u", n);
437 return -1;
439 g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
442 v->shadow_vqs = g_steal_pointer(&shadow_vqs);
443 return 0;
446 static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
448 struct vhost_vdpa *v;
449 assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
450 trace_vhost_vdpa_init(dev, opaque);
451 int ret;
454 * Similar to VFIO, we end up pinning all guest memory and have to
455 * disable discarding of RAM.
457 ret = ram_block_discard_disable(true);
458 if (ret) {
459 error_report("Cannot set discarding of RAM broken");
460 return ret;
463 v = opaque;
464 v->dev = dev;
465 dev->opaque = opaque;
466 v->listener = vhost_vdpa_memory_listener;
467 v->msg_type = VHOST_IOTLB_MSG_V2;
468 ret = vhost_vdpa_init_svq(dev, v, errp);
469 if (ret) {
470 goto err;
473 vhost_vdpa_get_iova_range(v);
475 if (!vhost_vdpa_first_dev(dev)) {
476 return 0;
479 vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
480 VIRTIO_CONFIG_S_DRIVER);
482 return 0;
484 err:
485 ram_block_discard_disable(false);
486 return ret;
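/*
 * Illustrative sketch (editor's addition): the fields of struct vhost_vdpa
 * that the frontend (e.g. net/vhost-vdpa.c) is expected to have filled in
 * before vhost_vdpa_init() runs. Only fields read by this file are shown;
 * the device path and the exact setup code are assumptions, not taken from
 * this file.
 */
#if 0 /* example only, never compiled */
    struct vhost_vdpa demo = {
        .device_fd = open("/dev/vhost-vdpa-0", O_RDWR), /* assumed node name */
        .index = 0,                      /* first virtqueue group */
        .shadow_vqs_enabled = false,     /* set true to route I/O through SVQ */
        .iova_tree = NULL,               /* required when SVQ is enabled */
    };
#endif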
489 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
490 int queue_index)
492 size_t page_size = qemu_real_host_page_size();
493 struct vhost_vdpa *v = dev->opaque;
494 VirtIODevice *vdev = dev->vdev;
495 VhostVDPAHostNotifier *n;
497 n = &v->notifier[queue_index];
499 if (n->addr) {
500 virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
501 object_unparent(OBJECT(&n->mr));
502 munmap(n->addr, page_size);
503 n->addr = NULL;
507 static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
509 size_t page_size = qemu_real_host_page_size();
510 struct vhost_vdpa *v = dev->opaque;
511 VirtIODevice *vdev = dev->vdev;
512 VhostVDPAHostNotifier *n;
513 int fd = v->device_fd;
514 void *addr;
515 char *name;
517 vhost_vdpa_host_notifier_uninit(dev, queue_index);
519 n = &v->notifier[queue_index];
521 addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
522 queue_index * page_size);
523 if (addr == MAP_FAILED) {
524 goto err;
527 name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
528 v, queue_index);
529 memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
530 page_size, addr);
531 g_free(name);
533 if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
534 object_unparent(OBJECT(&n->mr));
535 munmap(addr, page_size);
536 goto err;
538 n->addr = addr;
540 return 0;
542 err:
543 return -1;
546 static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
548 int i;
550 for (i = dev->vq_index; i < dev->vq_index + n; i++) {
551 vhost_vdpa_host_notifier_uninit(dev, i);
555 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
557 struct vhost_vdpa *v = dev->opaque;
558 int i;
560 if (v->shadow_vqs_enabled) {
561 /* FIXME SVQ is not compatible with host notifiers mr */
562 return;
565 for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
566 if (vhost_vdpa_host_notifier_init(dev, i)) {
567 goto err;
571 return;
573 err:
574 vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
575 return;
578 static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
580 struct vhost_vdpa *v = dev->opaque;
581 size_t idx;
583 if (!v->shadow_vqs) {
584 return;
587 for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
588 vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
590 g_ptr_array_free(v->shadow_vqs, true);
593 static int vhost_vdpa_cleanup(struct vhost_dev *dev)
595 struct vhost_vdpa *v;
596 assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
597 v = dev->opaque;
598 trace_vhost_vdpa_cleanup(dev, v);
599 vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
600 memory_listener_unregister(&v->listener);
601 vhost_vdpa_svq_cleanup(dev);
603 dev->opaque = NULL;
604 ram_block_discard_disable(false);
606 return 0;
609 static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
611 trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
612 return INT_MAX;
615 static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
616 struct vhost_memory *mem)
618 if (!vhost_vdpa_first_dev(dev)) {
619 return 0;
622 trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
623 if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
624 trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
625 int i;
626 for (i = 0; i < mem->nregions; i++) {
627 trace_vhost_vdpa_dump_regions(dev, i,
628 mem->regions[i].guest_phys_addr,
629 mem->regions[i].memory_size,
630 mem->regions[i].userspace_addr,
631 mem->regions[i].flags_padding);
634 if (mem->padding) {
635 return -EINVAL;
638 return 0;
641 static int vhost_vdpa_set_features(struct vhost_dev *dev,
642 uint64_t features)
644 struct vhost_vdpa *v = dev->opaque;
645 int ret;
647 if (!vhost_vdpa_first_dev(dev)) {
648 return 0;
651 if (v->shadow_vqs_enabled) {
652 if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
654  * QEMU is just trying to enable or disable logging. SVQ handles
655  * this separately, so there is no need to forward it.
657 v->acked_features = features;
658 return 0;
661 v->acked_features = features;
663 /* We must not ack _F_LOG if SVQ is enabled */
664 features &= ~BIT_ULL(VHOST_F_LOG_ALL);
667 trace_vhost_vdpa_set_features(dev, features);
668 ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
669 if (ret) {
670 return ret;
673 return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
676 static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
678 uint64_t features;
679 uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
680 0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
681 int r;
683 if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
684 return -EFAULT;
687 features &= f;
689 if (vhost_vdpa_first_dev(dev)) {
690 r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
691 if (r) {
692 return -EFAULT;
696 dev->backend_cap = features;
698 return 0;
701 static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
702 uint32_t *device_id)
704 int ret;
705 ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
706 trace_vhost_vdpa_get_device_id(dev, *device_id);
707 return ret;
710 static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
712 if (!v->shadow_vqs_enabled) {
713 return;
716 for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
717 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
718 vhost_svq_stop(svq);
722 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
724 struct vhost_vdpa *v = dev->opaque;
725 int ret;
726 uint8_t status = 0;
728 vhost_vdpa_reset_svq(v);
730 ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
731 trace_vhost_vdpa_reset_device(dev, status);
732 return ret;
735 static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
737 assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
739 trace_vhost_vdpa_get_vq_index(dev, idx, idx);
740 return idx;
743 static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
745 int i;
746 trace_vhost_vdpa_set_vring_ready(dev);
747 for (i = 0; i < dev->nvqs; ++i) {
748 struct vhost_vring_state state = {
749 .index = dev->vq_index + i,
750 .num = 1,
752 vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
754 return 0;
757 static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
758 uint32_t config_len)
760 int b, len;
761 char line[QEMU_HEXDUMP_LINE_LEN];
763 for (b = 0; b < config_len; b += 16) {
764 len = config_len - b;
765 qemu_hexdump_line(line, b, config, len, false);
766 trace_vhost_vdpa_dump_config(dev, line);
770 static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
771 uint32_t offset, uint32_t size,
772 uint32_t flags)
774 struct vhost_vdpa_config *config;
775 int ret;
776 unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
778 trace_vhost_vdpa_set_config(dev, offset, size, flags);
779 config = g_malloc(size + config_size);
780 config->off = offset;
781 config->len = size;
782 memcpy(config->buf, data, size);
783 if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
784 trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
785 vhost_vdpa_dump_config(dev, data, size);
787 ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
788 g_free(config);
789 return ret;
792 static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
793 uint32_t config_len, Error **errp)
795 struct vhost_vdpa_config *v_config;
796 unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
797 int ret;
799 trace_vhost_vdpa_get_config(dev, config, config_len);
800 v_config = g_malloc(config_len + config_size);
801 v_config->len = config_len;
802 v_config->off = 0;
803 ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
804 memcpy(config, v_config->buf, config_len);
805 g_free(v_config);
806 if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
807 trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
808 vhost_vdpa_dump_config(dev, config, config_len);
810 return ret;
813 static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
814 struct vhost_vring_state *ring)
816 trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
817 return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
820 static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
821 struct vhost_vring_file *file)
823 trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
824 return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
827 static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
828 struct vhost_vring_file *file)
830 trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
831 return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
834 static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
835 struct vhost_vring_addr *addr)
837 trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
838 addr->desc_user_addr, addr->used_user_addr,
839 addr->avail_user_addr,
840 addr->log_guest_addr);
842 return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
847 * Set the shadow virtqueue descriptors to the device
849 * @dev: The vhost device model
850 * @svq: The shadow virtqueue
851 * @idx: The index of the virtqueue in the vhost device
852 * @errp: Error
854  * Note that this function does not rewind the kick file descriptor if it
855  * cannot set the call one.
857 static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
858 VhostShadowVirtqueue *svq, unsigned idx,
859 Error **errp)
861 struct vhost_vring_file file = {
862 .index = dev->vq_index + idx,
864 const EventNotifier *event_notifier = &svq->hdev_kick;
865 int r;
867 file.fd = event_notifier_get_fd(event_notifier);
868 r = vhost_vdpa_set_vring_dev_kick(dev, &file);
869 if (unlikely(r != 0)) {
870 error_setg_errno(errp, -r, "Can't set device kick fd");
871 return r;
874 event_notifier = &svq->hdev_call;
875 file.fd = event_notifier_get_fd(event_notifier);
876 r = vhost_vdpa_set_vring_dev_call(dev, &file);
877 if (unlikely(r != 0)) {
878 error_setg_errno(errp, -r, "Can't set device call fd");
881 return r;
885 * Unmap a SVQ area in the device
887 static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr)
889 const DMAMap needle = {
890 .translated_addr = addr,
892 const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, &needle);
893 hwaddr size;
894 int r;
896 if (unlikely(!result)) {
897 error_report("Unable to find SVQ address to unmap");
898 return;
901 size = ROUND_UP(result->size, qemu_real_host_page_size());
902 r = vhost_vdpa_dma_unmap(v, result->iova, size);
903 if (unlikely(r < 0)) {
904 error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r);
905 return;
908 vhost_iova_tree_remove(v->iova_tree, *result);
911 static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
912 const VhostShadowVirtqueue *svq)
914 struct vhost_vdpa *v = dev->opaque;
915 struct vhost_vring_addr svq_addr;
917 vhost_svq_get_vring_addr(svq, &svq_addr);
919 vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr);
921 vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr);
925 * Map the SVQ area in the device
927 * @v: Vhost-vdpa device
928  * @needle: The area for which to search an iova
929  * @errp: Error pointer
931 static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
932 Error **errp)
934 int r;
936 r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
937 if (unlikely(r != IOVA_OK)) {
938 error_setg(errp, "Cannot allocate iova (%d)", r);
939 return false;
942 r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
943 (void *)(uintptr_t)needle->translated_addr,
944 needle->perm == IOMMU_RO);
945 if (unlikely(r != 0)) {
946 error_setg_errno(errp, -r, "Cannot map region to device");
947 vhost_iova_tree_remove(v->iova_tree, *needle);
950 return r == 0;
954 * Map the shadow virtqueue rings in the device
956 * @dev: The vhost device
957 * @svq: The shadow virtqueue
958 * @addr: Assigned IOVA addresses
959 * @errp: Error pointer
961 static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
962 const VhostShadowVirtqueue *svq,
963 struct vhost_vring_addr *addr,
964 Error **errp)
966 DMAMap device_region, driver_region;
967 struct vhost_vring_addr svq_addr;
968 struct vhost_vdpa *v = dev->opaque;
969 size_t device_size = vhost_svq_device_area_size(svq);
970 size_t driver_size = vhost_svq_driver_area_size(svq);
971 size_t avail_offset;
972 bool ok;
974 ERRP_GUARD();
975 vhost_svq_get_vring_addr(svq, &svq_addr);
977 driver_region = (DMAMap) {
978 .translated_addr = svq_addr.desc_user_addr,
979 .size = driver_size - 1,
980 .perm = IOMMU_RO,
982 ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
983 if (unlikely(!ok)) {
984 error_prepend(errp, "Cannot create vq driver region: ");
985 return false;
987 addr->desc_user_addr = driver_region.iova;
988 avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
989 addr->avail_user_addr = driver_region.iova + avail_offset;
991 device_region = (DMAMap) {
992 .translated_addr = svq_addr.used_user_addr,
993 .size = device_size - 1,
994 .perm = IOMMU_RW,
996 ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
997 if (unlikely(!ok)) {
998 error_prepend(errp, "Cannot create vq device region: ");
999 vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr);
1001 addr->used_user_addr = device_region.iova;
1003 return ok;
1006 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
1007 VhostShadowVirtqueue *svq, unsigned idx,
1008 Error **errp)
1010 uint16_t vq_index = dev->vq_index + idx;
1011 struct vhost_vring_state s = {
1012 .index = vq_index,
1014 int r;
1016 r = vhost_vdpa_set_dev_vring_base(dev, &s);
1017 if (unlikely(r)) {
1018 error_setg_errno(errp, -r, "Cannot set vring base");
1019 return false;
1022 r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
1023 return r == 0;
1026 static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
1028 struct vhost_vdpa *v = dev->opaque;
1029 Error *err = NULL;
1030 unsigned i;
1032 if (!v->shadow_vqs) {
1033 return true;
1036 for (i = 0; i < v->shadow_vqs->len; ++i) {
1037 VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
1038 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1039 struct vhost_vring_addr addr = {
1040 .index = dev->vq_index + i,
1042 int r;
1043 bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
1044 if (unlikely(!ok)) {
1045 goto err;
1048 vhost_svq_start(svq, dev->vdev, vq);
1049 ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
1050 if (unlikely(!ok)) {
1051 goto err_map;
1054 /* Override vring GPA set by vhost subsystem */
1055 r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
1056 if (unlikely(r != 0)) {
1057 error_setg_errno(&err, -r, "Cannot set device address");
1058 goto err_set_addr;
1062 return true;
1064 err_set_addr:
1065 vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
1067 err_map:
1068 vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
1070 err:
1071 error_reportf_err(err, "Cannot setup SVQ %u: ", i);
1072 for (unsigned j = 0; j < i; ++j) {
1073 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
1074 vhost_vdpa_svq_unmap_rings(dev, svq);
1075 vhost_svq_stop(svq);
1078 return false;
1081 static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
1083 struct vhost_vdpa *v = dev->opaque;
1085 if (!v->shadow_vqs) {
1086 return;
1089 for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
1090 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1091 vhost_vdpa_svq_unmap_rings(dev, svq);
1095 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
1097 struct vhost_vdpa *v = dev->opaque;
1098 bool ok;
1099 trace_vhost_vdpa_dev_start(dev, started);
1101 if (started) {
1102 vhost_vdpa_host_notifiers_init(dev);
1103 ok = vhost_vdpa_svqs_start(dev);
1104 if (unlikely(!ok)) {
1105 return -1;
1107 vhost_vdpa_set_vring_ready(dev);
1108 } else {
1109 vhost_vdpa_svqs_stop(dev);
1110 vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
1113 if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
1114 return 0;
1117 if (started) {
1118 memory_listener_register(&v->listener, &address_space_memory);
1119 return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
1120 } else {
1121 vhost_vdpa_reset_device(dev);
1122 vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
1123 VIRTIO_CONFIG_S_DRIVER);
1124 memory_listener_unregister(&v->listener);
1126 return 0;
1130 static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
1131 struct vhost_log *log)
1133 struct vhost_vdpa *v = dev->opaque;
1134 if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
1135 return 0;
1138 trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
1139 log->log);
1140 return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
1143 static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
1144 struct vhost_vring_addr *addr)
1146 struct vhost_vdpa *v = dev->opaque;
1148 if (v->shadow_vqs_enabled) {
1150 * Device vring addr was set at device start. SVQ base is handled by
1151 * VirtQueue code.
1153 return 0;
1156 return vhost_vdpa_set_vring_dev_addr(dev, addr);
1159 static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
1160 struct vhost_vring_state *ring)
1162 trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
1163 return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
1166 static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
1167 struct vhost_vring_state *ring)
1169 struct vhost_vdpa *v = dev->opaque;
1170 VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index);
1173  * vhost-vdpa devices do not support in-flight requests. Set all of them
1174 * as available.
1176 * TODO: This is ok for networking, but other kinds of devices might
1177 * have problems with these retransmissions.
1179 while (virtqueue_rewind(vq, 1)) {
1180 continue;
1182 if (v->shadow_vqs_enabled) {
1184 * Device vring base was set at device start. SVQ base is handled by
1185 * VirtQueue code.
1187 return 0;
1190 return vhost_vdpa_set_dev_vring_base(dev, ring);
1193 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
1194 struct vhost_vring_state *ring)
1196 struct vhost_vdpa *v = dev->opaque;
1197 int ret;
1199 if (v->shadow_vqs_enabled) {
1200 ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
1201 return 0;
1204 ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
1205 trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
1206 return ret;
1209 static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
1210 struct vhost_vring_file *file)
1212 struct vhost_vdpa *v = dev->opaque;
1213 int vdpa_idx = file->index - dev->vq_index;
1215 if (v->shadow_vqs_enabled) {
1216 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1217 vhost_svq_set_svq_kick_fd(svq, file->fd);
1218 return 0;
1219 } else {
1220 return vhost_vdpa_set_vring_dev_kick(dev, file);
1224 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
1225 struct vhost_vring_file *file)
1227 struct vhost_vdpa *v = dev->opaque;
1229 if (v->shadow_vqs_enabled) {
1230 int vdpa_idx = file->index - dev->vq_index;
1231 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1233 vhost_svq_set_svq_call_fd(svq, file->fd);
1234 return 0;
1235 } else {
1236 return vhost_vdpa_set_vring_dev_call(dev, file);
1240 static int vhost_vdpa_get_features(struct vhost_dev *dev,
1241 uint64_t *features)
1243 struct vhost_vdpa *v = dev->opaque;
1244 int ret = vhost_vdpa_get_dev_features(dev, features);
1246 if (ret == 0 && v->shadow_vqs_enabled) {
1247 /* Add SVQ logging capabilities */
1248 *features |= BIT_ULL(VHOST_F_LOG_ALL);
1251 return ret;
1254 static int vhost_vdpa_set_owner(struct vhost_dev *dev)
1256 if (!vhost_vdpa_first_dev(dev)) {
1257 return 0;
1260 trace_vhost_vdpa_set_owner(dev);
1261 return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
1264 static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
1265 struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
1267 assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
1268 addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
1269 addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
1270 addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
1271 trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
1272 addr->avail_user_addr, addr->used_user_addr);
1273 return 0;
1276 static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
1278 return true;
1281 const VhostOps vdpa_ops = {
1282 .backend_type = VHOST_BACKEND_TYPE_VDPA,
1283 .vhost_backend_init = vhost_vdpa_init,
1284 .vhost_backend_cleanup = vhost_vdpa_cleanup,
1285 .vhost_set_log_base = vhost_vdpa_set_log_base,
1286 .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
1287 .vhost_set_vring_num = vhost_vdpa_set_vring_num,
1288 .vhost_set_vring_base = vhost_vdpa_set_vring_base,
1289 .vhost_get_vring_base = vhost_vdpa_get_vring_base,
1290 .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
1291 .vhost_set_vring_call = vhost_vdpa_set_vring_call,
1292 .vhost_get_features = vhost_vdpa_get_features,
1293 .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
1294 .vhost_set_owner = vhost_vdpa_set_owner,
1295 .vhost_set_vring_endian = NULL,
1296 .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
1297 .vhost_set_mem_table = vhost_vdpa_set_mem_table,
1298 .vhost_set_features = vhost_vdpa_set_features,
1299 .vhost_reset_device = vhost_vdpa_reset_device,
1300 .vhost_get_vq_index = vhost_vdpa_get_vq_index,
1301 .vhost_get_config = vhost_vdpa_get_config,
1302 .vhost_set_config = vhost_vdpa_set_config,
1303 .vhost_requires_shm_log = NULL,
1304 .vhost_migration_done = NULL,
1305 .vhost_backend_can_merge = NULL,
1306 .vhost_net_set_mtu = NULL,
1307 .vhost_set_iotlb_callback = NULL,
1308 .vhost_send_device_iotlb_msg = NULL,
1309 .vhost_dev_start = vhost_vdpa_dev_start,
1310 .vhost_get_device_id = vhost_vdpa_get_device_id,
1311 .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
1312 .vhost_force_iommu = vhost_vdpa_force_iommu,
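/*
 * Illustrative sketch (editor's addition): the generic vhost layer never
 * calls the functions in this file directly; it dispatches through the ops
 * table above. Simplified, hypothetical call sites:
 */
#if 0 /* example only, never compiled */
    if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA) {
        dev->vhost_ops->vhost_backend_init(dev, opaque, errp); /* vhost_vdpa_init */
        dev->vhost_ops->vhost_dev_start(dev, true);            /* vhost_vdpa_dev_start */
    }
#endif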