hw/virtio/vhost-vdpa.c (qemu.git)
/*
 * vhost-vdpa
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <linux/vhost.h>
#include <linux/vfio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-backend.h"
#include "hw/virtio/virtio-net.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"
#include "hw/virtio/vhost-vdpa.h"
#include "exec/address-spaces.h"
#include "qemu/cutils.h"
#include "qemu/main-loop.h"
#include "cpu.h"
#include "trace.h"
#include "qapi/error.h"
/*
 * Return one past the end of the section. Be careful with uint64_t
 * conversions!
 */
static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
{
    Int128 llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));

    return llend;
}
static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
                                                uint64_t iova_min,
                                                uint64_t iova_max)
{
    Int128 llend;

    if ((!memory_region_is_ram(section->mr) &&
         !memory_region_is_iommu(section->mr)) ||
        memory_region_is_protected(section->mr) ||
        /* vhost-vDPA doesn't allow MMIO to be mapped */
        memory_region_is_ram_device(section->mr)) {
        return true;
    }

    if (section->offset_within_address_space < iova_min) {
        error_report("RAM section out of device range (min=0x%" PRIx64
                     ", addr=0x%" HWADDR_PRIx ")",
                     iova_min, section->offset_within_address_space);
        return true;
    }

    llend = vhost_vdpa_section_end(section);
    if (int128_gt(llend, int128_make64(iova_max))) {
        error_report("RAM section out of device range (max=0x%" PRIx64
                     ", end addr=0x%" PRIx64 ")",
                     iova_max, int128_get64(llend));
        return true;
    }

    return false;
}
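/*
 * DMA mappings are passed to the vhost-vdpa backend as vhost_msg_v2 IOTLB
 * messages written to the device file descriptor, rather than through a
 * dedicated ioctl.
 */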
static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
                              void *vaddr, bool readonly)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
    msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
    msg.iotlb.type = VHOST_IOTLB_UPDATE;

    trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
                             msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}
static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova,
                                hwaddr size)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.type = VHOST_IOTLB_INVALIDATE;

    trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
                               msg.iotlb.size, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}
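/*
 * IOTLB batching: when the backend advertises VHOST_BACKEND_F_IOTLB_BATCH,
 * the first update of a listener transaction is preceded by a single
 * BATCH_BEGIN message, and the commit callback below closes the batch with
 * BATCH_END.
 */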
static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
{
    int fd = v->device_fd;
    struct vhost_msg_v2 msg = {
        .type = v->msg_type,
        .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
    };

    trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }
}

static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
{
    if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
        !v->iotlb_batch_begin_sent) {
        vhost_vdpa_listener_begin_batch(v);
    }

    v->iotlb_batch_begin_sent = true;
}
static void vhost_vdpa_listener_commit(MemoryListener *listener)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    struct vhost_dev *dev = v->dev;
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;

    if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
        return;
    }

    if (!v->iotlb_batch_begin_sent) {
        return;
    }

    msg.type = v->msg_type;
    msg.iotlb.type = VHOST_IOTLB_BATCH_END;

    trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }

    v->iotlb_batch_begin_sent = false;
}
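/*
 * Memory listener callbacks: map and unmap guest RAM sections through the
 * IOTLB helpers above. With shadow virtqueues enabled, the IOVA used for
 * the mapping is allocated from the IOVA tree instead of reusing the
 * section's guest physical address.
 */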
static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    void *vaddr;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);
    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    memory_region_ref(section->mr);

    /* Here we assume that memory_region_is_ram(section->mr)==true */

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
                                         vaddr, section->readonly);

    llsize = int128_sub(llend, int128_make64(iova));
    if (v->shadow_vqs_enabled) {
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
        };

        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
        if (unlikely(r != IOVA_OK)) {
            error_report("Can't allocate a mapping (%d)", r);
            goto fail;
        }

        iova = mem_region.iova;
    }

    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
                             vaddr, section->readonly);
    if (ret) {
        error_report("vhost vdpa map fail!");
        goto fail;
    }

    return;

fail:
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail. Runtime, there's not much we can do other
     * than throw a hardware error.
     */
    error_report("vhost-vdpa: DMA mapping failed, unable to continue");
    return;
}
static void vhost_vdpa_listener_region_del(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);

    trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));

    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    if (v->shadow_vqs_enabled) {
        const DMAMap *result;
        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
        };

        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
        iova = result->iova;
        vhost_iova_tree_remove(v->iova_tree, &mem_region);
    }
    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
    if (ret) {
        error_report("vhost_vdpa dma unmap error!");
    }

    memory_region_unref(section->mr);
}
/*
 * The IOTLB API is used by vhost-vdpa, which requires incremental updating
 * of the mapping. So we can not use the generic vhost memory listener,
 * which depends on the addnop().
 */
static const MemoryListener vhost_vdpa_memory_listener = {
    .name = "vhost-vdpa",
    .commit = vhost_vdpa_listener_commit,
    .region_add = vhost_vdpa_listener_region_add,
    .region_del = vhost_vdpa_listener_region_del,
};
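/*
 * Thin wrapper around the vhost-vdpa ioctls; returns -errno on failure so
 * callers can propagate the error directly.
 */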
static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
                           void *arg)
{
    struct vhost_vdpa *v = dev->opaque;
    int fd = v->device_fd;
    int ret;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);

    ret = ioctl(fd, request, arg);
    return ret < 0 ? -errno : ret;
}
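/*
 * Read-modify-write of the device status: OR in the requested bits, then
 * read the status back to verify the device actually accepted them.
 */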
static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
{
    uint8_t s;
    int ret;

    trace_vhost_vdpa_add_status(dev, status);
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    s |= status;

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    if (!(s & status)) {
        return -EIO;
    }

    return 0;
}
static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
{
    int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
                              &v->iova_range);
    if (ret != 0) {
        v->iova_range.first = 0;
        v->iova_range.last = UINT64_MAX;
    }

    trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
                                    v->iova_range.last);
}
/*
 * This function is only meant for requests that need to be applied once.
 * Typically such a request occurs at the beginning of operation, before the
 * queues are set up. It should not be used for requests that must only be
 * performed once all queues are set, which would need to check
 * dev->vq_index_end instead.
 */
static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    return v->index == 0;
}
static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
                                       uint64_t *features)
{
    int ret;

    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
    trace_vhost_vdpa_get_features(dev, *features);
    return ret;
}
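/*
 * Allocate one shadow virtqueue per vhost queue when SVQ is enabled, after
 * checking that the device features are compatible with SVQ.
 */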
static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                               Error **errp)
{
    g_autoptr(GPtrArray) shadow_vqs = NULL;
    uint64_t dev_features, svq_features;
    int r;
    bool ok;

    if (!v->shadow_vqs_enabled) {
        return 0;
    }

    r = vhost_vdpa_get_dev_features(hdev, &dev_features);
    if (r != 0) {
        error_setg_errno(errp, -r, "Can't get vdpa device features");
        return r;
    }

    svq_features = dev_features;
    ok = vhost_svq_valid_features(svq_features, errp);
    if (unlikely(!ok)) {
        return -1;
    }

    shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
    for (unsigned n = 0; n < hdev->nvqs; ++n) {
        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);

        if (unlikely(!svq)) {
            error_setg(errp, "Cannot create svq %u", n);
            return -1;
        }
        g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
    }

    v->shadow_vqs = g_steal_pointer(&shadow_vqs);
    return 0;
}
static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
{
    struct vhost_vdpa *v;
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    trace_vhost_vdpa_init(dev, opaque);
    int ret;

    /*
     * Similar to VFIO, we end up pinning all guest memory and have to
     * disable discarding of RAM.
     */
    ret = ram_block_discard_disable(true);
    if (ret) {
        error_report("Cannot set discarding of RAM broken");
        return ret;
    }

    v = opaque;
    v->dev = dev;
    dev->opaque = opaque;
    v->listener = vhost_vdpa_memory_listener;
    v->msg_type = VHOST_IOTLB_MSG_V2;
    ret = vhost_vdpa_init_svq(dev, v, errp);
    if (ret) {
        goto err;
    }

    vhost_vdpa_get_iova_range(v);

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                               VIRTIO_CONFIG_S_DRIVER);

    return 0;

err:
    ram_block_discard_disable(false);
    return ret;
}
static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
                                            int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;

    n = &v->notifier[queue_index];

    if (n->addr) {
        virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
        object_unparent(OBJECT(&n->mr));
        munmap(n->addr, page_size);
        n->addr = NULL;
    }
}
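/*
 * Host notifiers are backed by pages mmap()ed from the vhost-vdpa device
 * fd, one page per queue index, and exposed to the guest as ram-device
 * memory regions.
 */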
static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;
    int fd = v->device_fd;
    void *addr;
    char *name;

    vhost_vdpa_host_notifier_uninit(dev, queue_index);

    n = &v->notifier[queue_index];

    addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
                queue_index * page_size);
    if (addr == MAP_FAILED) {
        goto err;
    }

    name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
                           v, queue_index);
    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
                                      page_size, addr);
    g_free(name);

    if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
        object_unparent(OBJECT(&n->mr));
        munmap(addr, page_size);
        goto err;
    }
    n->addr = addr;

    return 0;

err:
    return -1;
}
static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
{
    int i;

    for (i = dev->vq_index; i < dev->vq_index + n; i++) {
        vhost_vdpa_host_notifier_uninit(dev, i);
    }
}

static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int i;

    if (v->shadow_vqs_enabled) {
        /* FIXME SVQ is not compatible with host notifiers mr */
        return;
    }

    for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
        if (vhost_vdpa_host_notifier_init(dev, i)) {
            goto err;
        }
    }

    return;

err:
    vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
    return;
}
static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    size_t idx;

    if (!v->shadow_vqs) {
        return;
    }

    for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
    }
    g_ptr_array_free(v->shadow_vqs, true);
}
static int vhost_vdpa_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v;
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    v = dev->opaque;
    trace_vhost_vdpa_cleanup(dev, v);
    vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    memory_listener_unregister(&v->listener);
    vhost_vdpa_svq_cleanup(dev);

    dev->opaque = NULL;
    ram_block_discard_disable(false);

    return 0;
}
static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
{
    trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
    return INT_MAX;
}
static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
                                    struct vhost_memory *mem)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
        int i;
        for (i = 0; i < mem->nregions; i++) {
            trace_vhost_vdpa_dump_regions(dev, i,
                                          mem->regions[i].guest_phys_addr,
                                          mem->regions[i].memory_size,
                                          mem->regions[i].userspace_addr,
                                          mem->regions[i].flags_padding);
        }
    }
    if (mem->padding) {
        return -EINVAL;
    }

    return 0;
}
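/*
 * With shadow virtqueues enabled, VHOST_F_LOG_ALL is handled by SVQ itself,
 * so the bit is stripped before the features are forwarded to the device.
 */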
static int vhost_vdpa_set_features(struct vhost_dev *dev,
                                   uint64_t features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    if (v->shadow_vqs_enabled) {
        if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
            /*
             * QEMU is just trying to enable or disable logging. SVQ handles
             * this separately, so no need to forward this.
             */
            v->acked_features = features;
            return 0;
        }

        v->acked_features = features;

        /* We must not ack _F_LOG if SVQ is enabled */
        features &= ~BIT_ULL(VHOST_F_LOG_ALL);
    }

    trace_vhost_vdpa_set_features(dev, features);
    ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
    if (ret) {
        return ret;
    }

    return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
}
static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
{
    uint64_t features;
    uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
        0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
    int r;

    if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
        return -EFAULT;
    }

    features &= f;

    if (vhost_vdpa_first_dev(dev)) {
        r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
        if (r) {
            return -EFAULT;
        }
    }

    dev->backend_cap = features;

    return 0;
}
static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
                                    uint32_t *device_id)
{
    int ret;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
    trace_vhost_vdpa_get_device_id(dev, *device_id);
    return ret;
}
static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
{
    if (!v->shadow_vqs_enabled) {
        return;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        vhost_svq_stop(svq);
    }
}

static int vhost_vdpa_reset_device(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;
    uint8_t status = 0;

    vhost_vdpa_reset_svq(v);

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
    trace_vhost_vdpa_reset_device(dev, status);
    return ret;
}
static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
{
    assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);

    trace_vhost_vdpa_get_vq_index(dev, idx, idx);
    return idx;
}

static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
{
    int i;
    trace_vhost_vdpa_set_vring_ready(dev);
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_vring_state state = {
            .index = dev->vq_index + i,
            .num = 1,
        };
        vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
    }
    return 0;
}
static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
                                   uint32_t config_len)
{
    int b, len;
    char line[QEMU_HEXDUMP_LINE_LEN];

    for (b = 0; b < config_len; b += 16) {
        len = config_len - b;
        qemu_hexdump_line(line, b, config, len, false);
        trace_vhost_vdpa_dump_config(dev, line);
    }
}
static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
                                 uint32_t offset, uint32_t size,
                                 uint32_t flags)
{
    struct vhost_vdpa_config *config;
    int ret;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);

    trace_vhost_vdpa_set_config(dev, offset, size, flags);
    config = g_malloc(size + config_size);
    config->off = offset;
    config->len = size;
    memcpy(config->buf, data, size);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, data, size);
    }
    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
    g_free(config);
    return ret;
}
static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
                                 uint32_t config_len, Error **errp)
{
    struct vhost_vdpa_config *v_config;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    int ret;

    trace_vhost_vdpa_get_config(dev, config, config_len);
    v_config = g_malloc(config_len + config_size);
    v_config->len = config_len;
    v_config->off = 0;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
    memcpy(config, v_config->buf, config_len);
    g_free(v_config);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, config, config_len);
    }
    return ret;
}
static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
                                         struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
}

static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
}

static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
}

static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
                                         struct vhost_vring_addr *addr)
{
    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
                                    addr->desc_user_addr, addr->used_user_addr,
                                    addr->avail_user_addr,
                                    addr->log_guest_addr);

    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
}
/**
 * Set the shadow virtqueue descriptors to the device
 *
 * @dev: The vhost device model
 * @svq: The shadow virtqueue
 * @idx: The index of the virtqueue in the vhost device
 * @errp: Error
 *
 * Note that this function does not rewind the kick file descriptor if it
 * cannot set the call one.
 */
static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
                                  VhostShadowVirtqueue *svq, unsigned idx,
                                  Error **errp)
{
    struct vhost_vring_file file = {
        .index = dev->vq_index + idx,
    };
    const EventNotifier *event_notifier = &svq->hdev_kick;
    int r;

    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_kick(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device kick fd");
        return r;
    }

    event_notifier = &svq->hdev_call;
    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_call(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device call fd");
    }

    return r;
}
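/*
 * SVQ ring mappings are tracked in v->iova_tree: mapping allocates an IOVA
 * for the ring's host address, and unmapping looks the address up again to
 * recover the IOVA to invalidate.
 */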
/**
 * Unmap a SVQ area in the device
 */
static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
                                      const DMAMap *needle)
{
    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
    hwaddr size;
    int r;

    if (unlikely(!result)) {
        error_report("Unable to find SVQ address to unmap");
        return false;
    }

    size = ROUND_UP(result->size, qemu_real_host_page_size());
    r = vhost_vdpa_dma_unmap(v, result->iova, size);
    return r == 0;
}
static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                       const VhostShadowVirtqueue *svq)
{
    DMAMap needle = {};
    struct vhost_vdpa *v = dev->opaque;
    struct vhost_vring_addr svq_addr;
    bool ok;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    needle.translated_addr = svq_addr.desc_user_addr;
    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
    if (unlikely(!ok)) {
        return false;
    }

    needle.translated_addr = svq_addr.used_user_addr;
    return vhost_vdpa_svq_unmap_ring(v, &needle);
}
/**
 * Map the SVQ area in the device
 *
 * @v: Vhost-vdpa device
 * @needle: The area to map; on success its iova member holds the assigned IOVA
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
                                    Error **errp)
{
    int r;

    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
    if (unlikely(r != IOVA_OK)) {
        error_setg(errp, "Cannot allocate iova (%d)", r);
        return false;
    }

    r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
                           (void *)(uintptr_t)needle->translated_addr,
                           needle->perm == IOMMU_RO);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Cannot map region to device");
        vhost_iova_tree_remove(v->iova_tree, needle);
    }

    return r == 0;
}
/**
 * Map the shadow virtqueue rings in the device
 *
 * @dev: The vhost device
 * @svq: The shadow virtqueue
 * @addr: Assigned IOVA addresses
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                     const VhostShadowVirtqueue *svq,
                                     struct vhost_vring_addr *addr,
                                     Error **errp)
{
    DMAMap device_region, driver_region;
    struct vhost_vring_addr svq_addr;
    struct vhost_vdpa *v = dev->opaque;
    size_t device_size = vhost_svq_device_area_size(svq);
    size_t driver_size = vhost_svq_driver_area_size(svq);
    size_t avail_offset;
    bool ok;

    ERRP_GUARD();
    vhost_svq_get_vring_addr(svq, &svq_addr);

    driver_region = (DMAMap) {
        .translated_addr = svq_addr.desc_user_addr,
        .size = driver_size - 1,
        .perm = IOMMU_RO,
    };
    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq driver region: ");
        return false;
    }
    addr->desc_user_addr = driver_region.iova;
    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
    addr->avail_user_addr = driver_region.iova + avail_offset;

    device_region = (DMAMap) {
        .translated_addr = svq_addr.used_user_addr,
        .size = device_size - 1,
        .perm = IOMMU_RW,
    };
    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq device region: ");
        vhost_vdpa_svq_unmap_ring(v, &driver_region);
    }
    addr->used_user_addr = device_region.iova;

    return ok;
}
static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                 VhostShadowVirtqueue *svq, unsigned idx,
                                 Error **errp)
{
    uint16_t vq_index = dev->vq_index + idx;
    struct vhost_vring_state s = {
        .index = vq_index,
    };
    int r;

    r = vhost_vdpa_set_dev_vring_base(dev, &s);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set vring base");
        return false;
    }

    r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
    return r == 0;
}
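/*
 * Start all shadow virtqueues: program the device vring base and kick/call
 * file descriptors, map the SVQ rings into the device IOVA space and point
 * the device vring addresses at them.
 */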
static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    Error *err = NULL;
    unsigned i;

    if (!v->shadow_vqs) {
        return true;
    }

    for (i = 0; i < v->shadow_vqs->len; ++i) {
        VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        struct vhost_vring_addr addr = {
            .index = dev->vq_index + i,
        };
        int r;
        bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
        if (unlikely(!ok)) {
            goto err;
        }

        vhost_svq_start(svq, dev->vdev, vq);
        ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
        if (unlikely(!ok)) {
            goto err_map;
        }

        /* Override vring GPA set by vhost subsystem */
        r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
        if (unlikely(r != 0)) {
            error_setg_errno(&err, -r, "Cannot set device address");
            goto err_set_addr;
        }
    }

    return true;

err_set_addr:
    vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));

err_map:
    vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));

err:
    error_reportf_err(err, "Cannot setup SVQ %u: ", i);
    for (unsigned j = 0; j < i; ++j) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
        vhost_vdpa_svq_unmap_rings(dev, svq);
        vhost_svq_stop(svq);
    }

    return false;
}
static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    if (!v->shadow_vqs) {
        return true;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
        if (unlikely(!ok)) {
            return false;
        }
    }

    return true;
}
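/*
 * Device start/stop. The memory listener is only registered, and DRIVER_OK
 * only set, once the last group of virtqueues of the device has been
 * started.
 */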
static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
{
    struct vhost_vdpa *v = dev->opaque;
    bool ok;
    trace_vhost_vdpa_dev_start(dev, started);

    if (started) {
        vhost_vdpa_host_notifiers_init(dev);
        ok = vhost_vdpa_svqs_start(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_set_vring_ready(dev);
    } else {
        ok = vhost_vdpa_svqs_stop(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    }

    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
        return 0;
    }

    if (started) {
        memory_listener_register(&v->listener, &address_space_memory);
        return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
    } else {
        vhost_vdpa_reset_device(dev);
        vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                                   VIRTIO_CONFIG_S_DRIVER);
        memory_listener_unregister(&v->listener);

        return 0;
    }
}
static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
                                   struct vhost_log *log)
{
    struct vhost_vdpa *v = dev->opaque;
    if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
                                  log->log);
    return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
}
static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
                                     struct vhost_vring_addr *addr)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /*
         * Device vring addr was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_vring_dev_addr(dev, addr);
}

static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
                                    struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
}

static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /*
         * Device vring base was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_dev_vring_base(dev, ring);
}
static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;
    int vdpa_idx = ring->index - dev->vq_index;
    int ret;

    if (v->shadow_vqs_enabled) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);

        /*
         * Setting base as last used idx, so destination will see as available
         * all the entries that the device did not use, including the
         * in-flight processing ones.
         *
         * TODO: This is ok for networking, but other kinds of devices might
         * have problems with these retransmissions.
         */
        ring->num = svq->last_used_idx;
        return 0;
    }

    ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
    trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
    return ret;
}
static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;
    int vdpa_idx = file->index - dev->vq_index;

    if (v->shadow_vqs_enabled) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
        vhost_svq_set_svq_kick_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_kick(dev, file);
    }
}

static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        int vdpa_idx = file->index - dev->vq_index;
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);

        vhost_svq_set_svq_call_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_call(dev, file);
    }
}
static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                   uint64_t *features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret = vhost_vdpa_get_dev_features(dev, features);

    if (ret == 0 && v->shadow_vqs_enabled) {
        /* Add SVQ logging capabilities */
        *features |= BIT_ULL(VHOST_F_LOG_ALL);
    }

    return ret;
}

static int vhost_vdpa_set_owner(struct vhost_dev *dev)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_owner(dev);
    return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
}
static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
                                  struct vhost_vring_addr *addr,
                                  struct vhost_virtqueue *vq)
{
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
    addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
    addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
    trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
                                 addr->avail_user_addr, addr->used_user_addr);
    return 0;
}

static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
    return true;
}
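/*
 * vhost backend operations table for VHOST_BACKEND_TYPE_VDPA; entries left
 * as NULL are callbacks this backend does not implement.
 */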
const VhostOps vdpa_ops = {
        .backend_type = VHOST_BACKEND_TYPE_VDPA,
        .vhost_backend_init = vhost_vdpa_init,
        .vhost_backend_cleanup = vhost_vdpa_cleanup,
        .vhost_set_log_base = vhost_vdpa_set_log_base,
        .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
        .vhost_set_vring_num = vhost_vdpa_set_vring_num,
        .vhost_set_vring_base = vhost_vdpa_set_vring_base,
        .vhost_get_vring_base = vhost_vdpa_get_vring_base,
        .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
        .vhost_set_vring_call = vhost_vdpa_set_vring_call,
        .vhost_get_features = vhost_vdpa_get_features,
        .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
        .vhost_set_owner = vhost_vdpa_set_owner,
        .vhost_set_vring_endian = NULL,
        .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
        .vhost_set_mem_table = vhost_vdpa_set_mem_table,
        .vhost_set_features = vhost_vdpa_set_features,
        .vhost_reset_device = vhost_vdpa_reset_device,
        .vhost_get_vq_index = vhost_vdpa_get_vq_index,
        .vhost_get_config = vhost_vdpa_get_config,
        .vhost_set_config = vhost_vdpa_set_config,
        .vhost_requires_shm_log = NULL,
        .vhost_migration_done = NULL,
        .vhost_backend_can_merge = NULL,
        .vhost_net_set_mtu = NULL,
        .vhost_set_iotlb_callback = NULL,
        .vhost_send_device_iotlb_msg = NULL,
        .vhost_dev_start = vhost_vdpa_dev_start,
        .vhost_get_device_id = vhost_vdpa_get_device_id,
        .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
        .vhost_force_iommu = vhost_vdpa_force_iommu,
};