/*
 * vhost-vdpa
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include <linux/vhost.h>
#include <linux/vfio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-backend.h"
#include "hw/virtio/virtio-net.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"
#include "hw/virtio/vhost-vdpa.h"
#include "exec/address-spaces.h"
#include "migration/blocker.h"
#include "qemu/cutils.h"
#include "qemu/main-loop.h"
#include "cpu.h"
#include "trace.h"
#include "qapi/error.h"

/*
 * Return one past the end of the section. Be careful with uint64_t
 * conversions!
 */
static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
{
    Int128 llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));

    return llend;
}

static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
                                                uint64_t iova_min,
                                                uint64_t iova_max)
{
    Int128 llend;

    if ((!memory_region_is_ram(section->mr) &&
         !memory_region_is_iommu(section->mr)) ||
        memory_region_is_protected(section->mr) ||
        /* vhost-vDPA doesn't allow MMIO to be mapped */
        memory_region_is_ram_device(section->mr)) {
        return true;
    }

    if (section->offset_within_address_space < iova_min) {
        error_report("RAM section out of device range (min=0x%" PRIx64
                     ", addr=0x%" HWADDR_PRIx ")",
                     iova_min, section->offset_within_address_space);
        return true;
    }

    llend = vhost_vdpa_section_end(section);
    if (int128_gt(llend, int128_make64(iova_max))) {
        error_report("RAM section out of device range (max=0x%" PRIx64
                     ", end addr=0x%" PRIx64 ")",
                     iova_max, int128_get64(llend));
        return true;
    }

    return false;
}

int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
                       void *vaddr, bool readonly)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
    msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
    msg.iotlb.type = VHOST_IOTLB_UPDATE;

    trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
                             msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}
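
/*
 * Drop a mapping from the device IOTLB by sending a VHOST_IOTLB_INVALIDATE
 * message for the range [iova, iova + size) over the vhost-vdpa device fd.
 */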

int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.type = VHOST_IOTLB_INVALIDATE;

    trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
                               msg.iotlb.size, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}
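
/*
 * Ask the kernel to treat the following IOTLB updates as one batch by
 * sending a VHOST_IOTLB_BATCH_BEGIN message; the batch is closed later by
 * the listener commit hook with VHOST_IOTLB_BATCH_END.
 */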

static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
{
    int fd = v->device_fd;
    struct vhost_msg_v2 msg = {
        .type = v->msg_type,
        .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
    };

    trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }
}
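
/*
 * Send the batch-begin message at most once per listener transaction, and
 * only when the backend advertises VHOST_BACKEND_F_IOTLB_BATCH; the flag is
 * reset in vhost_vdpa_listener_commit().
 */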

static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
{
    if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
        !v->iotlb_batch_begin_sent) {
        vhost_vdpa_listener_begin_batch(v);
    }

    v->iotlb_batch_begin_sent = true;
}
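
/*
 * Memory listener commit hook: if a batch was opened, close it with a
 * VHOST_IOTLB_BATCH_END message and clear iotlb_batch_begin_sent.
 */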

static void vhost_vdpa_listener_commit(MemoryListener *listener)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    struct vhost_dev *dev = v->dev;
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;

    if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
        return;
    }

    if (!v->iotlb_batch_begin_sent) {
        return;
    }

    msg.type = v->msg_type;
    msg.iotlb.type = VHOST_IOTLB_BATCH_END;

    trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }

    v->iotlb_batch_begin_sent = false;
}
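
/*
 * Map a new RAM section into the device. With shadow virtqueues enabled, an
 * IOVA is first allocated from the IOVA tree; otherwise the section's guest
 * physical address is used as the IOVA directly.
 */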

static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    DMAMap mem_region = {};
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    void *vaddr;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);
    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    memory_region_ref(section->mr);

    /* Here we assume that memory_region_is_ram(section->mr) == true */

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
                                         vaddr, section->readonly);

    llsize = int128_sub(llend, int128_make64(iova));
    if (v->shadow_vqs_enabled) {
        int r;

        mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr;
        mem_region.size = int128_get64(llsize) - 1;
        mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly);

        r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
        if (unlikely(r != IOVA_OK)) {
            error_report("Can't allocate a mapping (%d)", r);
            goto fail;
        }

        iova = mem_region.iova;
    }

    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
                             vaddr, section->readonly);
    if (ret) {
        error_report("vhost vdpa map fail!");
        goto fail_map;
    }

    return;

fail_map:
    if (v->shadow_vqs_enabled) {
        vhost_iova_tree_remove(v->iova_tree, mem_region);
    }

fail:
    /*
     * On the initfn path, store the first error in the container so we can
     * gracefully fail. At runtime, there's not much we can do other than
     * throw a hardware error.
     */
    error_report("vhost-vdpa: DMA mapping failed, unable to continue");
    return;
}
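
/*
 * Tear down the mapping of a RAM section. With shadow virtqueues enabled,
 * the IOVA is looked up in (and removed from) the IOVA tree before the
 * unmap message is sent to the device.
 */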

static void vhost_vdpa_listener_region_del(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);

    trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));

    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    if (v->shadow_vqs_enabled) {
        const DMAMap *result;
        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
                            section->offset_within_region +
                            (iova - section->offset_within_address_space);
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
        };

        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
        if (!result) {
            /* The memory listener map wasn't mapped */
            return;
        }
        iova = result->iova;
        vhost_iova_tree_remove(v->iova_tree, *result);
    }
    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
    if (ret) {
        error_report("vhost_vdpa dma unmap error!");
    }

    memory_region_unref(section->mr);
}

/*
 * The IOTLB API is used by vhost-vdpa, which requires incremental updating
 * of the mapping, so we cannot use the generic vhost memory listener, which
 * depends on the addnop() callback.
 */
static const MemoryListener vhost_vdpa_memory_listener = {
    .name = "vhost-vdpa",
    .commit = vhost_vdpa_listener_commit,
    .region_add = vhost_vdpa_listener_region_add,
    .region_del = vhost_vdpa_listener_region_del,
};

static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
                           void *arg)
{
    struct vhost_vdpa *v = dev->opaque;
    int fd = v->device_fd;
    int ret;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);

    ret = ioctl(fd, request, arg);
    return ret < 0 ? -errno : ret;
}

static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
{
    uint8_t s;
    int ret;

    trace_vhost_vdpa_add_status(dev, status);
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    s |= status;

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    if (!(s & status)) {
        return -EIO;
    }

    return 0;
}

static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
{
    int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
                              &v->iova_range);
    if (ret != 0) {
        v->iova_range.first = 0;
        v->iova_range.last = UINT64_MAX;
    }

    trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
                                    v->iova_range.last);
}

/*
 * This function is intended for requests that only need to be applied once.
 * Such requests typically occur at the beginning of operation, before the
 * queues are set up. It should not be used for requests that should only be
 * applied after all queues are set, which would need to check
 * dev->vq_index_end instead.
 */
static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    return v->index == 0;
}

static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
                                       uint64_t *features)
{
    int ret;

    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
    trace_vhost_vdpa_get_features(dev, *features);
    return ret;
}

static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                               Error **errp)
{
    g_autoptr(GPtrArray) shadow_vqs = NULL;
    uint64_t dev_features, svq_features;
    int r;
    bool ok;

    if (!v->shadow_vqs_enabled) {
        return 0;
    }

    r = vhost_vdpa_get_dev_features(hdev, &dev_features);
    if (r != 0) {
        error_setg_errno(errp, -r, "Can't get vdpa device features");
        return r;
    }

    svq_features = dev_features;
    ok = vhost_svq_valid_features(svq_features, errp);
    if (unlikely(!ok)) {
        return -1;
    }

    shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
    for (unsigned n = 0; n < hdev->nvqs; ++n) {
        g_autoptr(VhostShadowVirtqueue) svq;

        svq = vhost_svq_new(v->iova_tree, v->shadow_vq_ops,
                            v->shadow_vq_ops_opaque);
        if (unlikely(!svq)) {
            error_setg(errp, "Cannot create svq %u", n);
            return -1;
        }
        g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
    }

    v->shadow_vqs = g_steal_pointer(&shadow_vqs);
    return 0;
}

static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
{
    struct vhost_vdpa *v;
    int ret;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    trace_vhost_vdpa_init(dev, opaque);

    /*
     * Similar to VFIO, we end up pinning all guest memory and have to
     * disable discarding of RAM.
     */
    ret = ram_block_discard_disable(true);
    if (ret) {
        error_report("Cannot disable discarding of RAM");
        return ret;
    }

    v = opaque;
    v->dev = dev;
    dev->opaque = opaque;
    v->listener = vhost_vdpa_memory_listener;
    v->msg_type = VHOST_IOTLB_MSG_V2;
    ret = vhost_vdpa_init_svq(dev, v, errp);
    if (ret) {
        goto err;
    }

    vhost_vdpa_get_iova_range(v);

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                               VIRTIO_CONFIG_S_DRIVER);

    return 0;

err:
    ram_block_discard_disable(false);
    return ret;
}

static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
                                            int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;

    n = &v->notifier[queue_index];

    if (n->addr) {
        virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
        object_unparent(OBJECT(&n->mr));
        munmap(n->addr, page_size);
        n->addr = NULL;
    }
}

static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;
    int fd = v->device_fd;
    void *addr;
    char *name;

    vhost_vdpa_host_notifier_uninit(dev, queue_index);

    n = &v->notifier[queue_index];

    addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
                queue_index * page_size);
    if (addr == MAP_FAILED) {
        return -1;
    }

    name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
                           v, queue_index);
    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
                                      page_size, addr);
    g_free(name);

    if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
        object_unparent(OBJECT(&n->mr));
        munmap(addr, page_size);
        return -1;
    }
    n->addr = addr;

    return 0;
}

static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
{
    int i;

    for (i = dev->vq_index; i < dev->vq_index + n; i++) {
        vhost_vdpa_host_notifier_uninit(dev, i);
    }
}

static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int i;

    if (v->shadow_vqs_enabled) {
        /* FIXME SVQ is not compatible with host notifiers mr */
        return;
    }

    for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
        if (vhost_vdpa_host_notifier_init(dev, i)) {
            goto err;
        }
    }

    return;

err:
    vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
    return;
}

static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    size_t idx;

    if (!v->shadow_vqs) {
        return;
    }

    for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
    }
    g_ptr_array_free(v->shadow_vqs, true);
}

static int vhost_vdpa_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    v = dev->opaque;
    trace_vhost_vdpa_cleanup(dev, v);
    vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    memory_listener_unregister(&v->listener);
    vhost_vdpa_svq_cleanup(dev);

    dev->opaque = NULL;
    ram_block_discard_disable(false);

    return 0;
}

static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
{
    trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
    return INT_MAX;
}

static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
                                    struct vhost_memory *mem)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
        int i;
        for (i = 0; i < mem->nregions; i++) {
            trace_vhost_vdpa_dump_regions(dev, i,
                                          mem->regions[i].guest_phys_addr,
                                          mem->regions[i].memory_size,
                                          mem->regions[i].userspace_addr,
                                          mem->regions[i].flags_padding);
        }
    }
    if (mem->padding) {
        return -EINVAL;
    }

    return 0;
}

static int vhost_vdpa_set_features(struct vhost_dev *dev,
                                   uint64_t features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    if (v->shadow_vqs_enabled) {
        if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
            /*
             * QEMU is just trying to enable or disable logging. SVQ handles
             * this separately, so there is no need to forward it.
             */
            v->acked_features = features;
            return 0;
        }

        v->acked_features = features;

        /* We must not ack _F_LOG if SVQ is enabled */
        features &= ~BIT_ULL(VHOST_F_LOG_ALL);
    }

    trace_vhost_vdpa_set_features(dev, features);
    ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
    if (ret) {
        return ret;
    }

    return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
}
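
/*
 * Negotiate the backend capabilities this code relies on: v2 IOTLB messages
 * and IOTLB batching. Only the first vhost device forwards the acked set to
 * the kernel.
 */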

static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
{
    uint64_t features;
    uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
        0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
    int r;

    if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
        return -EFAULT;
    }

    features &= f;

    if (vhost_vdpa_first_dev(dev)) {
        r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
        if (r) {
            return -EFAULT;
        }
    }

    dev->backend_cap = features;

    return 0;
}

static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
                                    uint32_t *device_id)
{
    int ret;

    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
    trace_vhost_vdpa_get_device_id(dev, *device_id);
    return ret;
}

static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
{
    if (!v->shadow_vqs_enabled) {
        return;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);

        vhost_svq_stop(svq);
    }
}

static int vhost_vdpa_reset_device(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;
    uint8_t status = 0;

    vhost_vdpa_reset_svq(v);

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
    trace_vhost_vdpa_reset_device(dev, status);
    return ret;
}

static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
{
    assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);

    trace_vhost_vdpa_get_vq_index(dev, idx, idx);
    return idx;
}
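
/*
 * Enable every vring owned by this vhost device via
 * VHOST_VDPA_SET_VRING_ENABLE.
 */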

static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
{
    int i;

    trace_vhost_vdpa_set_vring_ready(dev);
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_vring_state state = {
            .index = dev->vq_index + i,
            .num = 1,
        };
        vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
    }
    return 0;
}

static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
                                   uint32_t config_len)
{
    int b, len;
    char line[QEMU_HEXDUMP_LINE_LEN];

    for (b = 0; b < config_len; b += 16) {
        len = config_len - b;
        qemu_hexdump_line(line, b, config, len, false);
        trace_vhost_vdpa_dump_config(dev, line);
    }
}

static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
                                 uint32_t offset, uint32_t size,
                                 uint32_t flags)
{
    struct vhost_vdpa_config *config;
    int ret;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);

    trace_vhost_vdpa_set_config(dev, offset, size, flags);
    config = g_malloc(size + config_size);
    config->off = offset;
    config->len = size;
    memcpy(config->buf, data, size);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, data, size);
    }
    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
    g_free(config);
    return ret;
}

static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
                                 uint32_t config_len, Error **errp)
{
    struct vhost_vdpa_config *v_config;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    int ret;

    trace_vhost_vdpa_get_config(dev, config, config_len);
    v_config = g_malloc(config_len + config_size);
    v_config->len = config_len;
    v_config->off = 0;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
    memcpy(config, v_config->buf, config_len);
    g_free(v_config);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, config, config_len);
    }
    return ret;
}

static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
                                         struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
}

static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
}

static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
}

static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
                                         struct vhost_vring_addr *addr)
{
    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
                                    addr->desc_user_addr, addr->used_user_addr,
                                    addr->avail_user_addr,
                                    addr->log_guest_addr);

    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
}

/**
 * Set the shadow virtqueue descriptors to the device
 *
 * @dev: The vhost device model
 * @svq: The shadow virtqueue
 * @idx: The index of the virtqueue in the vhost device
 * @errp: Error
 *
 * Note that this function does not rewind the kick file descriptor if it
 * cannot set the call one.
 */
static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
                                  VhostShadowVirtqueue *svq, unsigned idx,
                                  Error **errp)
{
    struct vhost_vring_file file = {
        .index = dev->vq_index + idx,
    };
    const EventNotifier *event_notifier = &svq->hdev_kick;
    int r;

    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_kick(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device kick fd");
        return r;
    }

    event_notifier = &svq->hdev_call;
    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_call(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device call fd");
    }

    return r;
}

/**
 * Unmap a SVQ area in the device
 */
static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr)
{
    const DMAMap needle = {
        .translated_addr = addr,
    };
    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, &needle);
    hwaddr size;
    int r;

    if (unlikely(!result)) {
        error_report("Unable to find SVQ address to unmap");
        return;
    }

    size = ROUND_UP(result->size, qemu_real_host_page_size());
    r = vhost_vdpa_dma_unmap(v, result->iova, size);
    if (unlikely(r < 0)) {
        error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r);
        return;
    }

    vhost_iova_tree_remove(v->iova_tree, *result);
}

static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                       const VhostShadowVirtqueue *svq)
{
    struct vhost_vdpa *v = dev->opaque;
    struct vhost_vring_addr svq_addr;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr);

    vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr);
}

/**
 * Map the SVQ area in the device
 *
 * @v: Vhost-vdpa device
 * @needle: The area to search an iova for
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
                                    Error **errp)
{
    int r;

    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
    if (unlikely(r != IOVA_OK)) {
        error_setg(errp, "Cannot allocate iova (%d)", r);
        return false;
    }

    r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
                           (void *)(uintptr_t)needle->translated_addr,
                           needle->perm == IOMMU_RO);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Cannot map region to device");
        vhost_iova_tree_remove(v->iova_tree, *needle);
    }

    return r == 0;
}

/**
 * Map the shadow virtqueue rings in the device
 *
 * @dev: The vhost device
 * @svq: The shadow virtqueue
 * @addr: Assigned IOVA addresses
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                     const VhostShadowVirtqueue *svq,
                                     struct vhost_vring_addr *addr,
                                     Error **errp)
{
    ERRP_GUARD();
    DMAMap device_region, driver_region;
    struct vhost_vring_addr svq_addr;
    struct vhost_vdpa *v = dev->opaque;
    size_t device_size = vhost_svq_device_area_size(svq);
    size_t driver_size = vhost_svq_driver_area_size(svq);
    size_t avail_offset;
    bool ok;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    driver_region = (DMAMap) {
        .translated_addr = svq_addr.desc_user_addr,
        .size = driver_size - 1,
        .perm = IOMMU_RO,
    };
    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq driver region: ");
        return false;
    }
    addr->desc_user_addr = driver_region.iova;
    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
    addr->avail_user_addr = driver_region.iova + avail_offset;

    device_region = (DMAMap) {
        .translated_addr = svq_addr.used_user_addr,
        .size = device_size - 1,
        .perm = IOMMU_RW,
    };
    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq device region: ");
        vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr);
    }
    addr->used_user_addr = device_region.iova;

    return ok;
}

static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                 VhostShadowVirtqueue *svq, unsigned idx,
                                 Error **errp)
{
    uint16_t vq_index = dev->vq_index + idx;
    struct vhost_vring_state s = {
        .index = vq_index,
    };
    int r;

    r = vhost_vdpa_set_dev_vring_base(dev, &s);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set vring base");
        return false;
    }

    r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
    return r == 0;
}

static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    Error *err = NULL;
    unsigned i;

    if (!v->shadow_vqs) {
        return true;
    }

    for (i = 0; i < v->shadow_vqs->len; ++i) {
        VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        struct vhost_vring_addr addr = {
            .index = dev->vq_index + i,
        };
        int r;
        bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
        if (unlikely(!ok)) {
            goto err;
        }

        vhost_svq_start(svq, dev->vdev, vq);
        ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
        if (unlikely(!ok)) {
            goto err_map;
        }

        /* Override vring GPA set by vhost subsystem */
        r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
        if (unlikely(r != 0)) {
            error_setg_errno(&err, -r, "Cannot set device address");
            goto err_set_addr;
        }
    }

    return true;

err_set_addr:
    vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));

err_map:
    vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));

err:
    error_reportf_err(err, "Cannot setup SVQ %u: ", i);
    for (unsigned j = 0; j < i; ++j) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);

        vhost_vdpa_svq_unmap_rings(dev, svq);
        vhost_svq_stop(svq);
    }

    return false;
}
*dev
)
1083 struct vhost_vdpa
*v
= dev
->opaque
;
1085 if (!v
->shadow_vqs
) {
1089 for (unsigned i
= 0; i
< v
->shadow_vqs
->len
; ++i
) {
1090 VhostShadowVirtqueue
*svq
= g_ptr_array_index(v
->shadow_vqs
, i
);
1091 vhost_vdpa_svq_unmap_rings(dev
, svq
);
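
/*
 * Start or stop the device. On start, host notifiers and (optionally) shadow
 * virtqueues are set up and the vrings are enabled; the memory listener is
 * registered and DRIVER_OK is set only when the last vhost device of the
 * virtio device is started (dev->vq_index_end).
 */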

static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
{
    struct vhost_vdpa *v = dev->opaque;
    bool ok;

    trace_vhost_vdpa_dev_start(dev, started);

    if (started) {
        vhost_vdpa_host_notifiers_init(dev);
        ok = vhost_vdpa_svqs_start(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_set_vring_ready(dev);
    } else {
        vhost_vdpa_svqs_stop(dev);
        vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    }

    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
        return 0;
    }

    if (started) {
        memory_listener_register(&v->listener, &address_space_memory);
        return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
    } else {
        vhost_vdpa_reset_device(dev);
        vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                                   VIRTIO_CONFIG_S_DRIVER);
        memory_listener_unregister(&v->listener);

        return 0;
    }
}
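
/*
 * The dirty log base is only forwarded by the first device, and it is
 * skipped entirely when shadow virtqueues are enabled, since SVQ handles
 * dirty logging separately (see vhost_vdpa_set_features()).
 */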

static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
                                   struct vhost_log *log)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
                                  log->log);
    return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
}

static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
                                     struct vhost_vring_addr *addr)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /*
         * The device vring addr was already set at device start. The SVQ
         * base is handled there as well.
         */
        return 0;
    }

    return vhost_vdpa_set_vring_dev_addr(dev, addr);
}

static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
                                    struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
}

static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;
    VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index);

    /*
     * vhost-vdpa devices do not support in-flight requests. Set all of them
     * as available.
     *
     * TODO: This is ok for networking, but other kinds of devices might
     * have problems with these retransmissions.
     */
    while (virtqueue_rewind(vq, 1)) {
        continue;
    }

    if (v->shadow_vqs_enabled) {
        /*
         * The device vring base was already set at device start. The SVQ
         * base is handled there as well.
         */
        return 0;
    }

    return vhost_vdpa_set_dev_vring_base(dev, ring);
}

static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (v->shadow_vqs_enabled) {
        ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
        return 0;
    }

    ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
    trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
    return ret;
}
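
/*
 * With shadow virtqueues enabled, the guest kick fd is wired to the shadow
 * virtqueue rather than to the device; the device-facing kick fd was already
 * set up in vhost_vdpa_svq_set_fds().
 */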

static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;
    int vdpa_idx = file->index - dev->vq_index;

    if (v->shadow_vqs_enabled) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
        vhost_svq_set_svq_kick_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_kick(dev, file);
    }
}

static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        int vdpa_idx = file->index - dev->vq_index;
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);

        vhost_svq_set_svq_call_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_call(dev, file);
    }
}

static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                   uint64_t *features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret = vhost_vdpa_get_dev_features(dev, features);

    if (ret == 0 && v->shadow_vqs_enabled) {
        /* Add SVQ logging capabilities */
        *features |= BIT_ULL(VHOST_F_LOG_ALL);
    }

    return ret;
}

static int vhost_vdpa_set_owner(struct vhost_dev *dev)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_owner(dev);
    return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
}
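
/*
 * Report the vring addresses as guest physical addresses rather than
 * userspace virtual addresses; the memory listener has already made the
 * guest address space visible to the device.
 */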

static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
                                  struct vhost_vring_addr *addr,
                                  struct vhost_virtqueue *vq)
{
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
    addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
    addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
    trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
                                 addr->avail_user_addr, addr->used_user_addr);
    return 0;
}

static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
    return true;
}

const VhostOps vdpa_ops = {
    .backend_type = VHOST_BACKEND_TYPE_VDPA,
    .vhost_backend_init = vhost_vdpa_init,
    .vhost_backend_cleanup = vhost_vdpa_cleanup,
    .vhost_set_log_base = vhost_vdpa_set_log_base,
    .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
    .vhost_set_vring_num = vhost_vdpa_set_vring_num,
    .vhost_set_vring_base = vhost_vdpa_set_vring_base,
    .vhost_get_vring_base = vhost_vdpa_get_vring_base,
    .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
    .vhost_set_vring_call = vhost_vdpa_set_vring_call,
    .vhost_get_features = vhost_vdpa_get_features,
    .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
    .vhost_set_owner = vhost_vdpa_set_owner,
    .vhost_set_vring_endian = NULL,
    .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
    .vhost_set_mem_table = vhost_vdpa_set_mem_table,
    .vhost_set_features = vhost_vdpa_set_features,
    .vhost_reset_device = vhost_vdpa_reset_device,
    .vhost_get_vq_index = vhost_vdpa_get_vq_index,
    .vhost_get_config = vhost_vdpa_get_config,
    .vhost_set_config = vhost_vdpa_set_config,
    .vhost_requires_shm_log = NULL,
    .vhost_migration_done = NULL,
    .vhost_backend_can_merge = NULL,
    .vhost_net_set_mtu = NULL,
    .vhost_set_iotlb_callback = NULL,
    .vhost_send_device_iotlb_msg = NULL,
    .vhost_dev_start = vhost_vdpa_dev_start,
    .vhost_get_device_id = vhost_vdpa_get_device_id,
    .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
    .vhost_force_iommu = vhost_vdpa_force_iommu,
};