/*
 * vhost-vdpa
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <linux/vhost.h>
#include <linux/vfio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-backend.h"
#include "hw/virtio/virtio-net.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"
#include "hw/virtio/vhost-vdpa.h"
#include "exec/address-spaces.h"
#include "qemu/cutils.h"
#include "qemu/main-loop.h"
#include "cpu.h"
#include "trace.h"
#include "qapi/error.h"

/*
 * Return one past the end of the section. Be careful with uint64_t
 * conversions!
 */
static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
{
    Int128 llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));

    return llend;
}

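/*
 * Return true if the memory listener should skip this section: sections that
 * are neither RAM nor IOMMU, protected or ram-device regions, and sections
 * that fall outside the device's usable IOVA range.
 */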
static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
                                                uint64_t iova_min,
                                                uint64_t iova_max)
{
    Int128 llend;

    if ((!memory_region_is_ram(section->mr) &&
         !memory_region_is_iommu(section->mr)) ||
        memory_region_is_protected(section->mr) ||
        /* vhost-vDPA doesn't allow MMIO to be mapped */
        memory_region_is_ram_device(section->mr)) {
        return true;
    }

    if (section->offset_within_address_space < iova_min) {
        error_report("RAM section out of device range (min=0x%" PRIx64
                     ", addr=0x%" HWADDR_PRIx ")",
                     iova_min, section->offset_within_address_space);
        return true;
    }

    llend = vhost_vdpa_section_end(section);
    if (int128_gt(llend, int128_make64(iova_max))) {
        error_report("RAM section out of device range (max=0x%" PRIx64
                     ", end addr=0x%" PRIx64 ")",
                     iova_max, int128_get64(llend));
        return true;
    }

    return false;
}

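/*
 * Map [iova, iova + size) to host VA @vaddr by writing a VHOST_IOTLB_UPDATE
 * message to the vhost-vdpa device fd.
 */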
static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
                              void *vaddr, bool readonly)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
    msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
    msg.iotlb.type = VHOST_IOTLB_UPDATE;

    trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
                             msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}

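/*
 * Remove the mapping at [iova, iova + size) by writing a
 * VHOST_IOTLB_INVALIDATE message to the vhost-vdpa device fd.
 */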
static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova,
                                hwaddr size)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.type = VHOST_IOTLB_INVALIDATE;

    trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
                               msg.iotlb.size, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}

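/*
 * Open an IOTLB batch with VHOST_IOTLB_BATCH_BEGIN so that the following
 * map/unmap messages can be applied by the device as one batch, closed by
 * VHOST_IOTLB_BATCH_END in the listener commit callback.
 */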
static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
{
    int fd = v->device_fd;
    struct vhost_msg_v2 msg = {
        .type = v->msg_type,
        .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
    };

    trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }
}

static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
{
    if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
        !v->iotlb_batch_begin_sent) {
        vhost_vdpa_listener_begin_batch(v);
    }

    v->iotlb_batch_begin_sent = true;
}

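/*
 * Listener commit callback: if batching is supported and a batch was opened
 * during this update cycle, close it with VHOST_IOTLB_BATCH_END.
 */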
static void vhost_vdpa_listener_commit(MemoryListener *listener)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    struct vhost_dev *dev = v->dev;
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;

    if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
        return;
    }

    if (!v->iotlb_batch_begin_sent) {
        return;
    }

    msg.type = v->msg_type;
    msg.iotlb.type = VHOST_IOTLB_BATCH_END;

    trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }

    v->iotlb_batch_begin_sent = false;
}

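/*
 * Listener callback for a newly added RAM section: compute the host VA,
 * allocate an IOVA from the IOVA tree when shadow virtqueues are enabled,
 * and map the section into the device.
 */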
static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    void *vaddr;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);
    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    memory_region_ref(section->mr);

    /* Here we assume that memory_region_is_ram(section->mr) == true */

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
                                         vaddr, section->readonly);

    llsize = int128_sub(llend, int128_make64(iova));
    if (v->shadow_vqs_enabled) {
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
        };

        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
        if (unlikely(r != IOVA_OK)) {
            error_report("Can't allocate a mapping (%d)", r);
            goto fail;
        }

        iova = mem_region.iova;
    }

    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
                             vaddr, section->readonly);
    if (ret) {
        error_report("vhost vdpa map fail!");
        goto fail;
    }

    return;

fail:
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail. At runtime, there's not much we can do other
     * than throw a hardware error.
     */
    error_report("vhost-vdpa: DMA mapping failed, unable to continue");
    return;
}

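/*
 * Listener callback for a removed RAM section: look up the IOVA assigned to
 * it (when shadow virtqueues are enabled), drop it from the IOVA tree and
 * unmap it from the device.
 */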
static void vhost_vdpa_listener_region_del(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);

    trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));

    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    if (v->shadow_vqs_enabled) {
        const DMAMap *result;
        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
        };

        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
        iova = result->iova;
        vhost_iova_tree_remove(v->iova_tree, &mem_region);
    }

    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
    if (ret) {
        error_report("vhost_vdpa dma unmap error!");
    }

    memory_region_unref(section->mr);
}

/*
 * The IOTLB API is used by vhost-vdpa, which requires incremental updating
 * of the mapping. So we cannot use the generic vhost memory listener, which
 * depends on addnop().
 */
static const MemoryListener vhost_vdpa_memory_listener = {
    .name = "vhost-vdpa",
    .commit = vhost_vdpa_listener_commit,
    .region_add = vhost_vdpa_listener_region_add,
    .region_del = vhost_vdpa_listener_region_del,
};

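/*
 * Issue a vhost-vdpa ioctl on the device fd, converting failures into
 * negative errno values as the vhost core expects.
 */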
static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
                           void *arg)
{
    struct vhost_vdpa *v = dev->opaque;
    int fd = v->device_fd;
    int ret;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);

    ret = ioctl(fd, request, arg);
    return ret < 0 ? -errno : ret;
}

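/*
 * OR @status into the device status and read it back to verify that the
 * device accepted the new bits.
 */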
static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
{
    uint8_t s;
    int ret;

    trace_vhost_vdpa_add_status(dev, status);
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    s |= status;

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    if (!(s & status)) {
        return -EIO;
    }

    return 0;
}

static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
{
    int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
                              &v->iova_range);
    if (ret != 0) {
        v->iova_range.first = 0;
        v->iova_range.last = UINT64_MAX;
    }

    trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
                                    v->iova_range.last);
}

/*
 * This function is intended for requests that only need to be applied once.
 * Typically such a request occurs at the beginning of operation, before the
 * queues are set up. It should not be used for requests that keep operating
 * until all queues are set, which would need to check dev->vq_index_end
 * instead.
 */
static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    return v->index == 0;
}

static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
                                       uint64_t *features)
{
    int ret;

    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
    trace_vhost_vdpa_get_features(dev, *features);
    return ret;
}

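/*
 * Allocate the shadow virtqueues for this vhost device, after checking that
 * the device features are compatible with SVQ.
 */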
static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                               Error **errp)
{
    g_autoptr(GPtrArray) shadow_vqs = NULL;
    uint64_t dev_features, svq_features;
    int r;
    bool ok;

    if (!v->shadow_vqs_enabled) {
        return 0;
    }

    r = vhost_vdpa_get_dev_features(hdev, &dev_features);
    if (r != 0) {
        error_setg_errno(errp, -r, "Can't get vdpa device features");
        return r;
    }

    svq_features = dev_features;
    ok = vhost_svq_valid_features(svq_features, errp);
    if (unlikely(!ok)) {
        return -1;
    }

    shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
    for (unsigned n = 0; n < hdev->nvqs; ++n) {
        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);

        if (unlikely(!svq)) {
            error_setg(errp, "Cannot create svq %u", n);
            return -1;
        }
        g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
    }

    v->shadow_vqs = g_steal_pointer(&shadow_vqs);
    return 0;
}

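/*
 * Backend init: disable RAM discard (all guest memory ends up pinned), set
 * up the memory listener, IOTLB message type and shadow virtqueues, and
 * query the device's usable IOVA range.
 */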
static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
{
    struct vhost_vdpa *v;
    int ret;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    trace_vhost_vdpa_init(dev, opaque);

    /*
     * Similar to VFIO, we end up pinning all guest memory and have to
     * disable discarding of RAM.
     */
    ret = ram_block_discard_disable(true);
    if (ret) {
        error_report("Cannot set discarding of RAM broken");
        return ret;
    }

    v = opaque;
    v->dev = dev;
    dev->opaque = opaque;
    v->listener = vhost_vdpa_memory_listener;
    v->msg_type = VHOST_IOTLB_MSG_V2;
    ret = vhost_vdpa_init_svq(dev, v, errp);
    if (ret) {
        goto err;
    }

    vhost_vdpa_get_iova_range(v);

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                               VIRTIO_CONFIG_S_DRIVER);

    return 0;

err:
    ram_block_discard_disable(false);
    return ret;
}

static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
                                            int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;

    n = &v->notifier[queue_index];

    if (n->addr) {
        virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
        object_unparent(OBJECT(&n->mr));
        munmap(n->addr, page_size);
        n->addr = NULL;
    }
}

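/*
 * mmap the host notifier (doorbell) page of this queue from the vhost-vdpa
 * device fd and expose it to the guest as a ram-device memory region.
 */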
static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;
    int fd = v->device_fd;
    void *addr;
    char *name;

    vhost_vdpa_host_notifier_uninit(dev, queue_index);

    n = &v->notifier[queue_index];

    addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
                queue_index * page_size);
    if (addr == MAP_FAILED) {
        goto err;
    }

    name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
                           v, queue_index);
    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
                                      page_size, addr);
    g_free(name);

    if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
        object_unparent(OBJECT(&n->mr));
        munmap(addr, page_size);
        goto err;
    }
    n->addr = addr;

    return 0;

err:
    return -1;
}

static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
{
    int i;

    for (i = dev->vq_index; i < dev->vq_index + n; i++) {
        vhost_vdpa_host_notifier_uninit(dev, i);
    }
}

static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int i;

    if (v->shadow_vqs_enabled) {
        /* FIXME SVQ is not compatible with host notifiers mr */
        return;
    }

    for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
        if (vhost_vdpa_host_notifier_init(dev, i)) {
            goto err;
        }
    }

    return;

err:
    vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
    return;
}

static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    size_t idx;

    if (!v->shadow_vqs) {
        return;
    }

    for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
    }
    g_ptr_array_free(v->shadow_vqs, true);
}

static int vhost_vdpa_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v;
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    v = dev->opaque;
    trace_vhost_vdpa_cleanup(dev, v);
    vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    memory_listener_unregister(&v->listener);
    vhost_vdpa_svq_cleanup(dev);

    dev->opaque = NULL;
    ram_block_discard_disable(false);

    return 0;
}

static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
{
    trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
    return INT_MAX;
}

static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
                                    struct vhost_memory *mem)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
        int i;
        for (i = 0; i < mem->nregions; i++) {
            trace_vhost_vdpa_dump_regions(dev, i,
                                          mem->regions[i].guest_phys_addr,
                                          mem->regions[i].memory_size,
                                          mem->regions[i].userspace_addr,
                                          mem->regions[i].flags_padding);
        }
    }
    if (mem->padding) {
        return -EINVAL;
    }

    return 0;
}

static int vhost_vdpa_set_features(struct vhost_dev *dev,
                                   uint64_t features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    if (v->shadow_vqs_enabled) {
        if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
            /*
             * QEMU is just trying to enable or disable logging. SVQ handles
             * this separately, so no need to forward this.
             */
            v->acked_features = features;
            return 0;
        }

        v->acked_features = features;

        /* We must not ack _F_LOG if SVQ is enabled */
        features &= ~BIT_ULL(VHOST_F_LOG_ALL);
    }

    trace_vhost_vdpa_set_features(dev, features);
    ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
    if (ret) {
        return ret;
    }

    return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
}

static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
{
    uint64_t features;
    uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
        0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
    int r;

    if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
        return -EFAULT;
    }

    features &= f;

    if (vhost_vdpa_first_dev(dev)) {
        r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
        if (r) {
            return -EFAULT;
        }
    }

    dev->backend_cap = features;

    return 0;
}

static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
                                    uint32_t *device_id)
{
    int ret;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
    trace_vhost_vdpa_get_device_id(dev, *device_id);
    return ret;
}

static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
{
    if (!v->shadow_vqs_enabled) {
        return;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        vhost_svq_stop(svq);
    }
}

static int vhost_vdpa_reset_device(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;
    uint8_t status = 0;

    vhost_vdpa_reset_svq(v);

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
    trace_vhost_vdpa_reset_device(dev, status);
    return ret;
}

static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
{
    assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);

    trace_vhost_vdpa_get_vq_index(dev, idx, idx);
    return idx;
}

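/*
 * Enable every virtqueue of this vhost device with
 * VHOST_VDPA_SET_VRING_ENABLE.
 */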
static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
{
    int i;
    trace_vhost_vdpa_set_vring_ready(dev);
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_vring_state state = {
            .index = dev->vq_index + i,
            .num = 1,
        };
        vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
    }
    return 0;
}

static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
                                   uint32_t config_len)
{
    int b, len;
    char line[QEMU_HEXDUMP_LINE_LEN];

    for (b = 0; b < config_len; b += 16) {
        len = config_len - b;
        qemu_hexdump_line(line, b, config, len, false);
        trace_vhost_vdpa_dump_config(dev, line);
    }
}

static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
                                 uint32_t offset, uint32_t size,
                                 uint32_t flags)
{
    struct vhost_vdpa_config *config;
    int ret;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);

    trace_vhost_vdpa_set_config(dev, offset, size, flags);
    config = g_malloc(size + config_size);
    config->off = offset;
    config->len = size;
    memcpy(config->buf, data, size);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, data, size);
    }
    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
    g_free(config);
    return ret;
}

static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
                                 uint32_t config_len, Error **errp)
{
    struct vhost_vdpa_config *v_config;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    int ret;

    trace_vhost_vdpa_get_config(dev, config, config_len);
    v_config = g_malloc(config_len + config_size);
    v_config->len = config_len;
    v_config->off = 0;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
    memcpy(config, v_config->buf, config_len);
    g_free(v_config);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, config, config_len);
    }
    return ret;
}

static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
                                         struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
}

static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
}

static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
}

static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
                                         struct vhost_vring_addr *addr)
{
    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
                                    addr->desc_user_addr, addr->used_user_addr,
                                    addr->avail_user_addr,
                                    addr->log_guest_addr);

    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
}

/**
 * Set the shadow virtqueue descriptors to the device
 *
 * @dev: The vhost device model
 * @svq: The shadow virtqueue
 * @idx: The index of the virtqueue in the vhost device
 * @errp: Error
 *
 * Note that this function does not rewind the kick file descriptor if it
 * cannot set the call one.
 */
static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
                                  VhostShadowVirtqueue *svq, unsigned idx,
                                  Error **errp)
{
    struct vhost_vring_file file = {
        .index = dev->vq_index + idx,
    };
    const EventNotifier *event_notifier = &svq->hdev_kick;
    int r;

    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_kick(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device kick fd");
        return r;
    }

    event_notifier = &svq->hdev_call;
    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_call(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device call fd");
    }

    return r;
}

/**
 * Unmap a SVQ area in the device
 */
static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
                                      const DMAMap *needle)
{
    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
    hwaddr size;
    int r;

    if (unlikely(!result)) {
        error_report("Unable to find SVQ address to unmap");
        return false;
    }

    size = ROUND_UP(result->size, qemu_real_host_page_size());
    r = vhost_vdpa_dma_unmap(v, result->iova, size);
    return r == 0;
}

static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                       const VhostShadowVirtqueue *svq)
{
    DMAMap needle = {};
    struct vhost_vdpa *v = dev->opaque;
    struct vhost_vring_addr svq_addr;
    bool ok;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    needle.translated_addr = svq_addr.desc_user_addr;
    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
    if (unlikely(!ok)) {
        return false;
    }

    needle.translated_addr = svq_addr.used_user_addr;
    return vhost_vdpa_svq_unmap_ring(v, &needle);
}

/**
 * Map the SVQ area in the device
 *
 * @v: Vhost-vdpa device
 * @needle: The area to search an iova for
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
                                    Error **errp)
{
    int r;

    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
    if (unlikely(r != IOVA_OK)) {
        error_setg(errp, "Cannot allocate iova (%d)", r);
        return false;
    }

    r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
                           (void *)(uintptr_t)needle->translated_addr,
                           needle->perm == IOMMU_RO);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Cannot map region to device");
        vhost_iova_tree_remove(v->iova_tree, needle);
    }

    return r == 0;
}

/**
 * Map the shadow virtqueue rings in the device
 *
 * @dev: The vhost device
 * @svq: The shadow virtqueue
 * @addr: Assigned IOVA addresses
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                     const VhostShadowVirtqueue *svq,
                                     struct vhost_vring_addr *addr,
                                     Error **errp)
{
    DMAMap device_region, driver_region;
    struct vhost_vring_addr svq_addr;
    struct vhost_vdpa *v = dev->opaque;
    size_t device_size = vhost_svq_device_area_size(svq);
    size_t driver_size = vhost_svq_driver_area_size(svq);
    size_t avail_offset;
    bool ok;

    ERRP_GUARD();
    vhost_svq_get_vring_addr(svq, &svq_addr);

    driver_region = (DMAMap) {
        .translated_addr = svq_addr.desc_user_addr,
        .size = driver_size - 1,
        .perm = IOMMU_RO,
    };
    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq driver region: ");
        return false;
    }
    addr->desc_user_addr = driver_region.iova;
    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
    addr->avail_user_addr = driver_region.iova + avail_offset;

    device_region = (DMAMap) {
        .translated_addr = svq_addr.used_user_addr,
        .size = device_size - 1,
        .perm = IOMMU_RW,
    };
    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq device region: ");
        vhost_vdpa_svq_unmap_ring(v, &driver_region);
    }
    addr->used_user_addr = device_region.iova;

    return ok;
}

static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                 VhostShadowVirtqueue *svq, unsigned idx,
                                 Error **errp)
{
    uint16_t vq_index = dev->vq_index + idx;
    struct vhost_vring_state s = {
        .index = vq_index,
    };
    int r;

    r = vhost_vdpa_set_dev_vring_base(dev, &s);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set vring base");
        return false;
    }

    r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
    return r == 0;
}

static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    Error *err = NULL;
    unsigned i;

    if (!v->shadow_vqs) {
        return true;
    }

    for (i = 0; i < v->shadow_vqs->len; ++i) {
        VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        struct vhost_vring_addr addr = {
            .index = dev->vq_index + i,
        };
        int r;
        bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
        if (unlikely(!ok)) {
            goto err;
        }

        vhost_svq_start(svq, dev->vdev, vq);
        ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
        if (unlikely(!ok)) {
            goto err_map;
        }

        /* Override vring GPA set by vhost subsystem */
        r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
        if (unlikely(r != 0)) {
            error_setg_errno(&err, -r, "Cannot set device address");
            goto err_set_addr;
        }
    }

    return true;

err_set_addr:
    vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));

err_map:
    vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));

err:
    error_reportf_err(err, "Cannot setup SVQ %u: ", i);
    for (unsigned j = 0; j < i; ++j) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
        vhost_vdpa_svq_unmap_rings(dev, svq);
        vhost_svq_stop(svq);
    }

    return false;
}

static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    if (!v->shadow_vqs) {
        return true;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
        if (unlikely(!ok)) {
            return false;
        }
    }

    return true;
}

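/*
 * Start/stop callback: bring up (or tear down) host notifiers and shadow
 * virtqueues; once the last virtqueue group of the device has been
 * processed, register the memory listener and set DRIVER_OK, or reset the
 * device on stop.
 */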
static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
{
    struct vhost_vdpa *v = dev->opaque;
    bool ok;
    trace_vhost_vdpa_dev_start(dev, started);

    if (started) {
        vhost_vdpa_host_notifiers_init(dev);
        ok = vhost_vdpa_svqs_start(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_set_vring_ready(dev);
    } else {
        ok = vhost_vdpa_svqs_stop(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    }

    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
        return 0;
    }

    if (started) {
        memory_listener_register(&v->listener, &address_space_memory);
        return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
    } else {
        vhost_vdpa_reset_device(dev);
        vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                                   VIRTIO_CONFIG_S_DRIVER);
        memory_listener_unregister(&v->listener);

        return 0;
    }
}

static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
                                   struct vhost_log *log)
{
    struct vhost_vdpa *v = dev->opaque;
    if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
                                  log->log);
    return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
}

static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
                                     struct vhost_vring_addr *addr)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /*
         * Device vring addr was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_vring_dev_addr(dev, addr);
}

static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
                                    struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
}

static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /*
         * Device vring base was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_dev_vring_base(dev, ring);
}

static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;
    int vdpa_idx = ring->index - dev->vq_index;
    int ret;

    if (v->shadow_vqs_enabled) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);

        /*
         * Setting base as last used idx, so destination will see as available
         * all the entries that the device did not use, including the in-flight
         * processed ones.
         *
         * TODO: This is ok for networking, but other kinds of devices might
         * have problems with these retransmissions.
         */
        ring->num = svq->last_used_idx;
        return 0;
    }

    ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
    trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
    return ret;
}

static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;
    int vdpa_idx = file->index - dev->vq_index;

    if (v->shadow_vqs_enabled) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
        vhost_svq_set_svq_kick_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_kick(dev, file);
    }
}

static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        int vdpa_idx = file->index - dev->vq_index;
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);

        vhost_svq_set_svq_call_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_call(dev, file);
    }
}

static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                   uint64_t *features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret = vhost_vdpa_get_dev_features(dev, features);

    if (ret == 0 && v->shadow_vqs_enabled) {
        /* Add SVQ logging capabilities */
        *features |= BIT_ULL(VHOST_F_LOG_ALL);
    }

    return ret;
}

static int vhost_vdpa_set_owner(struct vhost_dev *dev)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_owner(dev);
    return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
}

static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
                                  struct vhost_vring_addr *addr,
                                  struct vhost_virtqueue *vq)
{
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
    addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
    addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
    trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
                                 addr->avail_user_addr, addr->used_user_addr);
    return 0;
}

static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
    return true;
}

const VhostOps vdpa_ops = {
    .backend_type = VHOST_BACKEND_TYPE_VDPA,
    .vhost_backend_init = vhost_vdpa_init,
    .vhost_backend_cleanup = vhost_vdpa_cleanup,
    .vhost_set_log_base = vhost_vdpa_set_log_base,
    .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
    .vhost_set_vring_num = vhost_vdpa_set_vring_num,
    .vhost_set_vring_base = vhost_vdpa_set_vring_base,
    .vhost_get_vring_base = vhost_vdpa_get_vring_base,
    .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
    .vhost_set_vring_call = vhost_vdpa_set_vring_call,
    .vhost_get_features = vhost_vdpa_get_features,
    .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
    .vhost_set_owner = vhost_vdpa_set_owner,
    .vhost_set_vring_endian = NULL,
    .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
    .vhost_set_mem_table = vhost_vdpa_set_mem_table,
    .vhost_set_features = vhost_vdpa_set_features,
    .vhost_reset_device = vhost_vdpa_reset_device,
    .vhost_get_vq_index = vhost_vdpa_get_vq_index,
    .vhost_get_config = vhost_vdpa_get_config,
    .vhost_set_config = vhost_vdpa_set_config,
    .vhost_requires_shm_log = NULL,
    .vhost_migration_done = NULL,
    .vhost_backend_can_merge = NULL,
    .vhost_net_set_mtu = NULL,
    .vhost_set_iotlb_callback = NULL,
    .vhost_send_device_iotlb_msg = NULL,
    .vhost_dev_start = vhost_vdpa_dev_start,
    .vhost_get_device_id = vhost_vdpa_get_device_id,
    .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
    .vhost_force_iommu = vhost_vdpa_force_iommu,
};