/*
 * vhost shadow virtqueue
 *
 * SPDX-FileCopyrightText: Red Hat, Inc. 2021
 * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "qemu/osdep.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"

#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qemu/main-loop.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "linux-headers/linux/vhost.h"

/**
 * Validate the transport device features that both the guest can use with the
 * SVQ and the SVQ can use with the device.
 *
 * @features: The device features
 * @errp: Error pointer
 */
bool vhost_svq_valid_features(uint64_t features, Error **errp)
{
    bool ok = true;
    uint64_t svq_features = features;

    for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END;
         ++b) {
        switch (b) {
        case VIRTIO_F_ANY_LAYOUT:
        case VIRTIO_RING_F_EVENT_IDX:
            continue;

        case VIRTIO_F_ACCESS_PLATFORM:
            /* SVQ trusts in the host's IOMMU to translate addresses */
        case VIRTIO_F_VERSION_1:
            /* SVQ trusts that the guest vring is little endian */
            if (!(svq_features & BIT_ULL(b))) {
                svq_features |= BIT_ULL(b);
                ok = false;
            }
            continue;

        default:
            if (svq_features & BIT_ULL(b)) {
                svq_features &= ~BIT_ULL(b);
                ok = false;
            }
        }
    }

    if (!ok) {
        error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64
                         ", ok: 0x%"PRIx64, features, svq_features);
    }
    return ok;
}
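
/*
 * Illustrative sketch (not part of the original file): a backend that wants
 * to shadow its virtqueues can gate SVQ setup on this check. "dev_features"
 * is a hypothetical variable holding the device feature bits.
 *
 *     Error *local_err = NULL;
 *     if (!vhost_svq_valid_features(dev_features, &local_err)) {
 *         error_report_err(local_err);
 *         return false;
 *     }
 */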

/**
 * Number of descriptors that the SVQ can make available from the guest.
 *
 * @svq: The svq
 */
static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
{
    return svq->num_free;
}

/**
 * Translate addresses between the qemu's virtual address and the SVQ IOVA
 *
 * @svq: Shadow VirtQueue
 * @addrs: Translated IOVA addresses
 * @iovec: Source qemu's VA addresses
 * @num: Length of iovec and minimum length of addrs
 */
static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
                                     hwaddr *addrs, const struct iovec *iovec,
                                     size_t num)
{
    if (num == 0) {
        return true;
    }

    for (size_t i = 0; i < num; ++i) {
        DMAMap needle = {
            .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
            .size = iovec[i].iov_len,
        };
        Int128 needle_last, map_last;
        size_t off;

        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
        /*
         * Map cannot be NULL since the iova map contains all guest space and
         * qemu already has a physical address mapped
         */
        if (unlikely(!map)) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "Invalid address 0x%"HWADDR_PRIx" given by guest",
                          needle.translated_addr);
            return false;
        }

        off = needle.translated_addr - map->translated_addr;
        addrs[i] = map->iova + off;

        needle_last = int128_add(int128_make64(needle.translated_addr),
                                 int128_makes64(iovec[i].iov_len - 1));
        map_last = int128_make64(map->translated_addr + map->size);
        if (unlikely(int128_gt(needle_last, map_last))) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "Guest buffer expands over iova range");
            return false;
        }
    }

    return true;
}
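
/*
 * Worked example with hypothetical numbers: for a map entry with
 * translated_addr = 0x7f00a0000000, iova = 0x100000 and size = 0x10000, an
 * iovec base of 0x7f00a0000200 is translated as:
 *
 *     off      = 0x7f00a0000200 - 0x7f00a0000000;   // 0x200
 *     addrs[i] = 0x100000 + 0x200;                  // IOVA 0x100200
 *
 * Buffers whose last byte falls beyond the map are rejected above, since one
 * descriptor cannot span two IOVA mappings.
 */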

/**
 * Write descriptors to SVQ vring
 *
 * @svq: The shadow virtqueue
 * @sg: Cache for hwaddr
 * @iovec: The iovec from the guest
 * @num: iovec length
 * @more_descs: True if more descriptors come in the chain
 * @write: True if they are writeable descriptors
 *
 * Return true on success, false otherwise (and log an error).
 */
static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
                                        const struct iovec *iovec, size_t num,
                                        bool more_descs, bool write)
{
    uint16_t i = svq->free_head, last = svq->free_head;
    unsigned n;
    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
    vring_desc_t *descs = svq->vring.desc;
    bool ok;

    if (num == 0) {
        return true;
    }

    ok = vhost_svq_translate_addr(svq, sg, iovec, num);
    if (unlikely(!ok)) {
        return false;
    }

    for (n = 0; n < num; n++) {
        if (more_descs || (n + 1 < num)) {
            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
            descs[i].next = cpu_to_le16(svq->desc_next[i]);
        } else {
            descs[i].flags = flags;
        }
        descs[i].addr = cpu_to_le64(sg[n]);
        descs[i].len = cpu_to_le32(iovec[n].iov_len);

        last = i;
        i = cpu_to_le16(svq->desc_next[i]);
    }

    svq->free_head = le16_to_cpu(svq->desc_next[last]);
    return true;
}

static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
                                const struct iovec *out_sg, size_t out_num,
                                const struct iovec *in_sg, size_t in_num,
                                unsigned *head)
{
    unsigned avail_idx;
    vring_avail_t *avail = svq->vring.avail;
    bool ok;
    g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num));

    *head = svq->free_head;

    /* We need some descriptors here */
    if (unlikely(!out_num && !in_num)) {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "Guest provided element with no descriptors");
        return false;
    }

    ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, in_num > 0,
                                     false);
    if (unlikely(!ok)) {
        return false;
    }

    ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, false, true);
    if (unlikely(!ok)) {
        return false;
    }

    /*
     * Put the entry in the available array (but don't update avail->idx until
     * they do sync).
     */
    avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1);
    avail->ring[avail_idx] = cpu_to_le16(*head);
    svq->shadow_avail_idx++;

    /* Update the avail index after writing the descriptor */
    smp_wmb();
    avail->idx = cpu_to_le16(svq->shadow_avail_idx);

    return true;
}
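
/*
 * Note on the masking of shadow_avail_idx above (illustrative): split
 * virtqueues have a power-of-two number of entries, so "idx & (num - 1)" is
 * a cheap "idx % num". For example, with num = 256:
 *
 *     uint16_t slot = 260 & (256 - 1);   // 4, same as 260 % 256
 */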

static void vhost_svq_kick(VhostShadowVirtqueue *svq)
{
    bool needs_kick;

    /*
     * We need to expose the available array entries before checking the used
     * flags
     */
    smp_mb();

    if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
        uint16_t avail_event =
            *(uint16_t *)(&svq->vring.used->ring[svq->vring.num]);
        needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx,
                                      svq->shadow_avail_idx - 1);
    } else {
        needs_kick = !(svq->vring.used->flags & VRING_USED_F_NO_NOTIFY);
    }

    if (!needs_kick) {
        return;
    }

    event_notifier_set(&svq->hdev_kick);
}

/**
 * Add an element to an SVQ.
 *
 * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full
 */
int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
                  size_t out_num, const struct iovec *in_sg, size_t in_num,
                  VirtQueueElement *elem)
{
    unsigned qemu_head;
    unsigned ndescs = in_num + out_num;
    bool ok;

    if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
        return -ENOSPC;
    }

    ok = vhost_svq_add_split(svq, out_sg, out_num, in_sg, in_num, &qemu_head);
    if (unlikely(!ok)) {
        return -EINVAL;
    }

    svq->num_free -= ndescs;
    svq->desc_state[qemu_head].elem = elem;
    svq->desc_state[qemu_head].ndescs = ndescs;
    vhost_svq_kick(svq);
    return 0;
}
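
/*
 * Illustrative usage sketch (hypothetical caller code): exposing one
 * request/response buffer pair to the device. "cmd", "cmd_len", "status"
 * and "elem" are assumed to exist in the caller.
 *
 *     struct iovec out = { .iov_base = cmd, .iov_len = cmd_len };
 *     struct iovec in = { .iov_base = &status, .iov_len = sizeof(status) };
 *     int r = vhost_svq_add(svq, &out, 1, &in, 1, elem);
 *     if (unlikely(r != 0)) {
 *         // -ENOSPC: retry once the device uses buffers; -EINVAL: drop it
 *     }
 */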

/* Convenience wrapper to add a guest's element to SVQ */
static int vhost_svq_add_element(VhostShadowVirtqueue *svq,
                                 VirtQueueElement *elem)
{
    return vhost_svq_add(svq, elem->out_sg, elem->out_num, elem->in_sg,
                         elem->in_num, elem);
}

/**
 * Forward available buffers.
 *
 * @svq: Shadow VirtQueue
 *
 * Note that this function does not guarantee that all guest's available
 * buffers are available to the device in SVQ avail ring. The guest may have
 * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in
 * qemu vaddr.
 *
 * If that happens, guest's kick notifications will be disabled until the
 * device uses some buffers.
 */
static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
{
    /* Clear event notifier */
    event_notifier_test_and_clear(&svq->svq_kick);

    /* Forward to the device as many available buffers as possible */
    do {
        virtio_queue_set_notification(svq->vq, false);

        while (true) {
            g_autofree VirtQueueElement *elem = NULL;
            int r;

            if (svq->next_guest_avail_elem) {
                elem = g_steal_pointer(&svq->next_guest_avail_elem);
            } else {
                elem = virtqueue_pop(svq->vq, sizeof(*elem));
            }

            if (!elem) {
                break;
            }

            if (svq->ops) {
                r = svq->ops->avail_handler(svq, elem, svq->ops_opaque);
            } else {
                r = vhost_svq_add_element(svq, elem);
            }
            if (unlikely(r != 0)) {
                if (r == -ENOSPC) {
                    /*
                     * This condition is possible since a contiguous buffer in
                     * GPA does not imply a contiguous buffer in qemu's VA
                     * scatter-gather segments. If that happens, the buffer
                     * exposed to the device needs to be a chain of descriptors
                     * at this moment.
                     *
                     * SVQ cannot hold more available buffers if we are here:
                     * queue the current guest descriptor and ignore kicks
                     * until some elements are used.
                     */
                    svq->next_guest_avail_elem = g_steal_pointer(&elem);
                }

                /* VQ is full or broken, just return and ignore kicks */
                return;
            }
            /* elem belongs to SVQ or external caller now */
            elem = NULL;
        }

        virtio_queue_set_notification(svq->vq, true);
    } while (!virtio_queue_empty(svq->vq));
}

/**
 * Handle guest's kick.
 *
 * @n: guest kick event notifier, the one that guest set to notify svq.
 */
static void vhost_handle_guest_kick_notifier(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
    event_notifier_test_and_clear(n);
    vhost_handle_guest_kick(svq);
}

static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
{
    uint16_t *used_idx = &svq->vring.used->idx;
    if (svq->last_used_idx != svq->shadow_used_idx) {
        return true;
    }

    svq->shadow_used_idx = cpu_to_le16(*(volatile uint16_t *)used_idx);

    return svq->last_used_idx != svq->shadow_used_idx;
}

/**
 * Enable vhost device calls after disabling them.
 *
 * @svq: The svq
 *
 * It returns false if there are pending used buffers from the vhost device,
 * avoiding the possible races between SVQ checking for more work and enabling
 * callbacks. True if SVQ used vring has no more pending buffers.
 */
static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq)
{
    if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
        uint16_t *used_event =
            (uint16_t *)&svq->vring.avail->ring[svq->vring.num];
        *used_event = svq->shadow_used_idx;
    } else {
        svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
    }

    /* Make sure the event is enabled before the read of used_idx */
    smp_mb();
    return !vhost_svq_more_used(svq);
}
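
/*
 * The return value closes the classic "missed notification" race: callers
 * are expected to loop until notifications are re-enabled with no pending
 * used buffers, as vhost_svq_flush() does below. Minimal sketch of the
 * pattern:
 *
 *     do {
 *         vhost_svq_disable_notification(svq);
 *         ... drain used buffers with vhost_svq_get_buf() ...
 *     } while (!vhost_svq_enable_notification(svq));
 */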

static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
{
    /*
     * No need to disable notification in the event idx case, since used event
     * index is already an index too far away.
     */
    if (!virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
        svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
    }
}

static uint16_t vhost_svq_last_desc_of_chain(const VhostShadowVirtqueue *svq,
                                             uint16_t num, uint16_t i)
{
    for (uint16_t j = 0; j < (num - 1); ++j) {
        i = le16_to_cpu(svq->desc_next[i]);
    }

    return i;
}

static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
                                           uint32_t *len)
{
    const vring_used_t *used = svq->vring.used;
    vring_used_elem_t used_elem;
    uint16_t last_used, last_used_chain, num;

    if (!vhost_svq_more_used(svq)) {
        return NULL;
    }

    /* Only get used array entries after they have been exposed by dev */
    smp_rmb();
    last_used = svq->last_used_idx & (svq->vring.num - 1);
    used_elem.id = le32_to_cpu(used->ring[last_used].id);
    used_elem.len = le32_to_cpu(used->ring[last_used].len);

    svq->last_used_idx++;
    if (unlikely(used_elem.id >= svq->vring.num)) {
        qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used",
                      svq->vdev->name, used_elem.id);
        return NULL;
    }

    if (unlikely(!svq->desc_state[used_elem.id].ndescs)) {
        qemu_log_mask(LOG_GUEST_ERROR,
            "Device %s says index %u is used, but it was not available",
            svq->vdev->name, used_elem.id);
        return NULL;
    }

    num = svq->desc_state[used_elem.id].ndescs;
    svq->desc_state[used_elem.id].ndescs = 0;
    last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
    svq->desc_next[last_used_chain] = svq->free_head;
    svq->free_head = used_elem.id;
    svq->num_free += num;

    *len = used_elem.len;
    return g_steal_pointer(&svq->desc_state[used_elem.id].elem);
}

/**
 * Push an element to SVQ, returning it to the guest.
 */
void vhost_svq_push_elem(VhostShadowVirtqueue *svq,
                         const VirtQueueElement *elem, uint32_t len)
{
    virtqueue_push(svq->vq, elem, len);
    if (svq->next_guest_avail_elem) {
        /*
         * Avail ring was full when vhost_svq_flush was called, so it's a
         * good moment to make more descriptors available if possible.
         */
        vhost_handle_guest_kick(svq);
    }
}
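
/*
 * Illustrative sketch (hypothetical code): an SVQ owner's avail_handler can
 * consume a guest element itself and hand it back to the guest with
 * vhost_svq_push_elem(), without ever exposing it to the device:
 *
 *     static int my_avail_handler(VhostShadowVirtqueue *svq,
 *                                 VirtQueueElement *elem, void *opaque)
 *     {
 *         uint32_t written = my_process_request(elem);  // hypothetical helper
 *         vhost_svq_push_elem(svq, elem, written);
 *         g_free(elem);
 *         return 0;
 *     }
 */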

static void vhost_svq_flush(VhostShadowVirtqueue *svq,
                            bool check_for_avail_queue)
{
    VirtQueue *vq = svq->vq;

    /* Forward as many used buffers as possible. */
    do {
        unsigned i = 0;

        vhost_svq_disable_notification(svq);
        while (true) {
            uint32_t len;
            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
            if (!elem) {
                break;
            }

            if (unlikely(i >= svq->vring.num)) {
                qemu_log_mask(LOG_GUEST_ERROR,
                        "More than %u used buffers obtained in a %u size SVQ",
                        i, svq->vring.num);
                virtqueue_fill(vq, elem, len, i);
                virtqueue_flush(vq, i);
                return;
            }
            virtqueue_fill(vq, elem, len, i++);
        }

        virtqueue_flush(vq, i);
        event_notifier_set(&svq->svq_call);

        if (check_for_avail_queue && svq->next_guest_avail_elem) {
            /*
             * Avail ring was full when vhost_svq_flush was called, so it's a
             * good moment to make more descriptors available if possible.
             */
            vhost_handle_guest_kick(svq);
        }
    } while (!vhost_svq_enable_notification(svq));
}

/**
 * Poll the SVQ to wait for the device to use the specified number
 * of elements and return the total length written by the device.
 *
 * This function races with the main event loop SVQ polling, so extra
 * synchronization is needed.
 *
 * @svq: The svq
 * @num: The number of elements that need to be used
 */
size_t vhost_svq_poll(VhostShadowVirtqueue *svq, size_t num)
{
    size_t len = 0;
    uint32_t r;

    while (num--) {
        int64_t start_us = g_get_monotonic_time();

        do {
            if (vhost_svq_more_used(svq)) {
                break;
            }

            if (unlikely(g_get_monotonic_time() - start_us > 10e6)) {
                return len;
            }
        } while (true);

        vhost_svq_get_buf(svq, &r);
        len += r;
    }

    return len;
}
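
/*
 * Illustrative sketch (hypothetical caller code): a synchronous
 * command/response exchange, e.g. a control-queue style request. The caller
 * owns the buffers, so no VirtQueueElement is attached, and it then polls
 * for a single used descriptor:
 *
 *     int r = vhost_svq_add(svq, &out, 1, &in, 1, NULL);
 *     if (unlikely(r != 0)) {
 *         return r;
 *     }
 *     size_t used_len = vhost_svq_poll(svq, 1);   // bytes written by device
 */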

/**
 * Forward used buffers.
 *
 * @n: hdev call event notifier, the one that device set to notify svq.
 *
 * Note that we are not making any buffers available in the loop, so there is
 * no way it runs more than virtqueue size times.
 */
static void vhost_svq_handle_call(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
                                             hdev_call);
    event_notifier_test_and_clear(n);
    vhost_svq_flush(svq, true);
}

/**
 * Set the call notifier for the SVQ to call the guest
 *
 * @svq: Shadow virtqueue
 * @call_fd: call notifier
 *
 * Called on BQL context.
 */
void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
{
    if (call_fd == VHOST_FILE_UNBIND) {
        /*
         * Fail event_notifier_set if called handling device call.
         *
         * SVQ still needs device notifications, since it needs to keep
         * forwarding used buffers even with the unbind.
         */
        memset(&svq->svq_call, 0, sizeof(svq->svq_call));
    } else {
        event_notifier_init_fd(&svq->svq_call, call_fd);
    }
}

/**
 * Get the shadow vq vring address.
 * @svq: Shadow virtqueue
 * @addr: Destination to store address
 */
void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
                              struct vhost_vring_addr *addr)
{
    addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc;
    addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail;
    addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used;
}
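
/*
 * Illustrative sketch (hypothetical surrounding code): a vhost backend can
 * feed these shadow addresses to the device instead of the guest's vring,
 * which is what makes the shadowing transparent to the device:
 *
 *     struct vhost_vring_addr addr = { .index = vq_index };
 *     vhost_svq_get_vring_addr(svq, &addr);
 *     r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
 */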

size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
{
    size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
    size_t avail_size = offsetof(vring_avail_t, ring[svq->vring.num]) +
                                                             sizeof(uint16_t);

    return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size());
}

size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
{
    size_t used_size = offsetof(vring_used_t, ring[svq->vring.num]) +
                                                             sizeof(uint16_t);
    return ROUND_UP(used_size, qemu_real_host_page_size());
}

/**
 * Set a new file descriptor for the guest to kick the SVQ and notify for avail
 *
 * @svq: The svq
 * @svq_kick_fd: The svq kick fd
 *
 * Note that the SVQ will never close the old file descriptor.
 */
void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
{
    EventNotifier *svq_kick = &svq->svq_kick;
    bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick);
    bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND;

    if (poll_stop) {
        event_notifier_set_handler(svq_kick, NULL);
    }

    event_notifier_init_fd(svq_kick, svq_kick_fd);
    /*
     * event_notifier_set_handler already checks for guest's notifications if
     * they arrive at the new file descriptor in the switch, so there is no
     * need to explicitly check for them.
     */
    if (poll_start) {
        event_notifier_set(svq_kick);
        event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier);
    }
}

/**
 * Start the shadow virtqueue operation.
 *
 * @svq: Shadow Virtqueue
 * @vdev: VirtIO device
 * @vq: Virtqueue to shadow
 * @iova_tree: Tree to perform descriptors translations
 */
void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
                     VirtQueue *vq, VhostIOVATree *iova_tree)
{
    size_t desc_size;

    event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
    svq->next_guest_avail_elem = NULL;
    svq->shadow_avail_idx = 0;
    svq->shadow_used_idx = 0;
    svq->last_used_idx = 0;
    svq->vdev = vdev;
    svq->vq = vq;
    svq->iova_tree = iova_tree;

    svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
    svq->num_free = svq->vring.num;
    svq->vring.desc = mmap(NULL, vhost_svq_driver_area_size(svq),
                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
                           -1, 0);
    desc_size = sizeof(vring_desc_t) * svq->vring.num;
    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
    svq->vring.used = mmap(NULL, vhost_svq_device_area_size(svq),
                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
                           -1, 0);
    svq->desc_state = g_new0(SVQDescState, svq->vring.num);
    svq->desc_next = g_new0(uint16_t, svq->vring.num);
    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
        svq->desc_next[i] = cpu_to_le16(i + 1);
    }
}
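
/*
 * Illustrative lifecycle sketch (hypothetical caller code), using only the
 * API defined in this file:
 *
 *     VhostShadowVirtqueue *svq = vhost_svq_new(ops, ops_opaque);
 *     vhost_svq_start(svq, vdev, vq, iova_tree);
 *     ...
 *     vhost_svq_stop(svq);
 *     vhost_svq_free(svq);
 */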

/**
 * Stop the shadow virtqueue operation.
 * @svq: Shadow Virtqueue
 */
void vhost_svq_stop(VhostShadowVirtqueue *svq)
{
    vhost_svq_set_svq_kick_fd(svq, VHOST_FILE_UNBIND);
    g_autofree VirtQueueElement *next_avail_elem = NULL;

    if (!svq->vq) {
        return;
    }

    /* Send all pending used descriptors to guest */
    vhost_svq_flush(svq, false);

    for (unsigned i = 0; i < svq->vring.num; ++i) {
        g_autofree VirtQueueElement *elem = NULL;
        elem = g_steal_pointer(&svq->desc_state[i].elem);
        if (elem) {
            /*
             * TODO: This is ok for networking, but other kinds of devices
             * might have problems with just unpop these.
             */
            virtqueue_unpop(svq->vq, elem, 0);
        }
    }

    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
    if (next_avail_elem) {
        virtqueue_unpop(svq->vq, next_avail_elem, 0);
    }
    svq->vq = NULL;
    g_free(svq->desc_next);
    g_free(svq->desc_state);
    munmap(svq->vring.desc, vhost_svq_driver_area_size(svq));
    munmap(svq->vring.used, vhost_svq_device_area_size(svq));
    event_notifier_set_handler(&svq->hdev_call, NULL);
}

/**
 * Creates vhost shadow virtqueue, and instructs the vhost device to use the
 * shadow methods and file descriptors.
 *
 * @ops: SVQ owner callbacks
 * @ops_opaque: ops opaque pointer
 */
VhostShadowVirtqueue *vhost_svq_new(const VhostShadowVirtqueueOps *ops,
                                    void *ops_opaque)
{
    VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);

    event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
    svq->ops = ops;
    svq->ops_opaque = ops_opaque;

    return svq;
}

/**
 * Free the resources of the shadow virtqueue.
 *
 * @pvq: gpointer to SVQ so it can be used by autofree functions.
 */
void vhost_svq_free(gpointer pvq)
{
    VhostShadowVirtqueue *vq = pvq;
    vhost_svq_stop(vq);
    g_free(vq);
}
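
/*
 * Taking a gpointer lets this function double as a GLib destroy notify.
 * Illustrative sketch (hypothetical code):
 *
 *     GPtrArray *shadow_vqs = g_ptr_array_new_full(nvqs, vhost_svq_free);
 *     g_ptr_array_add(shadow_vqs, vhost_svq_new(ops, ops_opaque));
 *     ...
 *     g_ptr_array_free(shadow_vqs, true);  // frees each SVQ via vhost_svq_free
 */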