/*
 * vhost shadow virtqueue
 *
 * SPDX-FileCopyrightText: Red Hat, Inc. 2021
 * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "qemu/osdep.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"

#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qemu/main-loop.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "linux-headers/linux/vhost.h"

/**
 * Validate the transport device features that both the guest can use with the
 * SVQ and the SVQ can use with the device.
 *
 * @features: The device features
 * @errp: Error pointer
 */
bool vhost_svq_valid_features(uint64_t features, Error **errp)
{
    bool ok = true;
    uint64_t svq_features = features;

    for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END;
         ++b) {
        switch (b) {
        case VIRTIO_F_ANY_LAYOUT:
        case VIRTIO_RING_F_EVENT_IDX:
            continue;

        case VIRTIO_F_ACCESS_PLATFORM:
            /* SVQ trusts the host's IOMMU to translate addresses */
        case VIRTIO_F_VERSION_1:
            /* SVQ trusts that the guest vring is little endian */
            if (!(svq_features & BIT_ULL(b))) {
                svq_features |= BIT_ULL(b);
                ok = false;
            }
            continue;

        default:
            if (svq_features & BIT_ULL(b)) {
                svq_features &= ~BIT_ULL(b);
                ok = false;
            }
        }
    }

    if (!ok) {
        error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64
                         ", ok: 0x%"PRIx64, features, svq_features);
    }
    return ok;
}

/**
 * Number of descriptors that the SVQ can make available from the guest.
 */
static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
{
    return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
}
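
/*
 * Note that shadow_avail_idx and shadow_used_idx are free-running 16-bit
 * counters, so the subtraction above stays correct across wrap-around: e.g.
 * with a 256-entry ring, shadow_avail_idx == 0x0005 and
 * shadow_used_idx == 0xfffd leave 8 elements in flight and 248 free slots.
 */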

/**
 * Translate addresses between QEMU's virtual address space and the SVQ IOVA
 * space.
 *
 * @svq: Shadow VirtQueue
 * @addrs: Destination array for the translated IOVA addresses
 * @iovec: Source QEMU VA addresses
 * @num: Length of @iovec and minimum length of @addrs
 */
static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
                                     hwaddr *addrs, const struct iovec *iovec,
                                     size_t num)
{
    if (num == 0) {
        return true;
    }

    for (size_t i = 0; i < num; ++i) {
        DMAMap needle = {
            .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
            .size = iovec[i].iov_len,
        };
        Int128 needle_last, map_last;
        size_t off;

        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
        /*
         * Map cannot be NULL since the iova map contains all guest space and
         * qemu already has a physical address mapped
         */
        if (unlikely(!map)) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "Invalid address 0x%"HWADDR_PRIx" given by guest",
                          needle.translated_addr);
            return false;
        }

        off = needle.translated_addr - map->translated_addr;
        addrs[i] = map->iova + off;

        needle_last = int128_add(int128_make64(needle.translated_addr),
                                 int128_make64(iovec[i].iov_len));
        map_last = int128_make64(map->translated_addr + map->size);
        if (unlikely(int128_gt(needle_last, map_last))) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "Guest buffer expands over iova range");
            return false;
        }
    }

    return true;
}

/**
 * Write descriptors to the SVQ vring
 *
 * @svq: The shadow virtqueue
 * @sg: Scratch cache for the translated hwaddr values
 * @iovec: The iovec from the guest
 * @num: Length of @iovec
 * @more_descs: True if more descriptors come in the chain
 * @write: True if they are device-writable descriptors
 *
 * Return true on success, false otherwise (an error is logged).
 */
static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
                                        const struct iovec *iovec, size_t num,
                                        bool more_descs, bool write)
{
    uint16_t i = svq->free_head, last = svq->free_head;
    unsigned n;
    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
    vring_desc_t *descs = svq->vring.desc;
    bool ok;

    if (num == 0) {
        return true;
    }

    ok = vhost_svq_translate_addr(svq, sg, iovec, num);
    if (unlikely(!ok)) {
        return false;
    }

    for (n = 0; n < num; n++) {
        if (more_descs || (n + 1 < num)) {
            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
            descs[i].next = cpu_to_le16(svq->desc_next[i]);
        } else {
            descs[i].flags = flags;
        }
        descs[i].addr = cpu_to_le64(sg[n]);
        descs[i].len = cpu_to_le32(iovec[n].iov_len);

        last = i;
        i = cpu_to_le16(svq->desc_next[i]);
    }

    svq->free_head = le16_to_cpu(svq->desc_next[last]);
    return true;
}
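
/*
 * svq->desc_next[] above doubles as a free list: svq->free_head names the
 * first unused descriptor and every entry links to the next free one, so
 * writing a chain is just a walk along those links, after which free_head
 * is moved past the last descriptor consumed.
 */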

static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
                                const struct iovec *out_sg, size_t out_num,
                                const struct iovec *in_sg, size_t in_num,
                                unsigned *head)
{
    unsigned avail_idx;
    vring_avail_t *avail = svq->vring.avail;
    bool ok;
    g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num));

    *head = svq->free_head;

    /* We need some descriptors here */
    if (unlikely(!out_num && !in_num)) {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "Guest provided element with no descriptors");
        return false;
    }

    ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, in_num > 0,
                                     false);
    if (unlikely(!ok)) {
        return false;
    }

    ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, false, true);
    if (unlikely(!ok)) {
        return false;
    }

    /*
     * Put the entry in the available array (but don't update avail->idx until
     * it is synced below).
     */
    avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1);
    avail->ring[avail_idx] = cpu_to_le16(*head);
    svq->shadow_avail_idx++;

    /* Update the avail index after writing the descriptor */
    smp_wmb();
    avail->idx = cpu_to_le16(svq->shadow_avail_idx);

    return true;
}
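
/*
 * The "& (svq->vring.num - 1)" masking above relies on the ring size being a
 * power of two, which the virtio split ring layout requires, so the
 * free-running shadow_avail_idx can be used directly as a ring offset.
 */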

static void vhost_svq_kick(VhostShadowVirtqueue *svq)
{
    bool needs_kick;

    /*
     * We need to expose the available array entries before checking the used
     * flags
     */
    smp_mb();

    if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
        uint16_t avail_event = *(uint16_t *)(&svq->vring.used->ring[svq->vring.num]);
        needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx,
                                      svq->shadow_avail_idx - 1);
    } else {
        needs_kick = !(svq->vring.used->flags & VRING_USED_F_NO_NOTIFY);
    }

    if (!needs_kick) {
        return;
    }

    event_notifier_set(&svq->hdev_kick);
}
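
/*
 * In the EVENT_IDX branch above, avail_event is the entry index the device
 * asked to be notified at; since the call passes old == shadow_avail_idx - 1,
 * vring_need_event() only returns true when avail_event matches the entry
 * that was just made available, suppressing redundant kicks.
 */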

/**
 * Add an element to an SVQ.
 *
 * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full
 */
int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
                  size_t out_num, const struct iovec *in_sg, size_t in_num,
                  VirtQueueElement *elem)
{
    unsigned qemu_head;
    unsigned ndescs = in_num + out_num;
    bool ok;

    if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
        return -ENOSPC;
    }

    ok = vhost_svq_add_split(svq, out_sg, out_num, in_sg, in_num, &qemu_head);
    if (unlikely(!ok)) {
        return -EINVAL;
    }

    svq->desc_state[qemu_head].elem = elem;
    svq->desc_state[qemu_head].ndescs = ndescs;
    vhost_svq_kick(svq);
    return 0;
}
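
/*
 * On success the element's ownership moves to the SVQ: it is stored in
 * svq->desc_state[] until the device marks the chain as used and
 * vhost_svq_get_buf() hands it back.  A caller holding a VirtQueueElement
 * would typically invoke it as
 *
 *     int r = vhost_svq_add(svq, elem->out_sg, elem->out_num,
 *                           elem->in_sg, elem->in_num, elem);
 *
 * which is exactly what vhost_svq_add_element() below wraps; a -ENOSPC
 * return means the caller should retry once the device uses some buffers.
 */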

/* Convenience wrapper to add a guest's element to the SVQ */
static int vhost_svq_add_element(VhostShadowVirtqueue *svq,
                                 VirtQueueElement *elem)
{
    return vhost_svq_add(svq, elem->out_sg, elem->out_num, elem->in_sg,
                         elem->in_num, elem);
}

/**
 * Forward available buffers.
 *
 * @svq: Shadow VirtQueue
 *
 * Note that this function does not guarantee that all of the guest's
 * available buffers are available to the device in the SVQ avail ring. The
 * guest may have exposed a GPA / GIOVA contiguous buffer, but it may not be
 * contiguous in QEMU's VA.
 *
 * If that happens, the guest's kick notifications will be disabled until the
 * device uses some buffers.
 */
static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
{
    /* Clear event notifier */
    event_notifier_test_and_clear(&svq->svq_kick);

    /* Forward to the device as many available buffers as possible */
    do {
        virtio_queue_set_notification(svq->vq, false);

        while (true) {
            g_autofree VirtQueueElement *elem = NULL;
            int r;

            if (svq->next_guest_avail_elem) {
                elem = g_steal_pointer(&svq->next_guest_avail_elem);
            } else {
                elem = virtqueue_pop(svq->vq, sizeof(*elem));
            }

            if (!elem) {
                break;
            }

            if (svq->ops) {
                r = svq->ops->avail_handler(svq, elem, svq->ops_opaque);
            } else {
                r = vhost_svq_add_element(svq, elem);
            }
            if (unlikely(r != 0)) {
                if (r == -ENOSPC) {
                    /*
                     * This condition is possible since a contiguous buffer in
                     * GPA does not imply a contiguous buffer in qemu's VA
                     * scatter-gather segments. If that happens, the buffer
                     * exposed to the device needs to be a chain of descriptors
                     * at this moment.
                     *
                     * SVQ cannot hold more available buffers if we are here:
                     * queue the current guest descriptor and ignore kicks
                     * until some elements are used.
                     */
                    svq->next_guest_avail_elem = g_steal_pointer(&elem);
                }

                /* VQ is full or broken, just return and ignore kicks */
                return;
            }
            /* elem belongs to SVQ or external caller now */
            elem = NULL;
        }

        virtio_queue_set_notification(svq->vq, true);
    } while (!virtio_queue_empty(svq->vq));
}
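
/*
 * The outer do/while above re-checks the virtqueue after re-enabling guest
 * notifications, closing the window where the guest could add a buffer
 * between the last virtqueue_pop() and virtio_queue_set_notification(true).
 */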

/**
 * Handle guest's kick.
 *
 * @n: guest kick event notifier, the one that guest set to notify svq.
 */
static void vhost_handle_guest_kick_notifier(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
    event_notifier_test_and_clear(n);
    vhost_handle_guest_kick(svq);
}

static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
{
    uint16_t *used_idx = &svq->vring.used->idx;
    if (svq->last_used_idx != svq->shadow_used_idx) {
        return true;
    }

    svq->shadow_used_idx = cpu_to_le16(*(volatile uint16_t *)used_idx);

    return svq->last_used_idx != svq->shadow_used_idx;
}
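
/*
 * The volatile read above forces a fresh fetch of the device-written
 * used->idx; the value is cached in shadow_used_idx so later calls can
 * answer without touching the shared ring again.
 */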

/**
 * Enable vhost device calls after disabling them.
 *
 * @svq: The svq
 *
 * It returns false if there are pending used buffers from the vhost device,
 * avoiding the possible races between SVQ checking for more work and enabling
 * callbacks. True if the SVQ used vring has no more pending buffers.
 */
static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq)
{
    if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
        uint16_t *used_event = (uint16_t *)&svq->vring.avail->ring[svq->vring.num];
        *used_event = svq->shadow_used_idx;
    } else {
        svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
    }

    /* Make sure the event is enabled before the read of used_idx */
    smp_mb();
    return !vhost_svq_more_used(svq);
}

static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
{
    /*
     * No need to disable notification in the event idx case, since the used
     * event index is already an index too far away.
     */
    if (!virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
        svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
    }
}

static uint16_t vhost_svq_last_desc_of_chain(const VhostShadowVirtqueue *svq,
                                             uint16_t num, uint16_t i)
{
    for (uint16_t j = 0; j < (num - 1); ++j) {
        i = le16_to_cpu(svq->desc_next[i]);
    }

    return i;
}

static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
                                           uint32_t *len)
{
    const vring_used_t *used = svq->vring.used;
    vring_used_elem_t used_elem;
    uint16_t last_used, last_used_chain, num;

    if (!vhost_svq_more_used(svq)) {
        return NULL;
    }

    /* Only get used array entries after they have been exposed by dev */
    smp_rmb();
    last_used = svq->last_used_idx & (svq->vring.num - 1);
    used_elem.id = le32_to_cpu(used->ring[last_used].id);
    used_elem.len = le32_to_cpu(used->ring[last_used].len);

    svq->last_used_idx++;
    if (unlikely(used_elem.id >= svq->vring.num)) {
        qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used",
                      svq->vdev->name, used_elem.id);
        return NULL;
    }

    if (unlikely(!svq->desc_state[used_elem.id].ndescs)) {
        qemu_log_mask(LOG_GUEST_ERROR,
            "Device %s says index %u is used, but it was not available",
            svq->vdev->name, used_elem.id);
        return NULL;
    }

    num = svq->desc_state[used_elem.id].ndescs;
    svq->desc_state[used_elem.id].ndescs = 0;
    last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
    svq->desc_next[last_used_chain] = svq->free_head;
    svq->free_head = used_elem.id;

    *len = used_elem.len;
    return g_steal_pointer(&svq->desc_state[used_elem.id].elem);
}
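
/*
 * used_elem.id names the head of a chain of ndescs descriptors; the chain's
 * tail is relinked to the old free_head and the head becomes the new
 * free_head, returning the whole chain to the free list in one step.
 */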

/**
 * Push an element to SVQ, returning it to the guest.
 */
void vhost_svq_push_elem(VhostShadowVirtqueue *svq,
                         const VirtQueueElement *elem, uint32_t len)
{
    virtqueue_push(svq->vq, elem, len);
    if (svq->next_guest_avail_elem) {
        /*
         * Avail ring was full when vhost_svq_flush was called, so it's a
         * good moment to make more descriptors available if possible.
         */
        vhost_handle_guest_kick(svq);
    }
}

static void vhost_svq_flush(VhostShadowVirtqueue *svq,
                            bool check_for_avail_queue)
{
    VirtQueue *vq = svq->vq;

    /* Forward as many used buffers as possible. */
    do {
        unsigned i = 0;

        vhost_svq_disable_notification(svq);
        while (true) {
            uint32_t len;
            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
            if (!elem) {
                break;
            }

            if (unlikely(i >= svq->vring.num)) {
                qemu_log_mask(LOG_GUEST_ERROR,
                         "More than %u used buffers obtained in a %u size SVQ",
                         i, svq->vring.num);
                virtqueue_fill(vq, elem, len, i);
                virtqueue_flush(vq, i);
                return;
            }
            virtqueue_fill(vq, elem, len, i++);
        }

        virtqueue_flush(vq, i);
        event_notifier_set(&svq->svq_call);

        if (check_for_avail_queue && svq->next_guest_avail_elem) {
            /*
             * Avail ring was full when vhost_svq_flush was called, so it's a
             * good moment to make more descriptors available if possible.
             */
            vhost_handle_guest_kick(svq);
        }
    } while (!vhost_svq_enable_notification(svq));
}

/**
 * Poll the SVQ for one device used buffer.
 *
 * This function races with the main event loop SVQ polling, so extra
 * synchronization is needed.
 *
 * Return the length written by the device.
 */
size_t vhost_svq_poll(VhostShadowVirtqueue *svq)
{
    int64_t start_us = g_get_monotonic_time();
    uint32_t len = 0;

    do {
        if (vhost_svq_more_used(svq)) {
            break;
        }

        if (unlikely(g_get_monotonic_time() - start_us > 10e6)) {
            return 0;
        }
    } while (true);

    vhost_svq_get_buf(svq, &len);
    return len;
}
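
/*
 * g_get_monotonic_time() is in microseconds, so the 10e6 bound above gives
 * the device roughly ten seconds to use the buffer before the poll gives up
 * and returns 0.
 */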

/**
 * Forward used buffers.
 *
 * @n: hdev call event notifier, the one that device set to notify svq.
 *
 * Note that we are not making any buffers available in the loop, so there is
 * no way that it runs more than virtqueue size times.
 */
static void vhost_svq_handle_call(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
                                             hdev_call);
    event_notifier_test_and_clear(n);
    vhost_svq_flush(svq, true);
}

/**
 * Set the call notifier for the SVQ to call the guest
 *
 * @svq: Shadow virtqueue
 * @call_fd: call notifier
 *
 * Called on BQL context.
 */
void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
{
    if (call_fd == VHOST_FILE_UNBIND) {
        /*
         * Fail event_notifier_set if called handling device call.
         *
         * SVQ still needs device notifications, since it needs to keep
         * forwarding used buffers even with the unbind.
         */
        memset(&svq->svq_call, 0, sizeof(svq->svq_call));
    } else {
        event_notifier_init_fd(&svq->svq_call, call_fd);
    }
}

/**
 * Get the shadow vq vring address.
 * @svq: Shadow virtqueue
 * @addr: Destination to store address
 */
void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
                              struct vhost_vring_addr *addr)
{
    addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc;
    addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail;
    addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used;
}

size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
{
    size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
    size_t avail_size = offsetof(vring_avail_t, ring[svq->vring.num]) +
                                                              sizeof(uint16_t);

    return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size());
}

size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
{
    size_t used_size = offsetof(vring_used_t, ring[svq->vring.num]) +
                                                              sizeof(uint16_t);
    return ROUND_UP(used_size, qemu_real_host_page_size());
}
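
/*
 * The driver area (descriptor table + avail ring, including the trailing
 * used_event word) and the device area (used ring plus the trailing
 * avail_event word) are each rounded up to the host page size, so they can
 * be handed to the vhost device as independent, page-aligned regions.
 */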

/**
 * Set a new file descriptor for the guest to kick the SVQ and notify for avail
 *
 * @svq: The svq
 * @svq_kick_fd: The svq kick fd
 *
 * Note that the SVQ will never close the old file descriptor.
 */
void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
{
    EventNotifier *svq_kick = &svq->svq_kick;
    bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick);
    bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND;

    if (poll_stop) {
        event_notifier_set_handler(svq_kick, NULL);
    }

    event_notifier_init_fd(svq_kick, svq_kick_fd);
    /*
     * event_notifier_set_handler already checks for guest's notifications if
     * they arrive at the new file descriptor in the switch, so there is no
     * need to explicitly check for them.
     */
    if (poll_start) {
        event_notifier_set(svq_kick);
        event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier);
    }
}
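
/*
 * The explicit event_notifier_set() above makes the new handler run at least
 * once after the switch, so buffers the guest made available while the old
 * file descriptor was being torn down are not silently missed.
 */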

/**
 * Start the shadow virtqueue operation.
 *
 * @svq: Shadow Virtqueue
 * @vdev: VirtIO device
 * @vq: Virtqueue to shadow
 * @iova_tree: Tree to perform descriptor translations
 */
void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
                     VirtQueue *vq, VhostIOVATree *iova_tree)
{
    size_t desc_size, driver_size, device_size;

    event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
    svq->next_guest_avail_elem = NULL;
    svq->shadow_avail_idx = 0;
    svq->shadow_used_idx = 0;
    svq->last_used_idx = 0;
    svq->vdev = vdev;
    svq->vq = vq;
    svq->iova_tree = iova_tree;

    svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
    driver_size = vhost_svq_driver_area_size(svq);
    device_size = vhost_svq_device_area_size(svq);
    svq->vring.desc = qemu_memalign(qemu_real_host_page_size(), driver_size);
    desc_size = sizeof(vring_desc_t) * svq->vring.num;
    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
    memset(svq->vring.desc, 0, driver_size);
    svq->vring.used = qemu_memalign(qemu_real_host_page_size(), device_size);
    memset(svq->vring.used, 0, device_size);
    svq->desc_state = g_new0(SVQDescState, svq->vring.num);
    svq->desc_next = g_new0(uint16_t, svq->vring.num);
    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
        svq->desc_next[i] = cpu_to_le16(i + 1);
    }
}
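
/*
 * After this point svq->vring.desc and svq->vring.avail share a single
 * page-aligned driver allocation, svq->vring.used lives in its own device
 * allocation, and desc_next[] forms a chain 0 -> 1 -> ... -> num-1 (the last
 * entry is left 0 by g_new0, closing the loop) that serves as the initial
 * free list.
 */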

/**
 * Stop the shadow virtqueue operation.
 * @svq: Shadow Virtqueue
 */
void vhost_svq_stop(VhostShadowVirtqueue *svq)
{
    vhost_svq_set_svq_kick_fd(svq, VHOST_FILE_UNBIND);
    g_autofree VirtQueueElement *next_avail_elem = NULL;

    if (!svq->vq) {
        return;
    }

    /* Send all pending used descriptors to guest */
    vhost_svq_flush(svq, false);

    for (unsigned i = 0; i < svq->vring.num; ++i) {
        g_autofree VirtQueueElement *elem = NULL;
        elem = g_steal_pointer(&svq->desc_state[i].elem);
        if (elem) {
            virtqueue_detach_element(svq->vq, elem, 0);
        }
    }

    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
    if (next_avail_elem) {
        virtqueue_detach_element(svq->vq, next_avail_elem, 0);
    }
    svq->vq = NULL;
    g_free(svq->desc_next);
    g_free(svq->desc_state);
    qemu_vfree(svq->vring.desc);
    qemu_vfree(svq->vring.used);
    event_notifier_set_handler(&svq->hdev_call, NULL);
}

/**
 * Creates a vhost shadow virtqueue, and instructs the vhost device to use the
 * shadow methods and file descriptors.
 *
 * @ops: SVQ owner callbacks
 * @ops_opaque: ops opaque pointer
 */
VhostShadowVirtqueue *vhost_svq_new(const VhostShadowVirtqueueOps *ops,
                                    void *ops_opaque)
{
    VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);

    event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
    svq->ops = ops;
    svq->ops_opaque = ops_opaque;
    return svq;
}

/**
 * Free the resources of the shadow virtqueue.
 *
 * @pvq: gpointer to SVQ so it can be used by autofree functions.
 */
void vhost_svq_free(gpointer pvq)
{
    VhostShadowVirtqueue *vq = pvq;
    vhost_svq_stop(vq);
    g_free(vq);
}