/*
 * vhost shadow virtqueue
 *
 * SPDX-FileCopyrightText: Red Hat, Inc. 2021
 * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "qemu/osdep.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"

#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qemu/main-loop.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "linux-headers/linux/vhost.h"

/**
 * Validate the transport device features that both the guest can use with the
 * SVQ and the SVQ can use with the device.
 *
 * @features: The features
 * @errp: Error pointer
 */
bool vhost_svq_valid_features(uint64_t features, Error **errp)
{
    bool ok = true;
    uint64_t svq_features = features;

    for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END;
         ++b) {
        switch (b) {
        case VIRTIO_F_ANY_LAYOUT:
            continue;

        case VIRTIO_F_ACCESS_PLATFORM:
            /* SVQ trusts the host's IOMMU to translate addresses */
        case VIRTIO_F_VERSION_1:
            /* SVQ trusts that the guest's vring is little endian */
            if (!(svq_features & BIT_ULL(b))) {
                svq_features |= BIT_ULL(b);
                ok = false;
            }
            continue;

        default:
            if (svq_features & BIT_ULL(b)) {
                svq_features &= ~BIT_ULL(b);
                ok = false;
            }
        }
    }

    if (!ok) {
        error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64
                         ", ok: 0x%"PRIx64, features, svq_features);
    }
    return ok;
}

/**
 * Number of descriptors that the SVQ can make available from the guest.
 *
 * @svq: The svq
 */
static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
{
    return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
}

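/*
 * shadow_avail_idx and shadow_used_idx above are free-running uint16_t
 * counters, so their difference is the number of descriptors in flight
 * even across wrap-around: e.g. with vring.num = 256,
 * shadow_avail_idx = 300 and shadow_used_idx = 60 give 240 descriptors
 * in flight and 16 free slots.
 */
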
/**
 * Translate addresses between the qemu's virtual address and the SVQ IOVA
 *
 * @svq: Shadow VirtQueue
 * @addrs: Translated IOVA addresses
 * @iovec: Source qemu's VA addresses
 * @num: Length of iovec and minimum length of addrs
 */
static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
                                     hwaddr *addrs, const struct iovec *iovec,
                                     size_t num)
{
    for (size_t i = 0; i < num; ++i) {
        DMAMap needle = {
            .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
            .size = iovec[i].iov_len,
        };
        Int128 needle_last, map_last;
        size_t off;

        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
        /*
         * Map cannot be NULL since iova map contains all guest space and
         * qemu already has a physical address mapped
         */
        if (unlikely(!map)) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "Invalid address 0x%"HWADDR_PRIx" given by guest",
                          needle.translated_addr);
            return false;
        }

        off = needle.translated_addr - map->translated_addr;
        addrs[i] = map->iova + off;

        needle_last = int128_add(int128_make64(needle.translated_addr),
                                 int128_make64(iovec[i].iov_len));
        map_last = int128_make64(map->translated_addr + map->size);
        if (unlikely(int128_gt(needle_last, map_last))) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "Guest buffer expands over iova range");
            return false;
        }
    }

    return true;
}

/**
 * Write descriptors to SVQ vring
 *
 * @svq: The shadow virtqueue
 * @sg: Cache for hwaddr
 * @iovec: The iovec from the guest
 * @num: iovec length
 * @more_descs: True if more descriptors come in the chain
 * @write: True if they are writeable descriptors
 *
 * Return true on success; on failure, print an error and return false.
 */
static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
                                        const struct iovec *iovec, size_t num,
                                        bool more_descs, bool write)
{
    uint16_t i = svq->free_head, last = svq->free_head;
    unsigned n;
    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
    vring_desc_t *descs = svq->vring.desc;
    bool ok;

    if (num == 0) {
        return true;
    }

    ok = vhost_svq_translate_addr(svq, sg, iovec, num);
    if (unlikely(!ok)) {
        return false;
    }

    for (n = 0; n < num; n++) {
        if (more_descs || (n + 1 < num)) {
            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
            descs[i].next = cpu_to_le16(svq->desc_next[i]);
        } else {
            descs[i].flags = flags;
        }
        descs[i].addr = cpu_to_le64(sg[n]);
        descs[i].len = cpu_to_le32(iovec[n].iov_len);

        last = i;
        i = cpu_to_le16(svq->desc_next[i]);
    }

    svq->free_head = le16_to_cpu(svq->desc_next[last]);
    return true;
}

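/*
 * The descriptors written above are taken from the free list threaded
 * through desc_next: the walk starts at free_head, follows desc_next once
 * per sg entry, and finally advances free_head past the last descriptor
 * written, so the whole chain stays reserved until the device returns it
 * through the used ring.
 */
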
static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
                                const struct iovec *out_sg, size_t out_num,
                                const struct iovec *in_sg, size_t in_num,
                                unsigned *head)
{
    unsigned avail_idx;
    vring_avail_t *avail = svq->vring.avail;
    bool ok;
    g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num));

    *head = svq->free_head;

    /* We need some descriptors here */
    if (unlikely(!out_num && !in_num)) {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "Guest provided element with no descriptors");
        return false;
    }

    ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, in_num > 0,
                                     false);
    if (unlikely(!ok)) {
        return false;
    }

    ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, false, true);
    if (unlikely(!ok)) {
        return false;
    }

    /*
     * Put the entry in the available array (but don't update avail->idx until
     * they do sync).
     */
    avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1);
    avail->ring[avail_idx] = cpu_to_le16(*head);
    svq->shadow_avail_idx++;

    /* Update the avail index after writing the descriptor */
    smp_wmb();
    avail->idx = cpu_to_le16(svq->shadow_avail_idx);

    return true;
}

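/*
 * The ordering above follows the split virtqueue contract: the descriptor
 * table and the avail ring entry are written first, smp_wmb() publishes
 * them, and only then is avail->idx updated, so a device that observes
 * the new index is guaranteed to read a complete entry.
 */
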
static void vhost_svq_kick(VhostShadowVirtqueue *svq)
{
    /*
     * We need to expose the available array entries before checking the used
     * flags
     */
    smp_mb();
    if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
        return;
    }

    event_notifier_set(&svq->hdev_kick);
}

/**
 * Add an element to a SVQ.
 *
 * The caller must check that there are enough slots for the new element. It
 * takes ownership of the element: on any failure other than -ENOSPC, the
 * element is freed.
 *
 * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full
 */
int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
                  size_t out_num, const struct iovec *in_sg, size_t in_num,
                  VirtQueueElement *elem)
{
    unsigned qemu_head;
    unsigned ndescs = in_num + out_num;
    bool ok;

    if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
        return -ENOSPC;
    }

    ok = vhost_svq_add_split(svq, out_sg, out_num, in_sg, in_num, &qemu_head);
    if (unlikely(!ok)) {
        g_free(elem);
        return -EINVAL;
    }

    svq->desc_state[qemu_head].elem = elem;
    svq->desc_state[qemu_head].ndescs = ndescs;
    vhost_svq_kick(svq);
    return 0;
}

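/*
 * A minimal caller sketch (hypothetical buffers, error handling elided),
 * e.g. from an owner that builds its own control-command iovecs:
 *
 *     struct iovec out = { .iov_base = cmd, .iov_len = cmd_len };
 *     struct iovec in = { .iov_base = &status, .iov_len = sizeof(status) };
 *     int r = vhost_svq_add(svq, &out, 1, &in, 1, elem);
 *
 * On -ENOSPC the element is still owned by the caller; on any other
 * failure vhost_svq_add() has already freed it.
 */
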
/* Convenience wrapper to add a guest's element to SVQ */
static int vhost_svq_add_element(VhostShadowVirtqueue *svq,
                                 VirtQueueElement *elem)
{
    return vhost_svq_add(svq, elem->out_sg, elem->out_num, elem->in_sg,
                         elem->in_num, elem);
}

/**
 * Forward available buffers.
 *
 * @svq: Shadow VirtQueue
 *
 * Note that this function does not guarantee that all of the guest's
 * available buffers are available to the device in the SVQ avail ring. The
 * guest may have exposed a GPA / GIOVA contiguous buffer, but it may not be
 * contiguous in qemu's VA.
 *
 * If that happens, guest's kick notifications will be disabled until the
 * device uses some buffers.
 */
static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
{
    /* Clear event notifier */
    event_notifier_test_and_clear(&svq->svq_kick);

    /* Forward to the device as many available buffers as possible */
    do {
        virtio_queue_set_notification(svq->vq, false);

        while (true) {
            VirtQueueElement *elem;
            int r;

            if (svq->next_guest_avail_elem) {
                elem = g_steal_pointer(&svq->next_guest_avail_elem);
            } else {
                elem = virtqueue_pop(svq->vq, sizeof(*elem));
            }

            if (!elem) {
                break;
            }

            if (svq->ops) {
                r = svq->ops->avail_handler(svq, elem, svq->ops_opaque);
            } else {
                r = vhost_svq_add_element(svq, elem);
            }
            if (unlikely(r != 0)) {
                if (r == -ENOSPC) {
                    /*
                     * This condition is possible since a contiguous buffer in
                     * GPA does not imply a contiguous buffer in qemu's VA
                     * scatter-gather segments. If that happens, the buffer
                     * exposed to the device needs to be a chain of descriptors
                     * at this moment.
                     *
                     * SVQ cannot hold more available buffers if we are here:
                     * queue the current guest descriptor and ignore kicks
                     * until some elements are used.
                     */
                    svq->next_guest_avail_elem = elem;
                }

                /* VQ is full or broken, just return and ignore kicks */
                return;
            }
        }

        virtio_queue_set_notification(svq->vq, true);
    } while (!virtio_queue_empty(svq->vq));
}

/**
 * Handle guest's kick.
 *
 * @n: guest kick event notifier, the one that the guest set to notify the
 *     svq.
 */
static void vhost_handle_guest_kick_notifier(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
                                             svq_kick);
    event_notifier_test_and_clear(n);
    vhost_handle_guest_kick(svq);
}

static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
{
    uint16_t *used_idx = &svq->vring.used->idx;
    if (svq->last_used_idx != svq->shadow_used_idx) {
        return true;
    }

    svq->shadow_used_idx = cpu_to_le16(*(volatile uint16_t *)used_idx);

    return svq->last_used_idx != svq->shadow_used_idx;
}

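/*
 * The volatile read in vhost_svq_more_used() forces a fresh load of the
 * device's used->idx on every call; the value is cached in
 * shadow_used_idx so the shared ring is not re-read while previously
 * fetched used entries are still pending.
 */
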
/**
 * Enable vhost device calls after disabling them.
 *
 * @svq: The svq
 *
 * It returns false if there are pending used buffers from the vhost device,
 * avoiding the possible races between SVQ checking for more work and enabling
 * callbacks. True if the SVQ used vring has no more pending buffers.
 */
static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq)
{
    svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
    /* Make sure the flag is written before the read of used_idx */
    smp_mb();
    return !vhost_svq_more_used(svq);
}

static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
{
    svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
}

static uint16_t vhost_svq_last_desc_of_chain(const VhostShadowVirtqueue *svq,
                                             uint16_t num, uint16_t i)
{
    for (uint16_t j = 0; j < (num - 1); ++j) {
        i = le16_to_cpu(svq->desc_next[i]);
    }

    return i;
}

static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
                                           uint32_t *len)
{
    const vring_used_t *used = svq->vring.used;
    vring_used_elem_t used_elem;
    uint16_t last_used, last_used_chain, num;

    if (!vhost_svq_more_used(svq)) {
        return NULL;
    }

    /* Only get used array entries after they have been exposed by dev */
    smp_rmb();
    last_used = svq->last_used_idx & (svq->vring.num - 1);
    used_elem.id = le32_to_cpu(used->ring[last_used].id);
    used_elem.len = le32_to_cpu(used->ring[last_used].len);

    svq->last_used_idx++;
    if (unlikely(used_elem.id >= svq->vring.num)) {
        qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used",
                      svq->vdev->name, used_elem.id);
        return NULL;
    }

    if (unlikely(!svq->desc_state[used_elem.id].elem)) {
        qemu_log_mask(LOG_GUEST_ERROR,
            "Device %s says index %u is used, but it was not available",
            svq->vdev->name, used_elem.id);
        return NULL;
    }

    num = svq->desc_state[used_elem.id].ndescs;
    last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
    svq->desc_next[last_used_chain] = svq->free_head;
    svq->free_head = used_elem.id;

    *len = used_elem.len;
    return g_steal_pointer(&svq->desc_state[used_elem.id].elem);
}

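/*
 * Note that vhost_svq_get_buf() also recycles the chain: the tail
 * descriptor (located by vhost_svq_last_desc_of_chain()) is linked to the
 * old free_head and the chain's head id becomes the new free_head, so all
 * ndescs descriptors become available again for vhost_svq_add().
 */
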
/**
 * Push an element to SVQ, returning it to the guest.
 */
void vhost_svq_push_elem(VhostShadowVirtqueue *svq,
                         const VirtQueueElement *elem, uint32_t len)
{
    virtqueue_push(svq->vq, elem, len);
    if (svq->next_guest_avail_elem) {
        /*
         * Avail ring was full when vhost_svq_flush was called, so it's a
         * good moment to make more descriptors available if possible.
         */
        vhost_handle_guest_kick(svq);
    }
}

static void vhost_svq_flush(VhostShadowVirtqueue *svq,
                            bool check_for_avail_queue)
{
    VirtQueue *vq = svq->vq;

    /* Forward as many used buffers as possible. */
    do {
        unsigned i = 0;

        vhost_svq_disable_notification(svq);
        while (true) {
            uint32_t len;
            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
            if (!elem) {
                break;
            }

            if (unlikely(i >= svq->vring.num)) {
                qemu_log_mask(LOG_GUEST_ERROR,
                         "More than %u used buffers obtained in a %u size SVQ",
                         i, svq->vring.num);
                virtqueue_fill(vq, elem, len, i);
                virtqueue_flush(vq, i);
                return;
            }
            virtqueue_fill(vq, elem, len, i++);
        }

        virtqueue_flush(vq, i);
        event_notifier_set(&svq->svq_call);

        if (check_for_avail_queue && svq->next_guest_avail_elem) {
            /*
             * Avail ring was full when vhost_svq_flush was called, so it's a
             * good moment to make more descriptors available if possible.
             */
            vhost_handle_guest_kick(svq);
        }
    } while (!vhost_svq_enable_notification(svq));
}

/**
 * Poll the SVQ for one device used buffer.
 *
 * This function races with the main event loop's SVQ polling, so extra
 * synchronization is needed.
 *
 * Return the length written by the device.
 */
size_t vhost_svq_poll(VhostShadowVirtqueue *svq)
{
    int64_t start_us = g_get_monotonic_time();
    do {
        uint32_t len;
        VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
        if (elem) {
            return len;
        }

        if (unlikely(g_get_monotonic_time() - start_us > 10e6)) {
            return 0;
        }

        /* Make sure we read new used_idx */
        smp_rmb();
    } while (true);
}

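/*
 * g_get_monotonic_time() returns microseconds, so the 10e6 bound above
 * makes vhost_svq_poll() busy-wait for at most ten seconds before giving
 * up and returning 0.
 */
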
/**
 * Forward used buffers.
 *
 * @n: hdev call event notifier, the one that the device set to notify the
 *     svq.
 *
 * Note that we are not making any buffers available in the loop, so there is
 * no way it runs more than virtqueue-size times.
 */
static void vhost_svq_handle_call(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
                                             hdev_call);
    event_notifier_test_and_clear(n);
    vhost_svq_flush(svq, true);
}

/**
 * Set the call notifier for the SVQ to call the guest
 *
 * @svq: Shadow virtqueue
 * @call_fd: call notifier
 *
 * Called on BQL context.
 */
void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
{
    if (call_fd == VHOST_FILE_UNBIND) {
        /*
         * Fail event_notifier_set if called handling device call.
         *
         * SVQ still needs device notifications, since it needs to keep
         * forwarding used buffers even with the unbind.
         */
        memset(&svq->svq_call, 0, sizeof(svq->svq_call));
    } else {
        event_notifier_init_fd(&svq->svq_call, call_fd);
    }
}

/**
 * Get the shadow vq vring address.
 * @svq: Shadow virtqueue
 * @addr: Destination to store address
 */
void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
                              struct vhost_vring_addr *addr)
{
    addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc;
    addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail;
    addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used;
}

size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
{
    size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
    size_t avail_size = offsetof(vring_avail_t, ring) +
                                             sizeof(uint16_t) * svq->vring.num;

    return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size());
}

size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
{
    size_t used_size = offsetof(vring_used_t, ring) +
                                    sizeof(vring_used_elem_t) * svq->vring.num;
    return ROUND_UP(used_size, qemu_real_host_page_size());
}

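/*
 * For example, with vring.num = 256 and 4 KiB host pages, the driver area
 * takes 16 * 256 = 4096 descriptor bytes plus 4 + 2 * 256 = 516 avail
 * ring bytes, rounded up to 8192; the device area takes 4 + 8 * 256 =
 * 2052 used ring bytes, rounded up to 4096.
 */
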
/**
 * Set a new file descriptor for the guest to kick the SVQ and notify for avail
 *
 * @svq: The svq
 * @svq_kick_fd: The svq kick fd
 *
 * Note that the SVQ will never close the old file descriptor.
 */
void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
{
    EventNotifier *svq_kick = &svq->svq_kick;
    bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick);
    bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND;

    if (poll_stop) {
        event_notifier_set_handler(svq_kick, NULL);
    }

    /*
     * event_notifier_set_handler already checks for guest's notifications if
     * they arrive at the new file descriptor in the switch, so there is no
     * need to explicitly check for them.
     */
    if (poll_start) {
        event_notifier_init_fd(svq_kick, svq_kick_fd);
        event_notifier_set(svq_kick);
        event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier);
    }
}

/**
 * Start the shadow virtqueue operation.
 *
 * @svq: Shadow Virtqueue
 * @vdev: VirtIO device
 * @vq: Virtqueue to shadow
 */
void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
                     VirtQueue *vq)
{
    size_t desc_size, driver_size, device_size;

    svq->next_guest_avail_elem = NULL;
    svq->shadow_avail_idx = 0;
    svq->shadow_used_idx = 0;
    svq->last_used_idx = 0;
    svq->vdev = vdev;
    svq->vq = vq;

    svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
    driver_size = vhost_svq_driver_area_size(svq);
    device_size = vhost_svq_device_area_size(svq);
    svq->vring.desc = qemu_memalign(qemu_real_host_page_size(), driver_size);
    desc_size = sizeof(vring_desc_t) * svq->vring.num;
    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
    memset(svq->vring.desc, 0, driver_size);
    svq->vring.used = qemu_memalign(qemu_real_host_page_size(), device_size);
    memset(svq->vring.used, 0, device_size);
    svq->desc_state = g_new0(SVQDescState, svq->vring.num);
    svq->desc_next = g_new0(uint16_t, svq->vring.num);
    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
        svq->desc_next[i] = cpu_to_le16(i + 1);
    }
}

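/*
 * After vhost_svq_start() the descriptors form a single free chain:
 * desc_next[i] = i + 1 for every entry but the last, whose desc_next
 * stays 0 from g_new0(). free_head is also 0 for a freshly created SVQ
 * (the VhostShadowVirtqueue is allocated zeroed), so the first element
 * added consumes descriptors starting at index 0.
 */
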
/**
 * Stop the shadow virtqueue operation.
 * @svq: Shadow Virtqueue
 */
void vhost_svq_stop(VhostShadowVirtqueue *svq)
{
    event_notifier_set_handler(&svq->svq_kick, NULL);
    g_autofree VirtQueueElement *next_avail_elem = NULL;

    if (!svq->vq) {
        return;
    }

    /* Send all pending used descriptors to guest */
    vhost_svq_flush(svq, false);

    for (unsigned i = 0; i < svq->vring.num; ++i) {
        g_autofree VirtQueueElement *elem = NULL;
        elem = g_steal_pointer(&svq->desc_state[i].elem);
        if (elem) {
            virtqueue_detach_element(svq->vq, elem, 0);
        }
    }

    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
    if (next_avail_elem) {
        virtqueue_detach_element(svq->vq, next_avail_elem, 0);
    }
    svq->vq = NULL;
    g_free(svq->desc_next);
    g_free(svq->desc_state);
    qemu_vfree(svq->vring.desc);
    qemu_vfree(svq->vring.used);
}

/**
 * Creates the vhost shadow virtqueue, and instructs the vhost device to use
 * the shadow methods and file descriptors.
 *
 * @iova_tree: Tree to perform descriptor translations
 * @ops: SVQ owner callbacks
 * @ops_opaque: ops opaque pointer
 *
 * Returns the new virtqueue or NULL.
 *
 * In case of error, the reason is reported through error_report.
 */
VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
                                    const VhostShadowVirtqueueOps *ops,
                                    void *ops_opaque)
{
    g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
    int r;

    r = event_notifier_init(&svq->hdev_kick, 0);
    if (r != 0) {
        error_report("Couldn't create kick event notifier: %s (%d)",
                     g_strerror(errno), errno);
        goto err_init_hdev_kick;
    }

    r = event_notifier_init(&svq->hdev_call, 0);
    if (r != 0) {
        error_report("Couldn't create call event notifier: %s (%d)",
                     g_strerror(errno), errno);
        goto err_init_hdev_call;
    }

    event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
    event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
    svq->iova_tree = iova_tree;
    svq->ops = ops;
    svq->ops_opaque = ops_opaque;
    return g_steal_pointer(&svq);

err_init_hdev_call:
    event_notifier_cleanup(&svq->hdev_kick);

err_init_hdev_kick:
    return NULL;
}

/**
 * Free the resources of the shadow virtqueue.
 *
 * @pvq: gpointer to SVQ so it can be used by autofree functions.
 */
void vhost_svq_free(gpointer pvq)
{
    VhostShadowVirtqueue *vq = pvq;
    vhost_svq_stop(vq);
    event_notifier_cleanup(&vq->hdev_kick);
    event_notifier_set_handler(&vq->hdev_call, NULL);
    event_notifier_cleanup(&vq->hdev_call);
    g_free(vq);
}