/*
 * Copyright IBM, Corp. 2007
 * Copyright (c) 2016 Red Hat, Inc.
 *
 * Authors:
 *  Anthony Liguori <aliguori@us.ibm.com>
 *  Marc-André Lureau <mlureau@redhat.com>
 *  Victor Kaplansky <victork@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 */
/* this code avoids GLib dependency */
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <inttypes.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
/*
 * Necessary to provide VIRTIO_F_VERSION_1 on systems
 * with older Linux headers. Must appear before
 * <linux/vhost.h> below.
 */
#include "standard-headers/linux/virtio_config.h"
#if defined(__linux__)
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

#ifdef __NR_userfaultfd
#include <linux/userfaultfd.h>
#endif

#endif

#include "include/atomic.h"

#include "libvhost-user.h"
/* usually provided by GLib */
#if     __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4)
#if !defined(__clang__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 4)
#define G_GNUC_PRINTF(format_idx, arg_idx) \
    __attribute__((__format__(gnu_printf, format_idx, arg_idx)))
#else
#define G_GNUC_PRINTF(format_idx, arg_idx) \
    __attribute__((__format__(__printf__, format_idx, arg_idx)))
#endif
#else   /* !__GNUC__ */
#define G_GNUC_PRINTF(format_idx, arg_idx)
#endif  /* !__GNUC__ */
#define MIN(x, y) ({                            \
            __typeof__(x) _min1 = (x);          \
            __typeof__(y) _min2 = (y);          \
            (void) (&_min1 == &_min2);          \
            _min1 < _min2 ? _min1 : _min2; })

/* Round number down to multiple */
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))

/* Round number up to multiple */
#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
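
/*
 * Editor's note (worked example, not in the original source): with
 * m = 4096, ALIGN_DOWN(8300, 4096) == 8192 and ALIGN_UP(8300, 4096) == 12288;
 * a value that is already a multiple of m is returned unchanged by both
 * macros. Both assume m > 0.
 */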
#define unlikely(x)   __builtin_expect(!!(x), 0)
/* Align each region to cache line size in inflight buffer */
#define INFLIGHT_ALIGNMENT 64

/* The version of inflight buffer */
#define INFLIGHT_VERSION 1

/* The version of the protocol we support */
#define VHOST_USER_VERSION 1
#define LIBVHOST_USER_DEBUG 0

#define DPRINT(...)                             \
    do {                                        \
        if (LIBVHOST_USER_DEBUG) {              \
            fprintf(stderr, __VA_ARGS__);       \
        }                                       \
    } while (0)
static inline
bool has_feature(uint64_t features, unsigned int fbit)
{
    return !!(features & (1ULL << fbit));
}

static inline
bool vu_has_feature(VuDev *dev,
                    unsigned int fbit)
{
    return has_feature(dev->features, fbit);
}

static inline bool vu_has_protocol_feature(VuDev *dev, unsigned int fbit)
{
    return has_feature(dev->protocol_features, fbit);
}
static const char *
vu_request_to_string(unsigned int req)
{
#define REQ(req) [req] = #req
    static const char *vu_request_str[] = {
        REQ(VHOST_USER_NONE),
        REQ(VHOST_USER_GET_FEATURES),
        REQ(VHOST_USER_SET_FEATURES),
        REQ(VHOST_USER_SET_OWNER),
        REQ(VHOST_USER_RESET_OWNER),
        REQ(VHOST_USER_SET_MEM_TABLE),
        REQ(VHOST_USER_SET_LOG_BASE),
        REQ(VHOST_USER_SET_LOG_FD),
        REQ(VHOST_USER_SET_VRING_NUM),
        REQ(VHOST_USER_SET_VRING_ADDR),
        REQ(VHOST_USER_SET_VRING_BASE),
        REQ(VHOST_USER_GET_VRING_BASE),
        REQ(VHOST_USER_SET_VRING_KICK),
        REQ(VHOST_USER_SET_VRING_CALL),
        REQ(VHOST_USER_SET_VRING_ERR),
        REQ(VHOST_USER_GET_PROTOCOL_FEATURES),
        REQ(VHOST_USER_SET_PROTOCOL_FEATURES),
        REQ(VHOST_USER_GET_QUEUE_NUM),
        REQ(VHOST_USER_SET_VRING_ENABLE),
        REQ(VHOST_USER_SEND_RARP),
        REQ(VHOST_USER_NET_SET_MTU),
        REQ(VHOST_USER_SET_BACKEND_REQ_FD),
        REQ(VHOST_USER_IOTLB_MSG),
        REQ(VHOST_USER_SET_VRING_ENDIAN),
        REQ(VHOST_USER_GET_CONFIG),
        REQ(VHOST_USER_SET_CONFIG),
        REQ(VHOST_USER_POSTCOPY_ADVISE),
        REQ(VHOST_USER_POSTCOPY_LISTEN),
        REQ(VHOST_USER_POSTCOPY_END),
        REQ(VHOST_USER_GET_INFLIGHT_FD),
        REQ(VHOST_USER_SET_INFLIGHT_FD),
        REQ(VHOST_USER_GPU_SET_SOCKET),
        REQ(VHOST_USER_VRING_KICK),
        REQ(VHOST_USER_GET_MAX_MEM_SLOTS),
        REQ(VHOST_USER_ADD_MEM_REG),
        REQ(VHOST_USER_REM_MEM_REG),
        REQ(VHOST_USER_GET_SHARED_OBJECT),
        REQ(VHOST_USER_MAX),
    };
#undef REQ

    if (req < VHOST_USER_MAX) {
        return vu_request_str[req];
    } else {
        return "unknown";
    }
}
static void G_GNUC_PRINTF(2, 3)
vu_panic(VuDev *dev, const char *msg, ...)
{
    char *buf = NULL;
    va_list ap;

    va_start(ap, msg);
    if (vasprintf(&buf, msg, ap) < 0) {
        buf = NULL;
    }
    va_end(ap);

    dev->broken = true;
    dev->panic(dev, buf);
    free(buf);

    /*
     * FIXME:
     * find a way to call virtio_error, or perhaps close the connection?
     */
}
/* Translate guest physical address to our virtual address.  */
void *
vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr)
{
    unsigned int i;

    if (*plen == 0) {
        return NULL;
    }

    /* Find matching memory region.  */
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];

        if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
            if ((guest_addr + *plen) > (r->gpa + r->size)) {
                *plen = r->gpa + r->size - guest_addr;
            }
            return (void *)(uintptr_t)
                guest_addr - r->gpa + r->mmap_addr + r->mmap_offset;
        }
    }

    return NULL;
}
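
/*
 * Editor's sketch (illustration only, not part of the original file):
 * a typical caller asks for "len" bytes at guest physical address "gpa"
 * and must cope with *plen being shrunk when the range spills past the
 * end of the containing region:
 *
 *     uint64_t want = len;
 *     void *p = vu_gpa_to_va(dev, &want, gpa);
 *     if (!p || want < len) {
 *         // not mapped, or not contiguous in one region: bail out or split
 *     }
 */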
/* Translate qemu virtual address to our virtual address.  */
static void *
qva_to_va(VuDev *dev, uint64_t qemu_addr)
{
    unsigned int i;

    /* Find matching memory region.  */
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];

        if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
            return (void *)(uintptr_t)
                qemu_addr - r->qva + r->mmap_addr + r->mmap_offset;
        }
    }

    return NULL;
}
static void
vmsg_close_fds(VhostUserMsg *vmsg)
{
    int i;

    for (i = 0; i < vmsg->fd_num; i++) {
        close(vmsg->fds[i]);
    }
}
/* Set reply payload.u64 and clear request flags and fd_num */
static void vmsg_set_reply_u64(VhostUserMsg *vmsg, uint64_t val)
{
    vmsg->flags = 0; /* defaults will be set by vu_send_reply() */
    vmsg->size = sizeof(vmsg->payload.u64);
    vmsg->payload.u64 = val;
    vmsg->fd_num = 0;
}
/* A test to see if we have userfault available */
static bool
have_userfault(void)
{
#if defined(__linux__) && defined(__NR_userfaultfd) &&\
        defined(UFFD_FEATURE_MISSING_SHMEM) &&\
        defined(UFFD_FEATURE_MISSING_HUGETLBFS)
    /* Now test the kernel we're running on really has the features */
    int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    struct uffdio_api api_struct;
    if (ufd < 0) {
        return false;
    }

    api_struct.api = UFFD_API;
    api_struct.features = UFFD_FEATURE_MISSING_SHMEM |
                          UFFD_FEATURE_MISSING_HUGETLBFS;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        close(ufd);
        return false;
    }
    close(ufd);
    return true;
#else
    return false;
#endif
}
static bool
vu_message_read_default(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {};
    struct iovec iov = {
        .iov_base = (char *)vmsg,
        .iov_len = VHOST_USER_HDR_SIZE,
    };
    struct msghdr msg = {
        .msg_iov = &iov,
        .msg_iovlen = 1,
        .msg_control = control,
        .msg_controllen = sizeof(control),
    };
    size_t fd_size;
    struct cmsghdr *cmsg;
    int rc;

    do {
        rc = recvmsg(conn_fd, &msg, 0);
    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

    if (rc < 0) {
        vu_panic(dev, "Error while recvmsg: %s", strerror(errno));
        return false;
    }

    vmsg->fd_num = 0;
    for (cmsg = CMSG_FIRSTHDR(&msg);
         cmsg != NULL;
         cmsg = CMSG_NXTHDR(&msg, cmsg))
    {
        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
            vmsg->fd_num = fd_size / sizeof(int);
            assert(fd_size < VHOST_MEMORY_BASELINE_NREGIONS);
            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
        }
    }

    if (vmsg->size > sizeof(vmsg->payload)) {
        vu_panic(dev,
                 "Error: too big message request: %d, size: vmsg->size: %u, "
                 "while sizeof(vmsg->payload) = %zu\n",
                 vmsg->request, vmsg->size, sizeof(vmsg->payload));
        goto fail;
    }

    if (vmsg->size) {
        do {
            rc = read(conn_fd, &vmsg->payload, vmsg->size);
        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

        if (rc <= 0) {
            vu_panic(dev, "Error while reading: %s", strerror(errno));
            goto fail;
        }

        assert((uint32_t)rc == vmsg->size);
    }

    return true;

fail:
    vmsg_close_fds(vmsg);

    return false;
}
static bool
vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    int rc;
    uint8_t *p = (uint8_t *)vmsg;
    char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {};
    struct iovec iov = {
        .iov_base = (char *)vmsg,
        .iov_len = VHOST_USER_HDR_SIZE,
    };
    struct msghdr msg = {
        .msg_iov = &iov,
        .msg_iovlen = 1,
        .msg_control = control,
    };
    struct cmsghdr *cmsg;

    memset(control, 0, sizeof(control));
    assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS);
    if (vmsg->fd_num > 0) {
        size_t fdsize = vmsg->fd_num * sizeof(int);
        msg.msg_controllen = CMSG_SPACE(fdsize);
        cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_len = CMSG_LEN(fdsize);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize);
    } else {
        msg.msg_controllen = 0;
    }

    do {
        rc = sendmsg(conn_fd, &msg, 0);
    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

    if (vmsg->size) {
        do {
            if (vmsg->data) {
                rc = write(conn_fd, vmsg->data, vmsg->size);
            } else {
                rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size);
            }
        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
    }

    if (rc <= 0) {
        vu_panic(dev, "Error while writing: %s", strerror(errno));
        return false;
    }

    return true;
}
static bool
vu_send_reply(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    /* Set the version in the flags when sending the reply */
    vmsg->flags &= ~VHOST_USER_VERSION_MASK;
    vmsg->flags |= VHOST_USER_VERSION;
    vmsg->flags |= VHOST_USER_REPLY_MASK;

    return vu_message_write(dev, conn_fd, vmsg);
}
/*
 * Processes a reply on the backend channel.
 * Entered with backend_mutex held and releases it before exit.
 * Returns true on success.
 */
static bool
vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg)
{
    bool result = false;
    VhostUserMsg msg_reply;

    if ((vmsg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) {
        result = true;
        goto out;
    }

    if (!vu_message_read_default(dev, dev->backend_fd, &msg_reply)) {
        goto out;
    }

    if (msg_reply.request != vmsg->request) {
        DPRINT("Received unexpected msg type. Expected %d received %d",
               vmsg->request, msg_reply.request);
        goto out;
    }

    result = msg_reply.payload.u64 == 0;

out:
    pthread_mutex_unlock(&dev->backend_mutex);
    return result;
}
/* Kick the log_call_fd if required. */
static void
vu_log_kick(VuDev *dev)
{
    if (dev->log_call_fd != -1) {
        DPRINT("Kicking the QEMU's log...\n");
        if (eventfd_write(dev->log_call_fd, 1) < 0) {
            vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
        }
    }
}

static void
vu_log_page(uint8_t *log_table, uint64_t page)
{
    DPRINT("Logged dirty guest page: %"PRId64"\n", page);
    qatomic_or(&log_table[page / 8], 1 << (page % 8));
}

static void
vu_log_write(VuDev *dev, uint64_t address, uint64_t length)
{
    uint64_t page;

    if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
        !dev->log_table || !length) {
        return;
    }

    assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));

    page = address / VHOST_LOG_PAGE;
    while (page * VHOST_LOG_PAGE < address + length) {
        vu_log_page(dev->log_table, page);
        page += 1;
    }

    vu_log_kick(dev);
}
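
/*
 * Editor's note (worked example, not in the original source), assuming
 * VHOST_LOG_PAGE is 4096 as defined in libvhost-user.h: logging a write of
 * 0x100 bytes at guest address 0x5000 dirties page 5 only, i.e. vu_log_page()
 * sets bit 5 of log_table[0]; a write that crosses 0x5FFF into 0x6000 would
 * also set bit 6.
 */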
499 vu_kick_cb(VuDev
*dev
, int condition
, void *data
)
501 int index
= (intptr_t)data
;
502 VuVirtq
*vq
= &dev
->vq
[index
];
503 int sock
= vq
->kick_fd
;
507 rc
= eventfd_read(sock
, &kick_data
);
509 vu_panic(dev
, "kick eventfd_read(): %s", strerror(errno
));
510 dev
->remove_watch(dev
, dev
->vq
[index
].kick_fd
);
512 DPRINT("Got kick_data: %016"PRIx64
" handler:%p idx:%d\n",
513 kick_data
, vq
->handler
, index
);
515 vq
->handler(dev
, index
);
521 vu_get_features_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
525 * The following VIRTIO feature bits are supported by our virtqueue
528 1ULL << VIRTIO_F_NOTIFY_ON_EMPTY
|
529 1ULL << VIRTIO_RING_F_INDIRECT_DESC
|
530 1ULL << VIRTIO_RING_F_EVENT_IDX
|
531 1ULL << VIRTIO_F_VERSION_1
|
533 /* vhost-user feature bits */
534 1ULL << VHOST_F_LOG_ALL
|
535 1ULL << VHOST_USER_F_PROTOCOL_FEATURES
;
537 if (dev
->iface
->get_features
) {
538 vmsg
->payload
.u64
|= dev
->iface
->get_features(dev
);
541 vmsg
->size
= sizeof(vmsg
->payload
.u64
);
544 DPRINT("Sending back to guest u64: 0x%016"PRIx64
"\n", vmsg
->payload
.u64
);
550 vu_set_enable_all_rings(VuDev
*dev
, bool enabled
)
554 for (i
= 0; i
< dev
->max_queues
; i
++) {
555 dev
->vq
[i
].enable
= enabled
;
560 vu_set_features_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
562 DPRINT("u64: 0x%016"PRIx64
"\n", vmsg
->payload
.u64
);
564 dev
->features
= vmsg
->payload
.u64
;
565 if (!vu_has_feature(dev
, VIRTIO_F_VERSION_1
)) {
567 * We only support devices conforming to VIRTIO 1.0 or
570 vu_panic(dev
, "virtio legacy devices aren't supported by libvhost-user");
574 if (!(dev
->features
& VHOST_USER_F_PROTOCOL_FEATURES
)) {
575 vu_set_enable_all_rings(dev
, true);
578 if (dev
->iface
->set_features
) {
579 dev
->iface
->set_features(dev
, dev
->features
);
586 vu_set_owner_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
592 vu_close_log(VuDev
*dev
)
594 if (dev
->log_table
) {
595 if (munmap(dev
->log_table
, dev
->log_size
) != 0) {
596 perror("close log munmap() error");
599 dev
->log_table
= NULL
;
601 if (dev
->log_call_fd
!= -1) {
602 close(dev
->log_call_fd
);
603 dev
->log_call_fd
= -1;
608 vu_reset_device_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
610 vu_set_enable_all_rings(dev
, false);
616 map_ring(VuDev
*dev
, VuVirtq
*vq
)
618 vq
->vring
.desc
= qva_to_va(dev
, vq
->vra
.desc_user_addr
);
619 vq
->vring
.used
= qva_to_va(dev
, vq
->vra
.used_user_addr
);
620 vq
->vring
.avail
= qva_to_va(dev
, vq
->vra
.avail_user_addr
);
622 DPRINT("Setting virtq addresses:\n");
623 DPRINT(" vring_desc at %p\n", vq
->vring
.desc
);
624 DPRINT(" vring_used at %p\n", vq
->vring
.used
);
625 DPRINT(" vring_avail at %p\n", vq
->vring
.avail
);
627 return !(vq
->vring
.desc
&& vq
->vring
.used
&& vq
->vring
.avail
);
631 generate_faults(VuDev
*dev
) {
633 for (i
= 0; i
< dev
->nregions
; i
++) {
634 #ifdef UFFDIO_REGISTER
635 VuDevRegion
*dev_region
= &dev
->regions
[i
];
637 struct uffdio_register reg_struct
;
        /*
         * We should already have an open ufd. Mark each memory
         * range as ufd.
         * Discard any mapping we have here; note I can't use MADV_REMOVE
         * or fallocate to make the hole since I don't want to lose
         * data that's already arrived in the shared process.
         * TODO: How to do hugepages?
         */
647 ret
= madvise((void *)(uintptr_t)dev_region
->mmap_addr
,
648 dev_region
->size
+ dev_region
->mmap_offset
,
652 "%s: Failed to madvise(DONTNEED) region %d: %s\n",
653 __func__
, i
, strerror(errno
));
        /*
         * Turn off transparent hugepages so we don't get lost wakeups
         * in neighbouring pages.
         * TODO: Turn this back on later.
         */
660 ret
= madvise((void *)(uintptr_t)dev_region
->mmap_addr
,
661 dev_region
->size
+ dev_region
->mmap_offset
,
665 * Note: This can happen legally on kernels that are configured
666 * without madvise'able hugepages
669 "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n",
670 __func__
, i
, strerror(errno
));
673 reg_struct
.range
.start
= (uintptr_t)dev_region
->mmap_addr
;
674 reg_struct
.range
.len
= dev_region
->size
+ dev_region
->mmap_offset
;
675 reg_struct
.mode
= UFFDIO_REGISTER_MODE_MISSING
;
677 if (ioctl(dev
->postcopy_ufd
, UFFDIO_REGISTER
, ®_struct
)) {
678 vu_panic(dev
, "%s: Failed to userfault region %d "
679 "@%" PRIx64
" + size:%" PRIx64
" offset: %" PRIx64
682 dev_region
->mmap_addr
,
683 dev_region
->size
, dev_region
->mmap_offset
,
684 dev
->postcopy_ufd
, strerror(errno
));
687 if (!(reg_struct
.ioctls
& ((__u64
)1 << _UFFDIO_COPY
))) {
688 vu_panic(dev
, "%s Region (%d) doesn't support COPY",
692 DPRINT("%s: region %d: Registered userfault for %"
693 PRIx64
" + %" PRIx64
"\n", __func__
, i
,
694 (uint64_t)reg_struct
.range
.start
,
695 (uint64_t)reg_struct
.range
.len
);
696 /* Now it's registered we can let the client at it */
697 if (mprotect((void *)(uintptr_t)dev_region
->mmap_addr
,
698 dev_region
->size
+ dev_region
->mmap_offset
,
699 PROT_READ
| PROT_WRITE
)) {
700 vu_panic(dev
, "failed to mprotect region %d for postcopy (%s)",
704 /* TODO: Stash 'zero' support flags somewhere */
712 vu_add_mem_reg(VuDev
*dev
, VhostUserMsg
*vmsg
) {
714 bool track_ramblocks
= dev
->postcopy_listening
;
715 VhostUserMemoryRegion m
= vmsg
->payload
.memreg
.region
, *msg_region
= &m
;
716 VuDevRegion
*dev_region
= &dev
->regions
[dev
->nregions
];
719 if (vmsg
->fd_num
!= 1) {
720 vmsg_close_fds(vmsg
);
721 vu_panic(dev
, "VHOST_USER_ADD_MEM_REG received %d fds - only 1 fd "
722 "should be sent for this message type", vmsg
->fd_num
);
726 if (vmsg
->size
< VHOST_USER_MEM_REG_SIZE
) {
728 vu_panic(dev
, "VHOST_USER_ADD_MEM_REG requires a message size of at "
729 "least %zu bytes and only %d bytes were received",
730 VHOST_USER_MEM_REG_SIZE
, vmsg
->size
);
734 if (dev
->nregions
== VHOST_USER_MAX_RAM_SLOTS
) {
736 vu_panic(dev
, "failing attempt to hot add memory via "
737 "VHOST_USER_ADD_MEM_REG message because the backend has "
738 "no free ram slots available");
743 * If we are in postcopy mode and we receive a u64 payload with a 0 value
744 * we know all the postcopy client bases have been received, and we
745 * should start generating faults.
747 if (track_ramblocks
&&
748 vmsg
->size
== sizeof(vmsg
->payload
.u64
) &&
749 vmsg
->payload
.u64
== 0) {
750 (void)generate_faults(dev
);
754 DPRINT("Adding region: %u\n", dev
->nregions
);
755 DPRINT(" guest_phys_addr: 0x%016"PRIx64
"\n",
756 msg_region
->guest_phys_addr
);
757 DPRINT(" memory_size: 0x%016"PRIx64
"\n",
758 msg_region
->memory_size
);
759 DPRINT(" userspace_addr 0x%016"PRIx64
"\n",
760 msg_region
->userspace_addr
);
761 DPRINT(" mmap_offset 0x%016"PRIx64
"\n",
762 msg_region
->mmap_offset
);
764 dev_region
->gpa
= msg_region
->guest_phys_addr
;
765 dev_region
->size
= msg_region
->memory_size
;
766 dev_region
->qva
= msg_region
->userspace_addr
;
767 dev_region
->mmap_offset
= msg_region
->mmap_offset
;
    /*
     * We don't use the offset argument of mmap() since the
     * mapped address has to be page aligned, and we use huge
     * pages.
     */
774 if (track_ramblocks
) {
776 * In postcopy we're using PROT_NONE here to catch anyone
777 * accessing it before we userfault.
779 mmap_addr
= mmap(0, dev_region
->size
+ dev_region
->mmap_offset
,
780 PROT_NONE
, MAP_SHARED
| MAP_NORESERVE
,
783 mmap_addr
= mmap(0, dev_region
->size
+ dev_region
->mmap_offset
,
784 PROT_READ
| PROT_WRITE
, MAP_SHARED
| MAP_NORESERVE
,
788 if (mmap_addr
== MAP_FAILED
) {
789 vu_panic(dev
, "region mmap error: %s", strerror(errno
));
791 dev_region
->mmap_addr
= (uint64_t)(uintptr_t)mmap_addr
;
792 DPRINT(" mmap_addr: 0x%016"PRIx64
"\n",
793 dev_region
->mmap_addr
);
798 if (track_ramblocks
) {
800 * Return the address to QEMU so that it can translate the ufd
801 * fault addresses back.
803 msg_region
->userspace_addr
= (uintptr_t)(mmap_addr
+
804 dev_region
->mmap_offset
);
806 /* Send the message back to qemu with the addresses filled in. */
808 DPRINT("Successfully added new region in postcopy\n");
812 for (i
= 0; i
< dev
->max_queues
; i
++) {
813 if (dev
->vq
[i
].vring
.desc
) {
814 if (map_ring(dev
, &dev
->vq
[i
])) {
815 vu_panic(dev
, "remapping queue %d for new memory region",
821 DPRINT("Successfully added new region\n");
static inline bool reg_equal(VuDevRegion *vudev_reg,
                             VhostUserMemoryRegion *msg_reg)
{
    if (vudev_reg->gpa == msg_reg->guest_phys_addr &&
        vudev_reg->qva == msg_reg->userspace_addr &&
        vudev_reg->size == msg_reg->memory_size) {
        return true;
    }

    return false;
}
840 vu_rem_mem_reg(VuDev
*dev
, VhostUserMsg
*vmsg
) {
841 VhostUserMemoryRegion m
= vmsg
->payload
.memreg
.region
, *msg_region
= &m
;
845 if (vmsg
->fd_num
> 1) {
846 vmsg_close_fds(vmsg
);
847 vu_panic(dev
, "VHOST_USER_REM_MEM_REG received %d fds - at most 1 fd "
848 "should be sent for this message type", vmsg
->fd_num
);
852 if (vmsg
->size
< VHOST_USER_MEM_REG_SIZE
) {
853 vmsg_close_fds(vmsg
);
854 vu_panic(dev
, "VHOST_USER_REM_MEM_REG requires a message size of at "
855 "least %zu bytes and only %d bytes were received",
856 VHOST_USER_MEM_REG_SIZE
, vmsg
->size
);
860 DPRINT("Removing region:\n");
861 DPRINT(" guest_phys_addr: 0x%016"PRIx64
"\n",
862 msg_region
->guest_phys_addr
);
863 DPRINT(" memory_size: 0x%016"PRIx64
"\n",
864 msg_region
->memory_size
);
865 DPRINT(" userspace_addr 0x%016"PRIx64
"\n",
866 msg_region
->userspace_addr
);
867 DPRINT(" mmap_offset 0x%016"PRIx64
"\n",
868 msg_region
->mmap_offset
);
870 for (i
= 0; i
< dev
->nregions
; i
++) {
871 if (reg_equal(&dev
->regions
[i
], msg_region
)) {
872 VuDevRegion
*r
= &dev
->regions
[i
];
873 void *ma
= (void *) (uintptr_t) r
->mmap_addr
;
876 munmap(ma
, r
->size
+ r
->mmap_offset
);
880 * Shift all affected entries by 1 to close the hole at index i and
881 * zero out the last entry.
883 memmove(dev
->regions
+ i
, dev
->regions
+ i
+ 1,
884 sizeof(VuDevRegion
) * (dev
->nregions
- i
- 1));
885 memset(dev
->regions
+ dev
->nregions
- 1, 0, sizeof(VuDevRegion
));
886 DPRINT("Successfully removed a region\n");
892 /* Continue the search for eventual duplicates. */
897 vu_panic(dev
, "Specified region not found\n");
900 vmsg_close_fds(vmsg
);
906 vu_get_shared_object(VuDev
*dev
, VhostUserMsg
*vmsg
)
910 if (dev
->iface
->get_shared_object
) {
911 dmabuf_fd
= dev
->iface
->get_shared_object(
912 dev
, &vmsg
->payload
.object
.uuid
[0]);
914 if (dmabuf_fd
!= -1) {
915 DPRINT("dmabuf_fd found for requested UUID\n");
916 vmsg
->fds
[fd_num
++] = dmabuf_fd
;
918 vmsg
->fd_num
= fd_num
;
924 vu_set_mem_table_exec_postcopy(VuDev
*dev
, VhostUserMsg
*vmsg
)
927 VhostUserMemory m
= vmsg
->payload
.memory
, *memory
= &m
;
928 dev
->nregions
= memory
->nregions
;
930 DPRINT("Nregions: %u\n", memory
->nregions
);
931 for (i
= 0; i
< dev
->nregions
; i
++) {
933 VhostUserMemoryRegion
*msg_region
= &memory
->regions
[i
];
934 VuDevRegion
*dev_region
= &dev
->regions
[i
];
936 DPRINT("Region %d\n", i
);
937 DPRINT(" guest_phys_addr: 0x%016"PRIx64
"\n",
938 msg_region
->guest_phys_addr
);
939 DPRINT(" memory_size: 0x%016"PRIx64
"\n",
940 msg_region
->memory_size
);
941 DPRINT(" userspace_addr 0x%016"PRIx64
"\n",
942 msg_region
->userspace_addr
);
943 DPRINT(" mmap_offset 0x%016"PRIx64
"\n",
944 msg_region
->mmap_offset
);
946 dev_region
->gpa
= msg_region
->guest_phys_addr
;
947 dev_region
->size
= msg_region
->memory_size
;
948 dev_region
->qva
= msg_region
->userspace_addr
;
949 dev_region
->mmap_offset
= msg_region
->mmap_offset
;
        /*
         * We don't use the offset argument of mmap() since the
         * mapped address has to be page aligned, and we use huge
         * pages.
         * In postcopy we're using PROT_NONE here to catch anyone
         * accessing it before we userfault.
         */
957 mmap_addr
= mmap(0, dev_region
->size
+ dev_region
->mmap_offset
,
958 PROT_NONE
, MAP_SHARED
| MAP_NORESERVE
,
961 if (mmap_addr
== MAP_FAILED
) {
962 vu_panic(dev
, "region mmap error: %s", strerror(errno
));
964 dev_region
->mmap_addr
= (uint64_t)(uintptr_t)mmap_addr
;
965 DPRINT(" mmap_addr: 0x%016"PRIx64
"\n",
966 dev_region
->mmap_addr
);
969 /* Return the address to QEMU so that it can translate the ufd
970 * fault addresses back.
972 msg_region
->userspace_addr
= (uintptr_t)(mmap_addr
+
973 dev_region
->mmap_offset
);
977 /* Send the message back to qemu with the addresses filled in */
979 if (!vu_send_reply(dev
, dev
->sock
, vmsg
)) {
980 vu_panic(dev
, "failed to respond to set-mem-table for postcopy");
984 /* Wait for QEMU to confirm that it's registered the handler for the
987 if (!dev
->read_msg(dev
, dev
->sock
, vmsg
) ||
988 vmsg
->size
!= sizeof(vmsg
->payload
.u64
) ||
989 vmsg
->payload
.u64
!= 0) {
990 vu_panic(dev
, "failed to receive valid ack for postcopy set-mem-table");
994 /* OK, now we can go and register the memory and generate faults */
995 (void)generate_faults(dev
);
1001 vu_set_mem_table_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
1004 VhostUserMemory m
= vmsg
->payload
.memory
, *memory
= &m
;
1006 for (i
= 0; i
< dev
->nregions
; i
++) {
1007 VuDevRegion
*r
= &dev
->regions
[i
];
1008 void *ma
= (void *) (uintptr_t) r
->mmap_addr
;
1011 munmap(ma
, r
->size
+ r
->mmap_offset
);
1014 dev
->nregions
= memory
->nregions
;
1016 if (dev
->postcopy_listening
) {
1017 return vu_set_mem_table_exec_postcopy(dev
, vmsg
);
1020 DPRINT("Nregions: %u\n", memory
->nregions
);
1021 for (i
= 0; i
< dev
->nregions
; i
++) {
1023 VhostUserMemoryRegion
*msg_region
= &memory
->regions
[i
];
1024 VuDevRegion
*dev_region
= &dev
->regions
[i
];
1026 DPRINT("Region %d\n", i
);
1027 DPRINT(" guest_phys_addr: 0x%016"PRIx64
"\n",
1028 msg_region
->guest_phys_addr
);
1029 DPRINT(" memory_size: 0x%016"PRIx64
"\n",
1030 msg_region
->memory_size
);
1031 DPRINT(" userspace_addr 0x%016"PRIx64
"\n",
1032 msg_region
->userspace_addr
);
1033 DPRINT(" mmap_offset 0x%016"PRIx64
"\n",
1034 msg_region
->mmap_offset
);
1036 dev_region
->gpa
= msg_region
->guest_phys_addr
;
1037 dev_region
->size
= msg_region
->memory_size
;
1038 dev_region
->qva
= msg_region
->userspace_addr
;
1039 dev_region
->mmap_offset
= msg_region
->mmap_offset
;
        /*
         * We don't use the offset argument of mmap() since the
         * mapped address has to be page aligned, and we use huge
         * pages.
         */
1044 mmap_addr
= mmap(0, dev_region
->size
+ dev_region
->mmap_offset
,
1045 PROT_READ
| PROT_WRITE
, MAP_SHARED
| MAP_NORESERVE
,
1048 if (mmap_addr
== MAP_FAILED
) {
1049 vu_panic(dev
, "region mmap error: %s", strerror(errno
));
1051 dev_region
->mmap_addr
= (uint64_t)(uintptr_t)mmap_addr
;
1052 DPRINT(" mmap_addr: 0x%016"PRIx64
"\n",
1053 dev_region
->mmap_addr
);
1056 close(vmsg
->fds
[i
]);
1059 for (i
= 0; i
< dev
->max_queues
; i
++) {
1060 if (dev
->vq
[i
].vring
.desc
) {
1061 if (map_ring(dev
, &dev
->vq
[i
])) {
1062 vu_panic(dev
, "remapping queue %d during setmemtable", i
);
1071 vu_set_log_base_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
1074 uint64_t log_mmap_size
, log_mmap_offset
;
1077 if (vmsg
->fd_num
!= 1 ||
1078 vmsg
->size
!= sizeof(vmsg
->payload
.log
)) {
1079 vu_panic(dev
, "Invalid log_base message");
1084 log_mmap_offset
= vmsg
->payload
.log
.mmap_offset
;
1085 log_mmap_size
= vmsg
->payload
.log
.mmap_size
;
1086 DPRINT("Log mmap_offset: %"PRId64
"\n", log_mmap_offset
);
1087 DPRINT("Log mmap_size: %"PRId64
"\n", log_mmap_size
);
1089 rc
= mmap(0, log_mmap_size
, PROT_READ
| PROT_WRITE
, MAP_SHARED
, fd
,
1092 if (rc
== MAP_FAILED
) {
1093 perror("log mmap error");
1096 if (dev
->log_table
) {
1097 munmap(dev
->log_table
, dev
->log_size
);
1099 dev
->log_table
= rc
;
1100 dev
->log_size
= log_mmap_size
;
1102 vmsg
->size
= sizeof(vmsg
->payload
.u64
);
1109 vu_set_log_fd_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
1111 if (vmsg
->fd_num
!= 1) {
1112 vu_panic(dev
, "Invalid log_fd message");
1116 if (dev
->log_call_fd
!= -1) {
1117 close(dev
->log_call_fd
);
1119 dev
->log_call_fd
= vmsg
->fds
[0];
1120 DPRINT("Got log_call_fd: %d\n", vmsg
->fds
[0]);
1126 vu_set_vring_num_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
1128 unsigned int index
= vmsg
->payload
.state
.index
;
1129 unsigned int num
= vmsg
->payload
.state
.num
;
1131 DPRINT("State.index: %u\n", index
);
1132 DPRINT("State.num: %u\n", num
);
1133 dev
->vq
[index
].vring
.num
= num
;
1139 vu_set_vring_addr_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
1141 struct vhost_vring_addr addr
= vmsg
->payload
.addr
, *vra
= &addr
;
1142 unsigned int index
= vra
->index
;
1143 VuVirtq
*vq
= &dev
->vq
[index
];
1145 DPRINT("vhost_vring_addr:\n");
1146 DPRINT(" index: %d\n", vra
->index
);
1147 DPRINT(" flags: %d\n", vra
->flags
);
1148 DPRINT(" desc_user_addr: 0x%016" PRIx64
"\n", (uint64_t)vra
->desc_user_addr
);
1149 DPRINT(" used_user_addr: 0x%016" PRIx64
"\n", (uint64_t)vra
->used_user_addr
);
1150 DPRINT(" avail_user_addr: 0x%016" PRIx64
"\n", (uint64_t)vra
->avail_user_addr
);
1151 DPRINT(" log_guest_addr: 0x%016" PRIx64
"\n", (uint64_t)vra
->log_guest_addr
);
1154 vq
->vring
.flags
= vra
->flags
;
1155 vq
->vring
.log_guest_addr
= vra
->log_guest_addr
;
1158 if (map_ring(dev
, vq
)) {
1159 vu_panic(dev
, "Invalid vring_addr message");
1163 vq
->used_idx
= le16toh(vq
->vring
.used
->idx
);
1165 if (vq
->last_avail_idx
!= vq
->used_idx
) {
1166 bool resume
= dev
->iface
->queue_is_processed_in_order
&&
1167 dev
->iface
->queue_is_processed_in_order(dev
, index
);
1169 DPRINT("Last avail index != used index: %u != %u%s\n",
1170 vq
->last_avail_idx
, vq
->used_idx
,
1171 resume
? ", resuming" : "");
1174 vq
->shadow_avail_idx
= vq
->last_avail_idx
= vq
->used_idx
;
1182 vu_set_vring_base_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
1184 unsigned int index
= vmsg
->payload
.state
.index
;
1185 unsigned int num
= vmsg
->payload
.state
.num
;
1187 DPRINT("State.index: %u\n", index
);
1188 DPRINT("State.num: %u\n", num
);
1189 dev
->vq
[index
].shadow_avail_idx
= dev
->vq
[index
].last_avail_idx
= num
;
1195 vu_get_vring_base_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
1197 unsigned int index
= vmsg
->payload
.state
.index
;
1199 DPRINT("State.index: %u\n", index
);
1200 vmsg
->payload
.state
.num
= dev
->vq
[index
].last_avail_idx
;
1201 vmsg
->size
= sizeof(vmsg
->payload
.state
);
1203 dev
->vq
[index
].started
= false;
1204 if (dev
->iface
->queue_set_started
) {
1205 dev
->iface
->queue_set_started(dev
, index
, false);
1208 if (dev
->vq
[index
].call_fd
!= -1) {
1209 close(dev
->vq
[index
].call_fd
);
1210 dev
->vq
[index
].call_fd
= -1;
1212 if (dev
->vq
[index
].kick_fd
!= -1) {
1213 dev
->remove_watch(dev
, dev
->vq
[index
].kick_fd
);
1214 close(dev
->vq
[index
].kick_fd
);
1215 dev
->vq
[index
].kick_fd
= -1;
1222 vu_check_queue_msg_file(VuDev
*dev
, VhostUserMsg
*vmsg
)
1224 int index
= vmsg
->payload
.u64
& VHOST_USER_VRING_IDX_MASK
;
1225 bool nofd
= vmsg
->payload
.u64
& VHOST_USER_VRING_NOFD_MASK
;
1227 if (index
>= dev
->max_queues
) {
1228 vmsg_close_fds(vmsg
);
1229 vu_panic(dev
, "Invalid queue index: %u", index
);
1234 vmsg_close_fds(vmsg
);
1238 if (vmsg
->fd_num
!= 1) {
1239 vmsg_close_fds(vmsg
);
1240 vu_panic(dev
, "Invalid fds in request: %d", vmsg
->request
);
1248 inflight_desc_compare(const void *a
, const void *b
)
1250 VuVirtqInflightDesc
*desc0
= (VuVirtqInflightDesc
*)a
,
1251 *desc1
= (VuVirtqInflightDesc
*)b
;
1253 if (desc1
->counter
> desc0
->counter
&&
1254 (desc1
->counter
- desc0
->counter
) < VIRTQUEUE_MAX_SIZE
* 2) {
1262 vu_check_queue_inflights(VuDev
*dev
, VuVirtq
*vq
)
1266 if (!vu_has_protocol_feature(dev
, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
)) {
1270 if (unlikely(!vq
->inflight
)) {
1274 if (unlikely(!vq
->inflight
->version
)) {
1275 /* initialize the buffer */
1276 vq
->inflight
->version
= INFLIGHT_VERSION
;
1280 vq
->used_idx
= le16toh(vq
->vring
.used
->idx
);
1281 vq
->resubmit_num
= 0;
1282 vq
->resubmit_list
= NULL
;
1285 if (unlikely(vq
->inflight
->used_idx
!= vq
->used_idx
)) {
1286 vq
->inflight
->desc
[vq
->inflight
->last_batch_head
].inflight
= 0;
1290 vq
->inflight
->used_idx
= vq
->used_idx
;
1293 for (i
= 0; i
< vq
->inflight
->desc_num
; i
++) {
1294 if (vq
->inflight
->desc
[i
].inflight
== 1) {
1299 vq
->shadow_avail_idx
= vq
->last_avail_idx
= vq
->inuse
+ vq
->used_idx
;
1302 vq
->resubmit_list
= calloc(vq
->inuse
, sizeof(VuVirtqInflightDesc
));
1303 if (!vq
->resubmit_list
) {
1307 for (i
= 0; i
< vq
->inflight
->desc_num
; i
++) {
1308 if (vq
->inflight
->desc
[i
].inflight
) {
1309 vq
->resubmit_list
[vq
->resubmit_num
].index
= i
;
1310 vq
->resubmit_list
[vq
->resubmit_num
].counter
=
1311 vq
->inflight
->desc
[i
].counter
;
1316 if (vq
->resubmit_num
> 1) {
1317 qsort(vq
->resubmit_list
, vq
->resubmit_num
,
1318 sizeof(VuVirtqInflightDesc
), inflight_desc_compare
);
1320 vq
->counter
= vq
->resubmit_list
[0].counter
+ 1;
1323 /* in case of I/O hang after reconnecting */
1324 if (eventfd_write(vq
->kick_fd
, 1)) {
1332 vu_set_vring_kick_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
1334 int index
= vmsg
->payload
.u64
& VHOST_USER_VRING_IDX_MASK
;
1335 bool nofd
= vmsg
->payload
.u64
& VHOST_USER_VRING_NOFD_MASK
;
1337 DPRINT("u64: 0x%016"PRIx64
"\n", vmsg
->payload
.u64
);
1339 if (!vu_check_queue_msg_file(dev
, vmsg
)) {
1343 if (dev
->vq
[index
].kick_fd
!= -1) {
1344 dev
->remove_watch(dev
, dev
->vq
[index
].kick_fd
);
1345 close(dev
->vq
[index
].kick_fd
);
1346 dev
->vq
[index
].kick_fd
= -1;
1349 dev
->vq
[index
].kick_fd
= nofd
? -1 : vmsg
->fds
[0];
1350 DPRINT("Got kick_fd: %d for vq: %d\n", dev
->vq
[index
].kick_fd
, index
);
1352 dev
->vq
[index
].started
= true;
1353 if (dev
->iface
->queue_set_started
) {
1354 dev
->iface
->queue_set_started(dev
, index
, true);
1357 if (dev
->vq
[index
].kick_fd
!= -1 && dev
->vq
[index
].handler
) {
1358 dev
->set_watch(dev
, dev
->vq
[index
].kick_fd
, VU_WATCH_IN
,
1359 vu_kick_cb
, (void *)(long)index
);
1361 DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
1362 dev
->vq
[index
].kick_fd
, index
);
1365 if (vu_check_queue_inflights(dev
, &dev
->vq
[index
])) {
1366 vu_panic(dev
, "Failed to check inflights for vq: %d\n", index
);
1372 void vu_set_queue_handler(VuDev
*dev
, VuVirtq
*vq
,
1373 vu_queue_handler_cb handler
)
1375 int qidx
= vq
- dev
->vq
;
1377 vq
->handler
= handler
;
1378 if (vq
->kick_fd
>= 0) {
1380 dev
->set_watch(dev
, vq
->kick_fd
, VU_WATCH_IN
,
1381 vu_kick_cb
, (void *)(long)qidx
);
1383 dev
->remove_watch(dev
, vq
->kick_fd
);
1388 bool vu_set_queue_host_notifier(VuDev
*dev
, VuVirtq
*vq
, int fd
,
1389 int size
, int offset
)
1391 int qidx
= vq
- dev
->vq
;
1393 VhostUserMsg vmsg
= {
1394 .request
= VHOST_USER_BACKEND_VRING_HOST_NOTIFIER_MSG
,
1395 .flags
= VHOST_USER_VERSION
| VHOST_USER_NEED_REPLY_MASK
,
1396 .size
= sizeof(vmsg
.payload
.area
),
1398 .u64
= qidx
& VHOST_USER_VRING_IDX_MASK
,
1405 vmsg
.payload
.area
.u64
|= VHOST_USER_VRING_NOFD_MASK
;
1407 vmsg
.fds
[fd_num
++] = fd
;
1410 vmsg
.fd_num
= fd_num
;
1412 if (!vu_has_protocol_feature(dev
, VHOST_USER_PROTOCOL_F_BACKEND_SEND_FD
)) {
1416 pthread_mutex_lock(&dev
->backend_mutex
);
1417 if (!vu_message_write(dev
, dev
->backend_fd
, &vmsg
)) {
1418 pthread_mutex_unlock(&dev
->backend_mutex
);
1422 /* Also unlocks the backend_mutex */
1423 return vu_process_message_reply(dev
, &vmsg
);
1427 vu_lookup_shared_object(VuDev
*dev
, unsigned char uuid
[UUID_LEN
],
1430 bool result
= false;
1431 VhostUserMsg msg_reply
;
1432 VhostUserMsg msg
= {
1433 .request
= VHOST_USER_BACKEND_SHARED_OBJECT_LOOKUP
,
1434 .size
= sizeof(msg
.payload
.object
),
1435 .flags
= VHOST_USER_VERSION
| VHOST_USER_NEED_REPLY_MASK
,
1438 memcpy(msg
.payload
.object
.uuid
, uuid
, sizeof(uuid
[0]) * UUID_LEN
);
1440 if (!vu_has_protocol_feature(dev
, VHOST_USER_PROTOCOL_F_SHARED_OBJECT
)) {
1444 pthread_mutex_lock(&dev
->backend_mutex
);
1445 if (!vu_message_write(dev
, dev
->backend_fd
, &msg
)) {
1449 if (!vu_message_read_default(dev
, dev
->backend_fd
, &msg_reply
)) {
1453 if (msg_reply
.request
!= msg
.request
) {
1454 DPRINT("Received unexpected msg type. Expected %d, received %d",
1455 msg
.request
, msg_reply
.request
);
1459 if (msg_reply
.fd_num
!= 1) {
1460 DPRINT("Received unexpected number of fds. Expected 1, received %d",
1465 *dmabuf_fd
= msg_reply
.fds
[0];
1466 result
= *dmabuf_fd
> 0 && msg_reply
.payload
.u64
== 0;
1468 pthread_mutex_unlock(&dev
->backend_mutex
);
1474 vu_send_message(VuDev
*dev
, VhostUserMsg
*vmsg
)
1476 bool result
= false;
1477 pthread_mutex_lock(&dev
->backend_mutex
);
1478 if (!vu_message_write(dev
, dev
->backend_fd
, vmsg
)) {
1484 pthread_mutex_unlock(&dev
->backend_mutex
);
1490 vu_add_shared_object(VuDev
*dev
, unsigned char uuid
[UUID_LEN
])
1492 VhostUserMsg msg
= {
1493 .request
= VHOST_USER_BACKEND_SHARED_OBJECT_ADD
,
1494 .size
= sizeof(msg
.payload
.object
),
1495 .flags
= VHOST_USER_VERSION
,
1498 memcpy(msg
.payload
.object
.uuid
, uuid
, sizeof(uuid
[0]) * UUID_LEN
);
1500 if (!vu_has_protocol_feature(dev
, VHOST_USER_PROTOCOL_F_SHARED_OBJECT
)) {
1504 return vu_send_message(dev
, &msg
);
1508 vu_rm_shared_object(VuDev
*dev
, unsigned char uuid
[UUID_LEN
])
1510 VhostUserMsg msg
= {
1511 .request
= VHOST_USER_BACKEND_SHARED_OBJECT_REMOVE
,
1512 .size
= sizeof(msg
.payload
.object
),
1513 .flags
= VHOST_USER_VERSION
,
1516 memcpy(msg
.payload
.object
.uuid
, uuid
, sizeof(uuid
[0]) * UUID_LEN
);
1518 if (!vu_has_protocol_feature(dev
, VHOST_USER_PROTOCOL_F_SHARED_OBJECT
)) {
1522 return vu_send_message(dev
, &msg
);
1526 vu_set_vring_call_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
1528 int index
= vmsg
->payload
.u64
& VHOST_USER_VRING_IDX_MASK
;
1529 bool nofd
= vmsg
->payload
.u64
& VHOST_USER_VRING_NOFD_MASK
;
1531 DPRINT("u64: 0x%016"PRIx64
"\n", vmsg
->payload
.u64
);
1533 if (!vu_check_queue_msg_file(dev
, vmsg
)) {
1537 if (dev
->vq
[index
].call_fd
!= -1) {
1538 close(dev
->vq
[index
].call_fd
);
1539 dev
->vq
[index
].call_fd
= -1;
1542 dev
->vq
[index
].call_fd
= nofd
? -1 : vmsg
->fds
[0];
1544 /* in case of I/O hang after reconnecting */
1545 if (dev
->vq
[index
].call_fd
!= -1 && eventfd_write(vmsg
->fds
[0], 1)) {
1549 DPRINT("Got call_fd: %d for vq: %d\n", dev
->vq
[index
].call_fd
, index
);
1555 vu_set_vring_err_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
1557 int index
= vmsg
->payload
.u64
& VHOST_USER_VRING_IDX_MASK
;
1558 bool nofd
= vmsg
->payload
.u64
& VHOST_USER_VRING_NOFD_MASK
;
1560 DPRINT("u64: 0x%016"PRIx64
"\n", vmsg
->payload
.u64
);
1562 if (!vu_check_queue_msg_file(dev
, vmsg
)) {
1566 if (dev
->vq
[index
].err_fd
!= -1) {
1567 close(dev
->vq
[index
].err_fd
);
1568 dev
->vq
[index
].err_fd
= -1;
1571 dev
->vq
[index
].err_fd
= nofd
? -1 : vmsg
->fds
[0];
1577 vu_get_protocol_features_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
    /*
     * Note that we support, but intentionally do not set,
     * VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS. This means that
     * a device implementation can return it in its callback
     * (get_protocol_features) if it wants to use this for
     * simulation, but it is otherwise not desirable (if even
     * implemented by the frontend).
     */
1587 uint64_t features
= 1ULL << VHOST_USER_PROTOCOL_F_MQ
|
1588 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD
|
1589 1ULL << VHOST_USER_PROTOCOL_F_BACKEND_REQ
|
1590 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
|
1591 1ULL << VHOST_USER_PROTOCOL_F_BACKEND_SEND_FD
|
1592 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK
|
1593 1ULL << VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS
;
1595 if (have_userfault()) {
1596 features
|= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT
;
1599 if (dev
->iface
->get_config
&& dev
->iface
->set_config
) {
1600 features
|= 1ULL << VHOST_USER_PROTOCOL_F_CONFIG
;
1603 if (dev
->iface
->get_protocol_features
) {
1604 features
|= dev
->iface
->get_protocol_features(dev
);
1607 vmsg_set_reply_u64(vmsg
, features
);
1612 vu_set_protocol_features_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
1614 uint64_t features
= vmsg
->payload
.u64
;
1616 DPRINT("u64: 0x%016"PRIx64
"\n", features
);
1618 dev
->protocol_features
= vmsg
->payload
.u64
;
1620 if (vu_has_protocol_feature(dev
,
1621 VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS
) &&
1622 (!vu_has_protocol_feature(dev
, VHOST_USER_PROTOCOL_F_BACKEND_REQ
) ||
1623 !vu_has_protocol_feature(dev
, VHOST_USER_PROTOCOL_F_REPLY_ACK
))) {
        /*
         * The use case for using messages for kick/call is simulation, to make
         * the kick and call synchronous. To actually get that behaviour, both
         * of the other features are required.
         * Theoretically, one could use only kick messages, or do them without
         * having F_REPLY_ACK, but too many (possibly pending) messages on the
         * socket will eventually cause the frontend to hang. To avoid this in
         * scenarios where it is not desired, enforce that the settings
         * actually enable the simulation case.
         */
1635 "F_IN_BAND_NOTIFICATIONS requires F_BACKEND_REQ && F_REPLY_ACK");
1639 if (dev
->iface
->set_protocol_features
) {
1640 dev
->iface
->set_protocol_features(dev
, features
);
1647 vu_get_queue_num_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
1649 vmsg_set_reply_u64(vmsg
, dev
->max_queues
);
1654 vu_set_vring_enable_exec(VuDev
*dev
, VhostUserMsg
*vmsg
)
1656 unsigned int index
= vmsg
->payload
.state
.index
;
1657 unsigned int enable
= vmsg
->payload
.state
.num
;
1659 DPRINT("State.index: %u\n", index
);
1660 DPRINT("State.enable: %u\n", enable
);
1662 if (index
>= dev
->max_queues
) {
1663 vu_panic(dev
, "Invalid vring_enable index: %u", index
);
1667 dev
->vq
[index
].enable
= enable
;
1672 vu_set_backend_req_fd(VuDev
*dev
, VhostUserMsg
*vmsg
)
1674 if (vmsg
->fd_num
!= 1) {
1675 vu_panic(dev
, "Invalid backend_req_fd message (%d fd's)", vmsg
->fd_num
);
1679 if (dev
->backend_fd
!= -1) {
1680 close(dev
->backend_fd
);
1682 dev
->backend_fd
= vmsg
->fds
[0];
1683 DPRINT("Got backend_fd: %d\n", vmsg
->fds
[0]);
1689 vu_get_config(VuDev
*dev
, VhostUserMsg
*vmsg
)
1693 if (dev
->iface
->get_config
) {
1694 ret
= dev
->iface
->get_config(dev
, vmsg
->payload
.config
.region
,
1695 vmsg
->payload
.config
.size
);
1699 /* resize to zero to indicate an error to frontend */
1707 vu_set_config(VuDev
*dev
, VhostUserMsg
*vmsg
)
1711 if (dev
->iface
->set_config
) {
1712 ret
= dev
->iface
->set_config(dev
, vmsg
->payload
.config
.region
,
1713 vmsg
->payload
.config
.offset
,
1714 vmsg
->payload
.config
.size
,
1715 vmsg
->payload
.config
.flags
);
1717 vu_panic(dev
, "Set virtio configuration space failed");
1725 vu_set_postcopy_advise(VuDev
*dev
, VhostUserMsg
*vmsg
)
1728 struct uffdio_api api_struct
;
1730 dev
->postcopy_ufd
= syscall(__NR_userfaultfd
, O_CLOEXEC
| O_NONBLOCK
);
1733 dev
->postcopy_ufd
= -1;
1736 if (dev
->postcopy_ufd
== -1) {
1737 vu_panic(dev
, "Userfaultfd not available: %s", strerror(errno
));
1742 api_struct
.api
= UFFD_API
;
1743 api_struct
.features
= 0;
1744 if (ioctl(dev
->postcopy_ufd
, UFFDIO_API
, &api_struct
)) {
1745 vu_panic(dev
, "Failed UFFDIO_API: %s", strerror(errno
));
1746 close(dev
->postcopy_ufd
);
1747 dev
->postcopy_ufd
= -1;
1750 /* TODO: Stash feature flags somewhere */
1754 /* Return a ufd to the QEMU */
1756 vmsg
->fds
[0] = dev
->postcopy_ufd
;
1757 return true; /* = send a reply */
1761 vu_set_postcopy_listen(VuDev
*dev
, VhostUserMsg
*vmsg
)
1763 if (dev
->nregions
) {
1764 vu_panic(dev
, "Regions already registered at postcopy-listen");
1765 vmsg_set_reply_u64(vmsg
, -1);
1768 dev
->postcopy_listening
= true;
1770 vmsg_set_reply_u64(vmsg
, 0);
1775 vu_set_postcopy_end(VuDev
*dev
, VhostUserMsg
*vmsg
)
1777 DPRINT("%s: Entry\n", __func__
);
1778 dev
->postcopy_listening
= false;
1779 if (dev
->postcopy_ufd
> 0) {
1780 close(dev
->postcopy_ufd
);
1781 dev
->postcopy_ufd
= -1;
1782 DPRINT("%s: Done close\n", __func__
);
1785 vmsg_set_reply_u64(vmsg
, 0);
1786 DPRINT("%s: exit\n", __func__
);
static inline uint64_t
vu_inflight_queue_size(uint16_t queue_size)
{
    return ALIGN_UP(sizeof(VuDescStateSplit) * queue_size +
           sizeof(uint16_t), INFLIGHT_ALIGNMENT);
}
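
/*
 * Editor's note (worked example, not in the original source), assuming
 * sizeof(VuDescStateSplit) is 16 bytes as laid out in libvhost-user.h:
 * for queue_size = 128 the per-queue area is 16 * 128 + 2 = 2050 bytes,
 * which ALIGN_UP rounds to 2112 so that each queue's slice of the inflight
 * buffer starts on an INFLIGHT_ALIGNMENT (64-byte) boundary.
 */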
1797 #ifdef MFD_ALLOW_SEALING
1799 memfd_alloc(const char *name
, size_t size
, unsigned int flags
, int *fd
)
1804 *fd
= memfd_create(name
, MFD_ALLOW_SEALING
);
1809 ret
= ftruncate(*fd
, size
);
1815 ret
= fcntl(*fd
, F_ADD_SEALS
, flags
);
1821 ptr
= mmap(0, size
, PROT_READ
| PROT_WRITE
, MAP_SHARED
, *fd
, 0);
1822 if (ptr
== MAP_FAILED
) {
1832 vu_get_inflight_fd(VuDev
*dev
, VhostUserMsg
*vmsg
)
1837 uint16_t num_queues
, queue_size
;
1839 if (vmsg
->size
!= sizeof(vmsg
->payload
.inflight
)) {
1840 vu_panic(dev
, "Invalid get_inflight_fd message:%d", vmsg
->size
);
1841 vmsg
->payload
.inflight
.mmap_size
= 0;
1845 num_queues
= vmsg
->payload
.inflight
.num_queues
;
1846 queue_size
= vmsg
->payload
.inflight
.queue_size
;
1848 DPRINT("set_inflight_fd num_queues: %"PRId16
"\n", num_queues
);
1849 DPRINT("set_inflight_fd queue_size: %"PRId16
"\n", queue_size
);
1851 mmap_size
= vu_inflight_queue_size(queue_size
) * num_queues
;
1853 #ifdef MFD_ALLOW_SEALING
1854 addr
= memfd_alloc("vhost-inflight", mmap_size
,
1855 F_SEAL_GROW
| F_SEAL_SHRINK
| F_SEAL_SEAL
,
1858 vu_panic(dev
, "Not implemented: memfd support is missing");
1862 vu_panic(dev
, "Failed to alloc vhost inflight area");
1863 vmsg
->payload
.inflight
.mmap_size
= 0;
1867 memset(addr
, 0, mmap_size
);
1869 dev
->inflight_info
.addr
= addr
;
1870 dev
->inflight_info
.size
= vmsg
->payload
.inflight
.mmap_size
= mmap_size
;
1871 dev
->inflight_info
.fd
= vmsg
->fds
[0] = fd
;
1873 vmsg
->payload
.inflight
.mmap_offset
= 0;
1875 DPRINT("send inflight mmap_size: %"PRId64
"\n",
1876 vmsg
->payload
.inflight
.mmap_size
);
1877 DPRINT("send inflight mmap offset: %"PRId64
"\n",
1878 vmsg
->payload
.inflight
.mmap_offset
);
1884 vu_set_inflight_fd(VuDev
*dev
, VhostUserMsg
*vmsg
)
1887 uint64_t mmap_size
, mmap_offset
;
1888 uint16_t num_queues
, queue_size
;
1891 if (vmsg
->fd_num
!= 1 ||
1892 vmsg
->size
!= sizeof(vmsg
->payload
.inflight
)) {
1893 vu_panic(dev
, "Invalid set_inflight_fd message size:%d fds:%d",
1894 vmsg
->size
, vmsg
->fd_num
);
1899 mmap_size
= vmsg
->payload
.inflight
.mmap_size
;
1900 mmap_offset
= vmsg
->payload
.inflight
.mmap_offset
;
1901 num_queues
= vmsg
->payload
.inflight
.num_queues
;
1902 queue_size
= vmsg
->payload
.inflight
.queue_size
;
1904 DPRINT("set_inflight_fd mmap_size: %"PRId64
"\n", mmap_size
);
1905 DPRINT("set_inflight_fd mmap_offset: %"PRId64
"\n", mmap_offset
);
1906 DPRINT("set_inflight_fd num_queues: %"PRId16
"\n", num_queues
);
1907 DPRINT("set_inflight_fd queue_size: %"PRId16
"\n", queue_size
);
1909 rc
= mmap(0, mmap_size
, PROT_READ
| PROT_WRITE
, MAP_SHARED
,
1912 if (rc
== MAP_FAILED
) {
1913 vu_panic(dev
, "set_inflight_fd mmap error: %s", strerror(errno
));
1917 if (dev
->inflight_info
.fd
) {
1918 close(dev
->inflight_info
.fd
);
1921 if (dev
->inflight_info
.addr
) {
1922 munmap(dev
->inflight_info
.addr
, dev
->inflight_info
.size
);
1925 dev
->inflight_info
.fd
= fd
;
1926 dev
->inflight_info
.addr
= rc
;
1927 dev
->inflight_info
.size
= mmap_size
;
1929 for (i
= 0; i
< num_queues
; i
++) {
1930 dev
->vq
[i
].inflight
= (VuVirtqInflight
*)rc
;
1931 dev
->vq
[i
].inflight
->desc_num
= queue_size
;
1932 rc
= (void *)((char *)rc
+ vu_inflight_queue_size(queue_size
));
1939 vu_handle_vring_kick(VuDev
*dev
, VhostUserMsg
*vmsg
)
1941 unsigned int index
= vmsg
->payload
.state
.index
;
1943 if (index
>= dev
->max_queues
) {
1944 vu_panic(dev
, "Invalid queue index: %u", index
);
1948 DPRINT("Got kick message: handler:%p idx:%u\n",
1949 dev
->vq
[index
].handler
, index
);
1951 if (!dev
->vq
[index
].started
) {
1952 dev
->vq
[index
].started
= true;
1954 if (dev
->iface
->queue_set_started
) {
1955 dev
->iface
->queue_set_started(dev
, index
, true);
1959 if (dev
->vq
[index
].handler
) {
1960 dev
->vq
[index
].handler(dev
, index
);
1966 static bool vu_handle_get_max_memslots(VuDev
*dev
, VhostUserMsg
*vmsg
)
1968 vmsg_set_reply_u64(vmsg
, VHOST_USER_MAX_RAM_SLOTS
);
1970 DPRINT("u64: 0x%016"PRIx64
"\n", (uint64_t) VHOST_USER_MAX_RAM_SLOTS
);
1976 vu_process_message(VuDev
*dev
, VhostUserMsg
*vmsg
)
1980 /* Print out generic part of the request. */
1981 DPRINT("================ Vhost user message ================\n");
1982 DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg
->request
),
1984 DPRINT("Flags: 0x%x\n", vmsg
->flags
);
1985 DPRINT("Size: %u\n", vmsg
->size
);
1990 for (i
= 0; i
< vmsg
->fd_num
; i
++) {
1991 DPRINT(" %d", vmsg
->fds
[i
]);
1996 if (dev
->iface
->process_msg
&&
1997 dev
->iface
->process_msg(dev
, vmsg
, &do_reply
)) {
2001 switch (vmsg
->request
) {
2002 case VHOST_USER_GET_FEATURES
:
2003 return vu_get_features_exec(dev
, vmsg
);
2004 case VHOST_USER_SET_FEATURES
:
2005 return vu_set_features_exec(dev
, vmsg
);
2006 case VHOST_USER_GET_PROTOCOL_FEATURES
:
2007 return vu_get_protocol_features_exec(dev
, vmsg
);
2008 case VHOST_USER_SET_PROTOCOL_FEATURES
:
2009 return vu_set_protocol_features_exec(dev
, vmsg
);
2010 case VHOST_USER_SET_OWNER
:
2011 return vu_set_owner_exec(dev
, vmsg
);
2012 case VHOST_USER_RESET_OWNER
:
2013 return vu_reset_device_exec(dev
, vmsg
);
2014 case VHOST_USER_SET_MEM_TABLE
:
2015 return vu_set_mem_table_exec(dev
, vmsg
);
2016 case VHOST_USER_SET_LOG_BASE
:
2017 return vu_set_log_base_exec(dev
, vmsg
);
2018 case VHOST_USER_SET_LOG_FD
:
2019 return vu_set_log_fd_exec(dev
, vmsg
);
2020 case VHOST_USER_SET_VRING_NUM
:
2021 return vu_set_vring_num_exec(dev
, vmsg
);
2022 case VHOST_USER_SET_VRING_ADDR
:
2023 return vu_set_vring_addr_exec(dev
, vmsg
);
2024 case VHOST_USER_SET_VRING_BASE
:
2025 return vu_set_vring_base_exec(dev
, vmsg
);
2026 case VHOST_USER_GET_VRING_BASE
:
2027 return vu_get_vring_base_exec(dev
, vmsg
);
2028 case VHOST_USER_SET_VRING_KICK
:
2029 return vu_set_vring_kick_exec(dev
, vmsg
);
2030 case VHOST_USER_SET_VRING_CALL
:
2031 return vu_set_vring_call_exec(dev
, vmsg
);
2032 case VHOST_USER_SET_VRING_ERR
:
2033 return vu_set_vring_err_exec(dev
, vmsg
);
2034 case VHOST_USER_GET_QUEUE_NUM
:
2035 return vu_get_queue_num_exec(dev
, vmsg
);
2036 case VHOST_USER_SET_VRING_ENABLE
:
2037 return vu_set_vring_enable_exec(dev
, vmsg
);
2038 case VHOST_USER_SET_BACKEND_REQ_FD
:
2039 return vu_set_backend_req_fd(dev
, vmsg
);
2040 case VHOST_USER_GET_CONFIG
:
2041 return vu_get_config(dev
, vmsg
);
2042 case VHOST_USER_SET_CONFIG
:
2043 return vu_set_config(dev
, vmsg
);
2044 case VHOST_USER_NONE
:
2045 /* if you need processing before exit, override iface->process_msg */
2047 case VHOST_USER_POSTCOPY_ADVISE
:
2048 return vu_set_postcopy_advise(dev
, vmsg
);
2049 case VHOST_USER_POSTCOPY_LISTEN
:
2050 return vu_set_postcopy_listen(dev
, vmsg
);
2051 case VHOST_USER_POSTCOPY_END
:
2052 return vu_set_postcopy_end(dev
, vmsg
);
2053 case VHOST_USER_GET_INFLIGHT_FD
:
2054 return vu_get_inflight_fd(dev
, vmsg
);
2055 case VHOST_USER_SET_INFLIGHT_FD
:
2056 return vu_set_inflight_fd(dev
, vmsg
);
2057 case VHOST_USER_VRING_KICK
:
2058 return vu_handle_vring_kick(dev
, vmsg
);
2059 case VHOST_USER_GET_MAX_MEM_SLOTS
:
2060 return vu_handle_get_max_memslots(dev
, vmsg
);
2061 case VHOST_USER_ADD_MEM_REG
:
2062 return vu_add_mem_reg(dev
, vmsg
);
2063 case VHOST_USER_REM_MEM_REG
:
2064 return vu_rem_mem_reg(dev
, vmsg
);
2065 case VHOST_USER_GET_SHARED_OBJECT
:
2066 return vu_get_shared_object(dev
, vmsg
);
2068 vmsg_close_fds(vmsg
);
2069 vu_panic(dev
, "Unhandled request: %d", vmsg
->request
);
2076 vu_dispatch(VuDev
*dev
)
2078 VhostUserMsg vmsg
= { 0, };
2079 int reply_requested
;
2080 bool need_reply
, success
= false;
2082 if (!dev
->read_msg(dev
, dev
->sock
, &vmsg
)) {
2086 need_reply
= vmsg
.flags
& VHOST_USER_NEED_REPLY_MASK
;
2088 reply_requested
= vu_process_message(dev
, &vmsg
);
2089 if (!reply_requested
&& need_reply
) {
2090 vmsg_set_reply_u64(&vmsg
, 0);
2091 reply_requested
= 1;
2094 if (!reply_requested
) {
2099 if (!vu_send_reply(dev
, dev
->sock
, &vmsg
)) {
2111 vu_deinit(VuDev
*dev
)
2115 for (i
= 0; i
< dev
->nregions
; i
++) {
2116 VuDevRegion
*r
= &dev
->regions
[i
];
2117 void *m
= (void *) (uintptr_t) r
->mmap_addr
;
2118 if (m
!= MAP_FAILED
) {
2119 munmap(m
, r
->size
+ r
->mmap_offset
);
2124 for (i
= 0; i
< dev
->max_queues
; i
++) {
2125 VuVirtq
*vq
= &dev
->vq
[i
];
2127 if (vq
->call_fd
!= -1) {
2132 if (vq
->kick_fd
!= -1) {
2133 dev
->remove_watch(dev
, vq
->kick_fd
);
2138 if (vq
->err_fd
!= -1) {
2143 if (vq
->resubmit_list
) {
2144 free(vq
->resubmit_list
);
2145 vq
->resubmit_list
= NULL
;
2148 vq
->inflight
= NULL
;
2151 if (dev
->inflight_info
.addr
) {
2152 munmap(dev
->inflight_info
.addr
, dev
->inflight_info
.size
);
2153 dev
->inflight_info
.addr
= NULL
;
2156 if (dev
->inflight_info
.fd
> 0) {
2157 close(dev
->inflight_info
.fd
);
2158 dev
->inflight_info
.fd
= -1;
2162 if (dev
->backend_fd
!= -1) {
2163 close(dev
->backend_fd
);
2164 dev
->backend_fd
= -1;
2166 pthread_mutex_destroy(&dev
->backend_mutex
);
2168 if (dev
->sock
!= -1) {
2178 uint16_t max_queues
,
2181 vu_read_msg_cb read_msg
,
2182 vu_set_watch_cb set_watch
,
2183 vu_remove_watch_cb remove_watch
,
2184 const VuDevIface
*iface
)
2188 assert(max_queues
> 0);
2189 assert(socket
>= 0);
2191 assert(remove_watch
);
2195 memset(dev
, 0, sizeof(*dev
));
2199 dev
->read_msg
= read_msg
? read_msg
: vu_message_read_default
;
2200 dev
->set_watch
= set_watch
;
2201 dev
->remove_watch
= remove_watch
;
2203 dev
->log_call_fd
= -1;
2204 pthread_mutex_init(&dev
->backend_mutex
, NULL
);
2205 dev
->backend_fd
= -1;
2206 dev
->max_queues
= max_queues
;
2208 dev
->vq
= malloc(max_queues
* sizeof(dev
->vq
[0]));
2210 DPRINT("%s: failed to malloc virtqueues\n", __func__
);
2214 for (i
= 0; i
< max_queues
; i
++) {
2215 dev
->vq
[i
] = (VuVirtq
) {
2216 .call_fd
= -1, .kick_fd
= -1, .err_fd
= -1,
2217 .notification
= true,
2225 vu_get_queue(VuDev
*dev
, int qidx
)
2227 assert(qidx
< dev
->max_queues
);
2228 return &dev
->vq
[qidx
];
2232 vu_queue_enabled(VuDev
*dev
, VuVirtq
*vq
)
2238 vu_queue_started(const VuDev
*dev
, const VuVirtq
*vq
)
2243 static inline uint16_t
2244 vring_avail_flags(VuVirtq
*vq
)
2246 return le16toh(vq
->vring
.avail
->flags
);
2249 static inline uint16_t
2250 vring_avail_idx(VuVirtq
*vq
)
2252 vq
->shadow_avail_idx
= le16toh(vq
->vring
.avail
->idx
);
2254 return vq
->shadow_avail_idx
;
2257 static inline uint16_t
2258 vring_avail_ring(VuVirtq
*vq
, int i
)
2260 return le16toh(vq
->vring
.avail
->ring
[i
]);
2263 static inline uint16_t
2264 vring_get_used_event(VuVirtq
*vq
)
2266 return vring_avail_ring(vq
, vq
->vring
.num
);
2270 virtqueue_num_heads(VuDev
*dev
, VuVirtq
*vq
, unsigned int idx
)
2272 uint16_t num_heads
= vring_avail_idx(vq
) - idx
;
2274 /* Check it isn't doing very strange things with descriptor numbers. */
2275 if (num_heads
> vq
->vring
.num
) {
2276 vu_panic(dev
, "Guest moved used index from %u to %u",
2277 idx
, vq
->shadow_avail_idx
);
2281 /* On success, callers read a descriptor at vq->last_avail_idx.
2282 * Make sure descriptor read does not bypass avail index read. */
2290 virtqueue_get_head(VuDev
*dev
, VuVirtq
*vq
,
2291 unsigned int idx
, unsigned int *head
)
2293 /* Grab the next descriptor number they're advertising, and increment
2294 * the index we've seen. */
2295 *head
= vring_avail_ring(vq
, idx
% vq
->vring
.num
);
2297 /* If their number is silly, that's a fatal mistake. */
2298 if (*head
>= vq
->vring
.num
) {
2299 vu_panic(dev
, "Guest says index %u is available", *head
);
2307 virtqueue_read_indirect_desc(VuDev
*dev
, struct vring_desc
*desc
,
2308 uint64_t addr
, size_t len
)
2310 struct vring_desc
*ori_desc
;
2313 if (len
> (VIRTQUEUE_MAX_SIZE
* sizeof(struct vring_desc
))) {
2323 ori_desc
= vu_gpa_to_va(dev
, &read_len
, addr
);
2328 memcpy(desc
, ori_desc
, read_len
);
2338 VIRTQUEUE_READ_DESC_ERROR
= -1,
2339 VIRTQUEUE_READ_DESC_DONE
= 0, /* end of chain */
2340 VIRTQUEUE_READ_DESC_MORE
= 1, /* more buffers in chain */
2344 virtqueue_read_next_desc(VuDev
*dev
, struct vring_desc
*desc
,
2345 int i
, unsigned int max
, unsigned int *next
)
2347 /* If this descriptor says it doesn't chain, we're done. */
2348 if (!(le16toh(desc
[i
].flags
) & VRING_DESC_F_NEXT
)) {
2349 return VIRTQUEUE_READ_DESC_DONE
;
2352 /* Check they're not leading us off end of descriptors. */
2353 *next
= le16toh(desc
[i
].next
);
2354 /* Make sure compiler knows to grab that: we don't want it changing! */
2358 vu_panic(dev
, "Desc next is %u", *next
);
2359 return VIRTQUEUE_READ_DESC_ERROR
;
2362 return VIRTQUEUE_READ_DESC_MORE
;
2366 vu_queue_get_avail_bytes(VuDev
*dev
, VuVirtq
*vq
, unsigned int *in_bytes
,
2367 unsigned int *out_bytes
,
2368 unsigned max_in_bytes
, unsigned max_out_bytes
)
2371 unsigned int total_bufs
, in_total
, out_total
;
2374 idx
= vq
->last_avail_idx
;
2376 total_bufs
= in_total
= out_total
= 0;
2377 if (unlikely(dev
->broken
) ||
2378 unlikely(!vq
->vring
.avail
)) {
2382 while ((rc
= virtqueue_num_heads(dev
, vq
, idx
)) > 0) {
2383 unsigned int max
, desc_len
, num_bufs
, indirect
= 0;
2384 uint64_t desc_addr
, read_len
;
2385 struct vring_desc
*desc
;
2386 struct vring_desc desc_buf
[VIRTQUEUE_MAX_SIZE
];
2389 max
= vq
->vring
.num
;
2390 num_bufs
= total_bufs
;
2391 if (!virtqueue_get_head(dev
, vq
, idx
++, &i
)) {
2394 desc
= vq
->vring
.desc
;
2396 if (le16toh(desc
[i
].flags
) & VRING_DESC_F_INDIRECT
) {
2397 if (le32toh(desc
[i
].len
) % sizeof(struct vring_desc
)) {
2398 vu_panic(dev
, "Invalid size for indirect buffer table");
2402 /* If we've got too many, that implies a descriptor loop. */
2403 if (num_bufs
>= max
) {
2404 vu_panic(dev
, "Looped descriptor");
2408 /* loop over the indirect descriptor table */
2410 desc_addr
= le64toh(desc
[i
].addr
);
2411 desc_len
= le32toh(desc
[i
].len
);
2412 max
= desc_len
/ sizeof(struct vring_desc
);
2413 read_len
= desc_len
;
2414 desc
= vu_gpa_to_va(dev
, &read_len
, desc_addr
);
2415 if (unlikely(desc
&& read_len
!= desc_len
)) {
2416 /* Failed to use zero copy */
2418 if (!virtqueue_read_indirect_desc(dev
, desc_buf
,
2425 vu_panic(dev
, "Invalid indirect buffer table");
2432 /* If we've got too many, that implies a descriptor loop. */
2433 if (++num_bufs
> max
) {
2434 vu_panic(dev
, "Looped descriptor");
2438 if (le16toh(desc
[i
].flags
) & VRING_DESC_F_WRITE
) {
2439 in_total
+= le32toh(desc
[i
].len
);
2441 out_total
+= le32toh(desc
[i
].len
);
2443 if (in_total
>= max_in_bytes
&& out_total
>= max_out_bytes
) {
2446 rc
= virtqueue_read_next_desc(dev
, desc
, i
, max
, &i
);
2447 } while (rc
== VIRTQUEUE_READ_DESC_MORE
);
2449 if (rc
== VIRTQUEUE_READ_DESC_ERROR
) {
2454 total_bufs
= num_bufs
;
2464 *in_bytes
= in_total
;
2467 *out_bytes
= out_total
;
2472 in_total
= out_total
= 0;
2477 vu_queue_avail_bytes(VuDev
*dev
, VuVirtq
*vq
, unsigned int in_bytes
,
2478 unsigned int out_bytes
)
2480 unsigned int in_total
, out_total
;
2482 vu_queue_get_avail_bytes(dev
, vq
, &in_total
, &out_total
,
2483 in_bytes
, out_bytes
);
2485 return in_bytes
<= in_total
&& out_bytes
<= out_total
;
/* Fetch avail_idx from VQ memory only when we really need to know if
 * guest has added some buffers. */
bool
vu_queue_empty(VuDev *dev, VuVirtq *vq)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return true;
    }

    if (vq->shadow_avail_idx != vq->last_avail_idx) {
        return false;
    }

    return vring_avail_idx(vq) == vq->last_avail_idx;
}

static bool
vring_notify(VuDev *dev, VuVirtq *vq)
{
    uint16_t old, new;
    bool v;

    /* We need to expose used array entries before checking used event. */
    smp_mb();

    /* Always notify when queue is empty (when feature acknowledge) */
    if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
        !vq->inuse && vu_queue_empty(dev, vq)) {
        return true;
    }

    if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
    }

    v = vq->signalled_used_valid;
    vq->signalled_used_valid = true;
    old = vq->signalled_used;
    new = vq->signalled_used = vq->used_idx;
    return !v || vring_need_event(vring_get_used_event(vq), new, old);
}

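/*
 * Signal the front-end for this ring: normally by writing to call_fd,
 * but when no call_fd is set and both INBAND_NOTIFICATIONS and
 * BACKEND_REQ were negotiated, by sending VHOST_USER_BACKEND_VRING_CALL
 * over the backend channel (waiting for a reply when sync is requested
 * and REPLY_ACK is in effect).
 */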
static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    if (!vring_notify(dev, vq)) {
        DPRINT("skipped notify...\n");
        return;
    }

    if (vq->call_fd < 0 &&
        vu_has_protocol_feature(dev,
                                VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) &&
        vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_BACKEND_REQ)) {
        VhostUserMsg vmsg = {
            .request = VHOST_USER_BACKEND_VRING_CALL,
            .flags = VHOST_USER_VERSION,
            .size = sizeof(vmsg.payload.state),
            .payload.state = {
                .index = vq - dev->vq,
            },
        };
        bool ack = sync &&
                   vu_has_protocol_feature(dev,
                                           VHOST_USER_PROTOCOL_F_REPLY_ACK);

        if (ack) {
            vmsg.flags |= VHOST_USER_NEED_REPLY_MASK;
        }

        vu_message_write(dev, dev->backend_fd, &vmsg);
        if (ack) {
            vu_message_read_default(dev, dev->backend_fd, &vmsg);
        }
        return;
    }

    if (eventfd_write(vq->call_fd, 1) < 0) {
        vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
    }
}

void vu_queue_notify(VuDev *dev, VuVirtq *vq)
{
    _vu_queue_notify(dev, vq, false);
}

void vu_queue_notify_sync(VuDev *dev, VuVirtq *vq)
{
    _vu_queue_notify(dev, vq, true);
}

void vu_config_change_msg(VuDev *dev)
{
    VhostUserMsg vmsg = {
        .request = VHOST_USER_BACKEND_CONFIG_CHANGE_MSG,
        .flags = VHOST_USER_VERSION,
    };

    vu_message_write(dev, dev->backend_fd, &vmsg);
}

static inline void
vring_used_flags_set_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char*)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags = htole16(le16toh(*flags) | mask);
}

static inline void
vring_used_flags_unset_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char*)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags = htole16(le16toh(*flags) & ~mask);
}

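/*
 * With VIRTIO_RING_F_EVENT_IDX the avail-event field sits right after
 * the last used ring entry; it is only written while notifications are
 * enabled for this queue.
 */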
static inline void
vring_set_avail_event(VuVirtq *vq, uint16_t val)
{
    uint16_t val_le = htole16(val);

    if (!vq->notification) {
        return;
    }

    memcpy(&vq->vring.used->ring[vq->vring.num], &val_le, sizeof(uint16_t));
}

void
vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable)
{
    vq->notification = enable;
    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vring_avail_idx(vq));
    } else if (enable) {
        vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
    } else {
        vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
    }
    if (enable) {
        /* Expose avail event/used flags before caller checks the avail idx. */
        smp_mb();
    }
}

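/*
 * Translate one guest-physical buffer into iovec entries starting at
 * iov[*p_num_sg]; a buffer that spans memory regions is split across
 * several entries because vu_gpa_to_va() may shorten the mapped length.
 */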
static bool
virtqueue_map_desc(VuDev *dev,
                   unsigned int *p_num_sg, struct iovec *iov,
                   unsigned int max_num_sg, bool is_write,
                   uint64_t pa, size_t sz)
{
    unsigned num_sg = *p_num_sg;

    assert(num_sg <= max_num_sg);

    if (!sz) {
        vu_panic(dev, "virtio: zero sized buffers are not allowed");
        return false;
    }

    while (sz) {
        uint64_t len = sz;

        if (num_sg == max_num_sg) {
            vu_panic(dev, "virtio: too many descriptors in indirect table");
            return false;
        }

        iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
        if (iov[num_sg].iov_base == NULL) {
            vu_panic(dev, "virtio: invalid address for buffers");
            return false;
        }
        iov[num_sg].iov_len = len;
        num_sg++;
        sz -= len;
        pa += len;
    }

    *p_num_sg = num_sg;
    return true;
}

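/*
 * The element and its scatter lists come from a single malloc: the
 * caller's structure of size sz (at least a VuVirtqElement) is followed
 * by in_num in_sg entries and out_num out_sg entries.
 */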
static void *
virtqueue_alloc_element(size_t sz,
                        unsigned out_num, unsigned in_num)
{
    VuVirtqElement *elem;
    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);

    assert(sz >= sizeof(VuVirtqElement));
    elem = malloc(out_sg_end);
    if (!elem) {
        DPRINT("%s: failed to malloc virtqueue element\n", __func__);
        return NULL;
    }
    elem->out_num = out_num;
    elem->in_num = in_num;
    elem->in_sg = (void *)elem + in_sg_ofs;
    elem->out_sg = (void *)elem + out_sg_ofs;
    return elem;
}

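/*
 * Map the descriptor chain starting at idx into a freshly allocated
 * element: device-readable buffers land in out_sg, device-writable ones
 * in in_sg. Returns NULL (after vu_panic where appropriate) on a
 * malformed chain.
 */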
static void *
vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
{
    struct vring_desc *desc = vq->vring.desc;
    uint64_t desc_addr, read_len;
    unsigned int desc_len;
    unsigned int max = vq->vring.num;
    unsigned int i = idx;
    VuVirtqElement *elem;
    unsigned int out_num = 0, in_num = 0;
    struct iovec iov[VIRTQUEUE_MAX_SIZE];
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    int rc;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
            return NULL;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return NULL;
        }
        i = 0;
    }

    /* Collect all the descriptors */
    do {
        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            if (!virtqueue_map_desc(dev, &in_num, iov + out_num,
                                    VIRTQUEUE_MAX_SIZE - out_num, true,
                                    le64toh(desc[i].addr),
                                    le32toh(desc[i].len))) {
                return NULL;
            }
        } else {
            if (in_num) {
                vu_panic(dev, "Incorrect order for descriptors");
                return NULL;
            }
            if (!virtqueue_map_desc(dev, &out_num, iov,
                                    VIRTQUEUE_MAX_SIZE, false,
                                    le64toh(desc[i].addr),
                                    le32toh(desc[i].len))) {
                return NULL;
            }
        }

        /* If we've got too many, that implies a descriptor loop. */
        if ((in_num + out_num) > max) {
            vu_panic(dev, "Looped descriptor");
            return NULL;
        }
        rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
    } while (rc == VIRTQUEUE_READ_DESC_MORE);

    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
        vu_panic(dev, "read descriptor error");
        return NULL;
    }

    /* Now copy what we have collected and mapped */
    elem = virtqueue_alloc_element(sz, out_num, in_num);
    if (!elem) {
        return NULL;
    }
    elem->index = idx;
    for (i = 0; i < out_num; i++) {
        elem->out_sg[i] = iov[i];
    }
    for (i = 0; i < in_num; i++) {
        elem->in_sg[i] = iov[out_num + i];
    }

    return elem;
}

static int
vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    vq->inflight->desc[desc_idx].counter = vq->counter++;
    vq->inflight->desc[desc_idx].inflight = 1;

    return 0;
}

static int
vu_queue_inflight_pre_put(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    vq->inflight->last_batch_head = desc_idx;

    return 0;
}

static int
vu_queue_inflight_post_put(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    barrier();

    vq->inflight->desc[desc_idx].inflight = 0;

    barrier();

    vq->inflight->used_idx = vq->used_idx;

    return 0;
}

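/*
 * A minimal processing-loop sketch (illustrative only, not part of the
 * library; "process_request" and "written_len" are placeholders and
 * error handling is omitted):
 *
 *     VuVirtqElement *elem;
 *     while ((elem = vu_queue_pop(dev, vq, sizeof(*elem)))) {
 *         process_request(elem->out_sg, elem->out_num,
 *                         elem->in_sg, elem->in_num);
 *         vu_queue_push(dev, vq, elem, written_len);
 *         free(elem);
 *     }
 *     vu_queue_notify(dev, vq);
 */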
void *
vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
{
    int i;
    unsigned int head;
    VuVirtqElement *elem;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return NULL;
    }

    if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
        i = (--vq->resubmit_num);
        elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz);

        if (!vq->resubmit_num) {
            free(vq->resubmit_list);
            vq->resubmit_list = NULL;
        }

        return elem;
    }

    if (vu_queue_empty(dev, vq)) {
        return NULL;
    }
    /*
     * Needed after virtio_queue_empty(), see comment in
     * virtqueue_num_heads().
     */
    smp_rmb();

    if (vq->inuse >= vq->vring.num) {
        vu_panic(dev, "Virtqueue size exceeded");
        return NULL;
    }

    if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
        return NULL;
    }

    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vq->last_avail_idx);
    }

    elem = vu_queue_map_desc(dev, vq, head, sz);

    if (!elem) {
        return NULL;
    }

    vq->inuse++;

    vu_queue_inflight_get(dev, vq, head);

    return elem;
}

static void
vu_queue_detach_element(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
                        size_t len)
{
    vq->inuse--;
    /* unmap, when DMA support is added */
}

void
vu_queue_unpop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
               size_t len)
{
    vq->last_avail_idx--;
    vu_queue_detach_element(dev, vq, elem, len);
}

bool
vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
{
    if (num > vq->inuse) {
        return false;
    }
    vq->last_avail_idx -= num;
    vq->inuse -= num;
    return true;
}

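/*
 * The used-ring writers below mirror their updates into the dirty log
 * via vu_log_write() so that a migrating front-end sees them.
 */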
static inline
void vring_used_write(VuDev *dev, VuVirtq *vq,
                      struct vring_used_elem *uelem, int i)
{
    struct vring_used *used = vq->vring.used;

    used->ring[i] = *uelem;
    vu_log_write(dev, vq->vring.log_guest_addr +
                 offsetof(struct vring_used, ring[i]),
                 sizeof(used->ring[i]));
}

static void
vu_log_queue_fill(VuDev *dev, VuVirtq *vq,
                  const VuVirtqElement *elem,
                  unsigned int len)
{
    struct vring_desc *desc = vq->vring.desc;
    unsigned int i, max, min, desc_len;
    uint64_t desc_addr, read_len;
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    unsigned num_bufs = 0;

    max = vq->vring.num;
    i = elem->index;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
            return;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return;
        }
        i = 0;
    }

    do {
        if (++num_bufs > max) {
            vu_panic(dev, "Looped descriptor");
            return;
        }

        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            min = MIN(le32toh(desc[i].len), len);
            vu_log_write(dev, le64toh(desc[i].addr), min);
            len -= min;
        }

    } while (len > 0 &&
             (virtqueue_read_next_desc(dev, desc, i, max, &i)
              == VIRTQUEUE_READ_DESC_MORE));
}

void
vu_queue_fill(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem,
              unsigned int len, unsigned int idx)
{
    struct vring_used_elem uelem;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    vu_log_queue_fill(dev, vq, elem, len);

    idx = (idx + vq->used_idx) % vq->vring.num;

    uelem.id = htole32(elem->index);
    uelem.len = htole32(len);
    vring_used_write(dev, vq, &uelem, idx);
}

static inline
void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val)
{
    vq->vring.used->idx = htole16(val);
    vu_log_write(dev,
                 vq->vring.log_guest_addr + offsetof(struct vring_used, idx),
                 sizeof(vq->vring.used->idx));

    vq->used_idx = val;
}

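/*
 * Publish count new used entries: the smp_wmb() orders the used-ring
 * writes done by vu_queue_fill() before the used idx update that makes
 * them visible to the driver.
 */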
void
vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
{
    uint16_t old, new;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    /* Make sure buffer is written before we update index. */
    smp_wmb();

    old = vq->used_idx;
    new = old + count;
    vring_used_idx_set(dev, vq, new);
    vq->inuse -= count;
    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
        vq->signalled_used_valid = false;
    }
}

void
vu_queue_push(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem, unsigned int len)
{
    vu_queue_fill(dev, vq, elem, len, 0);
    vu_queue_inflight_pre_put(dev, vq, elem->index);
    vu_queue_flush(dev, vq, 1);
    vu_queue_inflight_post_put(dev, vq, elem->index);
}