2 * virtio-fs glue for FUSE
3 * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates
6 * Dave Gilbert <dgilbert@redhat.com>
8 * Implements the glue between libfuse and libvhost-user
10 * This program can be distributed under the terms of the GNU LGPLv2.
11 * See the file COPYING.LIB
14 #include "qemu/osdep.h"
16 #include "fuse_virtio.h"
18 #include "standard-headers/linux/fuse.h"
19 #include "fuse_misc.h"
28 #include <sys/eventfd.h>
29 #include <sys/socket.h>
30 #include <sys/types.h>
34 #include "contrib/libvhost-user/libvhost-user.h"
39 struct fv_VuDev
*virtio_dev
;
41 /* Our queue index, corresponds to array position */
45 /* The element for the command currently being processed */
51 * We pass the dev element into libvhost-user
52 * and then use it to get back to the outer
53 * container for other data.
57 struct fuse_session
*se
;
60 * The following pair of fields are only accessed in the main
64 struct fv_QueueInfo
**qi
;
68 struct virtio_fs_config
{
73 /* Callback from libvhost-user */
74 static uint64_t fv_get_features(VuDev
*dev
)
76 return 1ULL << VIRTIO_F_VERSION_1
;
79 /* Callback from libvhost-user */
80 static void fv_set_features(VuDev
*dev
, uint64_t features
)
85 * Callback from libvhost-user if there's a new fd we're supposed to listen
86 * to, typically a queue kick?
88 static void fv_set_watch(VuDev
*dev
, int fd
, int condition
, vu_watch_cb cb
,
91 fuse_log(FUSE_LOG_WARNING
, "%s: TODO! fd=%d\n", __func__
, fd
);
95 * Callback from libvhost-user if we're no longer supposed to listen on an fd
97 static void fv_remove_watch(VuDev
*dev
, int fd
)
99 fuse_log(FUSE_LOG_WARNING
, "%s: TODO! fd=%d\n", __func__
, fd
);
102 /* Callback from libvhost-user to panic */
103 static void fv_panic(VuDev
*dev
, const char *err
)
105 fuse_log(FUSE_LOG_ERR
, "%s: libvhost-user: %s\n", __func__
, err
);
106 /* TODO: Allow reconnects?? */
111 * Copy from an iovec into a fuse_buf (memory only)
112 * Caller must ensure there is space
114 static void copy_from_iov(struct fuse_buf
*buf
, size_t out_num
,
115 const struct iovec
*out_sg
)
117 void *dest
= buf
->mem
;
120 size_t onelen
= out_sg
->iov_len
;
121 memcpy(dest
, out_sg
->iov_base
, onelen
);
129 * Copy from one iov to another, the given number of bytes
130 * The caller must have checked sizes.
132 static void copy_iov(struct iovec
*src_iov
, int src_count
,
133 struct iovec
*dst_iov
, int dst_count
, size_t to_copy
)
135 size_t dst_offset
= 0;
136 /* Outer loop copies 'src' elements */
139 size_t src_len
= src_iov
[0].iov_len
;
140 size_t src_offset
= 0;
142 if (src_len
> to_copy
) {
145 /* Inner loop copies contents of one 'src' to maybe multiple dst. */
148 size_t dst_len
= dst_iov
[0].iov_len
- dst_offset
;
149 if (dst_len
> src_len
) {
153 memcpy(dst_iov
[0].iov_base
+ dst_offset
,
154 src_iov
[0].iov_base
+ src_offset
, dst_len
);
157 src_offset
+= dst_len
;
158 dst_offset
+= dst_len
;
160 assert(dst_offset
<= dst_iov
[0].iov_len
);
161 if (dst_offset
== dst_iov
[0].iov_len
) {
173 * Called back by ll whenever it wants to send a reply/message back
174 * The 1st element of the iov starts with the fuse_out_header
175 * 'unique'==0 means it's a notify message.
177 int virtio_send_msg(struct fuse_session
*se
, struct fuse_chan
*ch
,
178 struct iovec
*iov
, int count
)
180 VuVirtqElement
*elem
;
185 assert(iov
[0].iov_len
>= sizeof(struct fuse_out_header
));
187 struct fuse_out_header
*out
= iov
[0].iov_base
;
188 /* TODO: Endianness! */
190 size_t tosend_len
= iov_size(iov
, count
);
192 /* unique == 0 is notification, which we don't support */
194 /* For virtio we always have ch */
196 assert(!ch
->qi
->reply_sent
);
198 q
= &ch
->qi
->virtio_dev
->dev
.vq
[ch
->qi
->qidx
];
200 /* The 'in' part of the elem is to qemu */
201 unsigned int in_num
= elem
->in_num
;
202 struct iovec
*in_sg
= elem
->in_sg
;
203 size_t in_len
= iov_size(in_sg
, in_num
);
204 fuse_log(FUSE_LOG_DEBUG
, "%s: elem %d: with %d in desc of length %zd\n",
205 __func__
, elem
->index
, in_num
, in_len
);
208 * The elem should have room for a 'fuse_out_header' (out from fuse)
209 * plus the data based on the len in the header.
211 if (in_len
< sizeof(struct fuse_out_header
)) {
212 fuse_log(FUSE_LOG_ERR
, "%s: elem %d too short for out_header\n",
213 __func__
, elem
->index
);
217 if (in_len
< tosend_len
) {
218 fuse_log(FUSE_LOG_ERR
, "%s: elem %d too small for data len %zd\n",
219 __func__
, elem
->index
, tosend_len
);
224 copy_iov(iov
, count
, in_sg
, in_num
, tosend_len
);
225 vu_queue_push(&se
->virtio_dev
->dev
, q
, elem
, tosend_len
);
226 vu_queue_notify(&se
->virtio_dev
->dev
, q
);
227 ch
->qi
->reply_sent
= true;
234 * Callback from fuse_send_data_iov_* when it's virtio and the buffer
235 * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK
236 * We need to send the iov and then the buffer.
237 * Return 0 on success
239 int virtio_send_data_iov(struct fuse_session
*se
, struct fuse_chan
*ch
,
240 struct iovec
*iov
, int count
, struct fuse_bufvec
*buf
,
244 VuVirtqElement
*elem
;
248 assert(iov
[0].iov_len
>= sizeof(struct fuse_out_header
));
250 struct fuse_out_header
*out
= iov
[0].iov_base
;
251 /* TODO: Endianness! */
253 size_t iov_len
= iov_size(iov
, count
);
254 size_t tosend_len
= iov_len
+ len
;
256 out
->len
= tosend_len
;
258 fuse_log(FUSE_LOG_DEBUG
, "%s: count=%d len=%zd iov_len=%zd\n", __func__
,
259 count
, len
, iov_len
);
261 /* unique == 0 is notification which we don't support */
264 /* For virtio we always have ch */
266 assert(!ch
->qi
->reply_sent
);
268 q
= &ch
->qi
->virtio_dev
->dev
.vq
[ch
->qi
->qidx
];
270 /* The 'in' part of the elem is to qemu */
271 unsigned int in_num
= elem
->in_num
;
272 struct iovec
*in_sg
= elem
->in_sg
;
273 size_t in_len
= iov_size(in_sg
, in_num
);
274 fuse_log(FUSE_LOG_DEBUG
, "%s: elem %d: with %d in desc of length %zd\n",
275 __func__
, elem
->index
, in_num
, in_len
);
278 * The elem should have room for a 'fuse_out_header' (out from fuse)
279 * plus the data based on the len in the header.
281 if (in_len
< sizeof(struct fuse_out_header
)) {
282 fuse_log(FUSE_LOG_ERR
, "%s: elem %d too short for out_header\n",
283 __func__
, elem
->index
);
287 if (in_len
< tosend_len
) {
288 fuse_log(FUSE_LOG_ERR
, "%s: elem %d too small for data len %zd\n",
289 __func__
, elem
->index
, tosend_len
);
294 /* TODO: Limit to 'len' */
296 /* First copy the header data from iov->in_sg */
297 copy_iov(iov
, count
, in_sg
, in_num
, iov_len
);
300 * Build a copy of the in_sg iov so we can skip bits in it,
301 * including changing the offsets
303 struct iovec
*in_sg_cpy
= calloc(sizeof(struct iovec
), in_num
);
305 memcpy(in_sg_cpy
, in_sg
, sizeof(struct iovec
) * in_num
);
306 /* These get updated as we skip */
307 struct iovec
*in_sg_ptr
= in_sg_cpy
;
308 int in_sg_cpy_count
= in_num
;
310 /* skip over parts of in_sg that contained the header iov */
311 size_t skip_size
= iov_len
;
313 size_t in_sg_left
= 0;
315 while (skip_size
!= 0 && in_sg_cpy_count
) {
316 if (skip_size
>= in_sg_ptr
[0].iov_len
) {
317 skip_size
-= in_sg_ptr
[0].iov_len
;
321 in_sg_ptr
[0].iov_len
-= skip_size
;
322 in_sg_ptr
[0].iov_base
+= skip_size
;
328 for (i
= 0, in_sg_left
= 0; i
< in_sg_cpy_count
; i
++) {
329 in_sg_left
+= in_sg_ptr
[i
].iov_len
;
331 fuse_log(FUSE_LOG_DEBUG
,
332 "%s: after skip skip_size=%zd in_sg_cpy_count=%d "
334 __func__
, skip_size
, in_sg_cpy_count
, in_sg_left
);
335 ret
= preadv(buf
->buf
[0].fd
, in_sg_ptr
, in_sg_cpy_count
,
340 fuse_log(FUSE_LOG_DEBUG
, "%s: preadv failed (%m) len=%zd\n",
345 fuse_log(FUSE_LOG_DEBUG
, "%s: preadv ret=%d len=%zd\n", __func__
,
347 if (ret
< len
&& ret
) {
348 fuse_log(FUSE_LOG_DEBUG
, "%s: ret < len\n", __func__
);
349 /* Skip over this much next time around */
351 buf
->buf
[0].pos
+= ret
;
354 /* Let's do another read */
359 fuse_log(FUSE_LOG_DEBUG
, "%s: !ret in_sg_left=%zd\n", __func__
,
364 fuse_log(FUSE_LOG_DEBUG
, "%s: ret!=len\n", __func__
);
371 } while (in_sg_left
);
374 /* Need to fix out->len on EOF */
376 struct fuse_out_header
*out_sg
= in_sg
[0].iov_base
;
379 out_sg
->len
= tosend_len
;
384 vu_queue_push(&se
->virtio_dev
->dev
, q
, elem
, tosend_len
);
385 vu_queue_notify(&se
->virtio_dev
->dev
, q
);
389 ch
->qi
->reply_sent
= true;
395 /* Thread function for individual queues, created when a queue is 'started' */
396 static void *fv_queue_thread(void *opaque
)
398 struct fv_QueueInfo
*qi
= opaque
;
399 struct VuDev
*dev
= &qi
->virtio_dev
->dev
;
400 struct VuVirtq
*q
= vu_get_queue(dev
, qi
->qidx
);
401 struct fuse_session
*se
= qi
->virtio_dev
->se
;
403 struct fuse_buf fbuf
;
408 fuse_mutex_init(&ch
.lock
);
409 ch
.fd
= (int)0xdaff0d111;
412 fuse_log(FUSE_LOG_INFO
, "%s: Start for queue %d kick_fd %d\n", __func__
,
413 qi
->qidx
, qi
->kick_fd
);
416 pf
[0].fd
= qi
->kick_fd
;
417 pf
[0].events
= POLLIN
;
420 fuse_log(FUSE_LOG_DEBUG
, "%s: Waiting for Queue %d event\n", __func__
,
422 int poll_res
= ppoll(pf
, 1, NULL
, NULL
);
424 if (poll_res
== -1) {
425 if (errno
== EINTR
) {
426 fuse_log(FUSE_LOG_INFO
, "%s: ppoll interrupted, going around\n",
430 fuse_log(FUSE_LOG_ERR
, "fv_queue_thread ppoll: %m\n");
433 assert(poll_res
== 1);
434 if (pf
[0].revents
& (POLLERR
| POLLHUP
| POLLNVAL
)) {
435 fuse_log(FUSE_LOG_ERR
, "%s: Unexpected poll revents %x Queue %d\n",
436 __func__
, pf
[0].revents
, qi
->qidx
);
439 assert(pf
[0].revents
& POLLIN
);
440 fuse_log(FUSE_LOG_DEBUG
, "%s: Got queue event on Queue %d\n", __func__
,
444 if (eventfd_read(qi
->kick_fd
, &evalue
)) {
445 fuse_log(FUSE_LOG_ERR
, "Eventfd_read for queue: %m\n");
448 /* out is from guest, in is to guest */
449 unsigned int in_bytes
, out_bytes
;
450 vu_queue_get_avail_bytes(dev
, q
, &in_bytes
, &out_bytes
, ~0, ~0);
452 fuse_log(FUSE_LOG_DEBUG
,
453 "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n",
454 __func__
, qi
->qidx
, (size_t)evalue
, in_bytes
, out_bytes
);
458 * An element contains one request and the space to send our
459 * response. They're spread over multiple descriptors in a
460 * scatter/gather set and we can't trust the guest to keep them
461 * still; so copy in/out.
463 VuVirtqElement
*elem
= vu_queue_pop(dev
, q
, sizeof(VuVirtqElement
));
469 qi
->reply_sent
= false;
472 fbuf
.mem
= malloc(se
->bufsize
);
474 assert(se
->bufsize
> sizeof(struct fuse_in_header
));
476 /* The 'out' part of the elem is from qemu */
477 unsigned int out_num
= elem
->out_num
;
478 struct iovec
*out_sg
= elem
->out_sg
;
479 size_t out_len
= iov_size(out_sg
, out_num
);
480 fuse_log(FUSE_LOG_DEBUG
,
481 "%s: elem %d: with %d out desc of length %zd\n", __func__
,
482 elem
->index
, out_num
, out_len
);
485 * The elem should contain a 'fuse_in_header' (in to fuse)
486 * plus the data based on the len in the header.
488 if (out_len
< sizeof(struct fuse_in_header
)) {
489 fuse_log(FUSE_LOG_ERR
, "%s: elem %d too short for in_header\n",
490 __func__
, elem
->index
);
491 assert(0); /* TODO */
493 if (out_len
> se
->bufsize
) {
494 fuse_log(FUSE_LOG_ERR
, "%s: elem %d too large for buffer\n",
495 __func__
, elem
->index
);
496 assert(0); /* TODO */
498 copy_from_iov(&fbuf
, out_num
, out_sg
);
501 /* TODO! Endianness of header */
503 /* TODO: Add checks for fuse_session_exited */
504 fuse_session_process_buf_int(se
, &fbuf
, &ch
);
506 if (!qi
->reply_sent
) {
507 fuse_log(FUSE_LOG_DEBUG
, "%s: elem %d no reply sent\n",
508 __func__
, elem
->index
);
509 /* I think we've still got to recycle the element */
510 vu_queue_push(dev
, q
, elem
, 0);
511 vu_queue_notify(dev
, q
);
518 pthread_mutex_destroy(&ch
.lock
);
524 /* Callback from libvhost-user on start or stop of a queue */
525 static void fv_queue_set_started(VuDev
*dev
, int qidx
, bool started
)
527 struct fv_VuDev
*vud
= container_of(dev
, struct fv_VuDev
, dev
);
528 struct fv_QueueInfo
*ourqi
;
530 fuse_log(FUSE_LOG_INFO
, "%s: qidx=%d started=%d\n", __func__
, qidx
,
535 * Ignore additional request queues for now. passthrough_ll.c must be
536 * audited for thread-safety issues first. It was written with a
537 * well-behaved client in mind and may not protect against all types of
541 fuse_log(FUSE_LOG_ERR
,
542 "%s: multiple request queues not yet implemented, please only "
543 "configure 1 request queue\n",
549 /* Fire up a thread to watch this queue */
550 if (qidx
>= vud
->nqueues
) {
551 vud
->qi
= realloc(vud
->qi
, (qidx
+ 1) * sizeof(vud
->qi
[0]));
553 memset(vud
->qi
+ vud
->nqueues
, 0,
554 sizeof(vud
->qi
[0]) * (1 + (qidx
- vud
->nqueues
)));
555 vud
->nqueues
= qidx
+ 1;
557 if (!vud
->qi
[qidx
]) {
558 vud
->qi
[qidx
] = calloc(sizeof(struct fv_QueueInfo
), 1);
559 assert(vud
->qi
[qidx
]);
560 vud
->qi
[qidx
]->virtio_dev
= vud
;
561 vud
->qi
[qidx
]->qidx
= qidx
;
563 /* Shouldn't have been started */
564 assert(vud
->qi
[qidx
]->kick_fd
== -1);
566 ourqi
= vud
->qi
[qidx
];
567 ourqi
->kick_fd
= dev
->vq
[qidx
].kick_fd
;
568 if (pthread_create(&ourqi
->thread
, NULL
, fv_queue_thread
, ourqi
)) {
569 fuse_log(FUSE_LOG_ERR
, "%s: Failed to create thread for queue %d\n",
574 /* TODO: Kill the thread */
575 assert(qidx
< vud
->nqueues
);
576 ourqi
= vud
->qi
[qidx
];
581 static bool fv_queue_order(VuDev
*dev
, int qidx
)
586 static const VuDevIface fv_iface
= {
587 .get_features
= fv_get_features
,
588 .set_features
= fv_set_features
,
590 /* Don't need process message, we've not got any at vhost-user level */
591 .queue_set_started
= fv_queue_set_started
,
593 .queue_is_processed_in_order
= fv_queue_order
,
597 * Main loop; this mostly deals with events on the vhost-user
598 * socket itself, and not actual fuse data.
600 int virtio_loop(struct fuse_session
*se
)
602 fuse_log(FUSE_LOG_INFO
, "%s: Entry\n", __func__
);
604 while (!fuse_session_exited(se
)) {
606 pf
[0].fd
= se
->vu_socketfd
;
607 pf
[0].events
= POLLIN
;
610 fuse_log(FUSE_LOG_DEBUG
, "%s: Waiting for VU event\n", __func__
);
611 int poll_res
= ppoll(pf
, 1, NULL
, NULL
);
613 if (poll_res
== -1) {
614 if (errno
== EINTR
) {
615 fuse_log(FUSE_LOG_INFO
, "%s: ppoll interrupted, going around\n",
619 fuse_log(FUSE_LOG_ERR
, "virtio_loop ppoll: %m\n");
622 assert(poll_res
== 1);
623 if (pf
[0].revents
& (POLLERR
| POLLHUP
| POLLNVAL
)) {
624 fuse_log(FUSE_LOG_ERR
, "%s: Unexpected poll revents %x\n", __func__
,
628 assert(pf
[0].revents
& POLLIN
);
629 fuse_log(FUSE_LOG_DEBUG
, "%s: Got VU event\n", __func__
);
630 if (!vu_dispatch(&se
->virtio_dev
->dev
)) {
631 fuse_log(FUSE_LOG_ERR
, "%s: vu_dispatch failed\n", __func__
);
636 fuse_log(FUSE_LOG_INFO
, "%s: Exit\n", __func__
);
641 static int fv_create_listen_socket(struct fuse_session
*se
)
643 struct sockaddr_un un
;
646 /* Nothing to do if fd is already initialized */
647 if (se
->vu_listen_fd
>= 0) {
651 if (strlen(se
->vu_socket_path
) >= sizeof(un
.sun_path
)) {
652 fuse_log(FUSE_LOG_ERR
, "Socket path too long\n");
657 * Create the Unix socket to communicate with qemu
658 * based on QEMU's vhost-user-bridge
660 unlink(se
->vu_socket_path
);
661 strcpy(un
.sun_path
, se
->vu_socket_path
);
662 size_t addr_len
= sizeof(un
);
664 int listen_sock
= socket(AF_UNIX
, SOCK_STREAM
, 0);
665 if (listen_sock
== -1) {
666 fuse_log(FUSE_LOG_ERR
, "vhost socket creation: %m\n");
669 un
.sun_family
= AF_UNIX
;
672 * Unfortunately bind doesn't let you set the mask on the socket,
673 * so set umask to 077 and restore it later.
675 old_umask
= umask(0077);
676 if (bind(listen_sock
, (struct sockaddr
*)&un
, addr_len
) == -1) {
677 fuse_log(FUSE_LOG_ERR
, "vhost socket bind: %m\n");
683 if (listen(listen_sock
, 1) == -1) {
684 fuse_log(FUSE_LOG_ERR
, "vhost socket listen: %m\n");
688 se
->vu_listen_fd
= listen_sock
;
692 int virtio_session_mount(struct fuse_session
*se
)
696 ret
= fv_create_listen_socket(se
);
703 fuse_log(FUSE_LOG_INFO
, "%s: Waiting for vhost-user socket connection...\n",
705 int data_sock
= accept(se
->vu_listen_fd
, NULL
, NULL
);
706 if (data_sock
== -1) {
707 fuse_log(FUSE_LOG_ERR
, "vhost socket accept: %m\n");
708 close(se
->vu_listen_fd
);
711 close(se
->vu_listen_fd
);
712 se
->vu_listen_fd
= -1;
713 fuse_log(FUSE_LOG_INFO
, "%s: Received vhost-user socket connection\n",
716 /* TODO: Some cleanup/deallocation! */
717 se
->virtio_dev
= calloc(sizeof(struct fv_VuDev
), 1);
718 if (!se
->virtio_dev
) {
719 fuse_log(FUSE_LOG_ERR
, "%s: virtio_dev calloc failed\n", __func__
);
724 se
->vu_socketfd
= data_sock
;
725 se
->virtio_dev
->se
= se
;
726 vu_init(&se
->virtio_dev
->dev
, 2, se
->vu_socketfd
, fv_panic
, fv_set_watch
,
727 fv_remove_watch
, &fv_iface
);