/*
 * virtio-fs glue for FUSE
 * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *   Dave Gilbert <dgilbert@redhat.com>
 *
 * Implements the glue between libfuse and libvhost-user
 *
 * This program can be distributed under the terms of the GNU LGPLv2.
 * See the file COPYING.LIB
 */
14 #include "qemu/osdep.h"
16 #include "qapi/error.h"
18 #include "standard-headers/linux/fuse.h"
19 #include "fuse_misc.h"
21 #include "fuse_virtio.h"
30 #include <sys/eventfd.h>
31 #include <sys/socket.h>
32 #include <sys/types.h>
34 #include <sys/types.h>
38 #include "contrib/libvhost-user/libvhost-user.h"
struct fv_VuDev;

/* Book-keeping for one started virtqueue and the thread that services it. */
struct fv_QueueInfo {
    /* Handle of the thread created to run fv_queue_thread() */
    pthread_t thread;
    /*
     * Guards the VuVirtq so that fv_queue_thread() and fv_queue_worker()
     * never touch it concurrently.
     */
    pthread_mutex_t vq_lock;

    /* Owning device */
    struct fv_VuDev *virtio_dev;

    /* Queue index; identical to this entry's slot in fv_VuDev.qi[] */
    int qidx;
    /* Kick eventfd, copied from the VuVirtq when the queue is started */
    int kick_fd;
    /* Eventfd written to make the queue thread exit */
    int kill_fd;
};
62 /* Used to complete requests that involve no reply */
67 * We pass the dev element into libvhost-user
68 * and then use it to get back to the outer
69 * container for other data.
73 struct fuse_session
*se
;
76 * Either handle virtqueues or vhost-user protocol messages. Don't do
77 * both at the same time since that could lead to race conditions if
78 * virtqueues or memory tables change while another thread is accessing
81 * The assumptions are:
82 * 1. fv_queue_thread() reads/writes to virtqueues and only reads VuDev.
83 * 2. virtio_loop() reads/writes virtqueues and VuDev.
85 pthread_rwlock_t vu_dispatch_rwlock
;
88 * The following pair of fields are only accessed in the main
92 struct fv_QueueInfo
**qi
;
/*
 * Device configuration layout.
 * NOTE(review): the members are not visible in this extract; they follow
 * the virtio-fs specification layout - confirm against the header in use.
 */
struct virtio_fs_config {
    char tag[36];
    uint32_t num_queues;
};
101 /* Callback from libvhost-user */
102 static uint64_t fv_get_features(VuDev
*dev
)
104 return 1ULL << VIRTIO_F_VERSION_1
;
107 /* Callback from libvhost-user */
108 static void fv_set_features(VuDev
*dev
, uint64_t features
)
113 * Callback from libvhost-user if there's a new fd we're supposed to listen
114 * to, typically a queue kick?
116 static void fv_set_watch(VuDev
*dev
, int fd
, int condition
, vu_watch_cb cb
,
119 fuse_log(FUSE_LOG_WARNING
, "%s: TODO! fd=%d\n", __func__
, fd
);
123 * Callback from libvhost-user if we're no longer supposed to listen on an fd
125 static void fv_remove_watch(VuDev
*dev
, int fd
)
127 fuse_log(FUSE_LOG_WARNING
, "%s: TODO! fd=%d\n", __func__
, fd
);
130 /* Callback from libvhost-user to panic */
131 static void fv_panic(VuDev
*dev
, const char *err
)
133 fuse_log(FUSE_LOG_ERR
, "%s: libvhost-user: %s\n", __func__
, err
);
134 /* TODO: Allow reconnects?? */
139 * Copy from an iovec into a fuse_buf (memory only)
140 * Caller must ensure there is space
142 static void copy_from_iov(struct fuse_buf
*buf
, size_t out_num
,
143 const struct iovec
*out_sg
)
145 void *dest
= buf
->mem
;
148 size_t onelen
= out_sg
->iov_len
;
149 memcpy(dest
, out_sg
->iov_base
, onelen
);
/*
 * Copy 'to_copy' bytes from the scatter list 'src_iov' into the scatter
 * list 'dst_iov'.  Element boundaries of the two lists need not line up: a
 * source element may be spread over several destination elements and vice
 * versa.
 *
 * The caller must have checked that both lists hold at least 'to_copy'
 * bytes; running out of elements on either side trips an assert.
 */
static void copy_iov(struct iovec *src_iov, int src_count,
                     struct iovec *dst_iov, int dst_count, size_t to_copy)
{
    size_t dst_offset = 0;

    /* Outer loop walks the 'src' elements */
    while (to_copy) {
        assert(src_count);
        size_t src_len = src_iov[0].iov_len;
        size_t src_offset = 0;

        /* Never copy more than was asked for out of the last element */
        if (src_len > to_copy) {
            src_len = to_copy;
        }

        /* Inner loop spreads one 'src' element over one or more 'dst' */
        while (src_len) {
            assert(dst_count);
            size_t dst_len = dst_iov[0].iov_len - dst_offset;
            if (dst_len > src_len) {
                dst_len = src_len;
            }

            /*
             * Byte-pointer arithmetic: adding an offset to a 'void *' is a
             * GNU extension, so go through 'char *' explicitly.
             */
            memcpy((char *)dst_iov[0].iov_base + dst_offset,
                   (const char *)src_iov[0].iov_base + src_offset, dst_len);
            src_len -= dst_len;
            to_copy -= dst_len;
            src_offset += dst_len;
            dst_offset += dst_len;

            assert(dst_offset <= dst_iov[0].iov_len);
            if (dst_offset == dst_iov[0].iov_len) {
                /* Current destination element is full; move to the next */
                dst_offset = 0;
                dst_iov++;
                dst_count--;
            }
        }
        src_iov++;
        src_count--;
    }
}
201 * Called back by ll whenever it wants to send a reply/message back
202 * The 1st element of the iov starts with the fuse_out_header
203 * 'unique'==0 means it's a notify message.
205 int virtio_send_msg(struct fuse_session
*se
, struct fuse_chan
*ch
,
206 struct iovec
*iov
, int count
)
208 FVRequest
*req
= container_of(ch
, FVRequest
, ch
);
209 struct fv_QueueInfo
*qi
= ch
->qi
;
210 VuDev
*dev
= &se
->virtio_dev
->dev
;
211 VuVirtq
*q
= vu_get_queue(dev
, qi
->qidx
);
212 VuVirtqElement
*elem
= &req
->elem
;
216 assert(iov
[0].iov_len
>= sizeof(struct fuse_out_header
));
218 struct fuse_out_header
*out
= iov
[0].iov_base
;
219 /* TODO: Endianness! */
221 size_t tosend_len
= iov_size(iov
, count
);
223 /* unique == 0 is notification, which we don't support */
225 assert(!req
->reply_sent
);
227 /* The 'in' part of the elem is to qemu */
228 unsigned int in_num
= elem
->in_num
;
229 struct iovec
*in_sg
= elem
->in_sg
;
230 size_t in_len
= iov_size(in_sg
, in_num
);
231 fuse_log(FUSE_LOG_DEBUG
, "%s: elem %d: with %d in desc of length %zd\n",
232 __func__
, elem
->index
, in_num
, in_len
);
235 * The elem should have room for a 'fuse_out_header' (out from fuse)
236 * plus the data based on the len in the header.
238 if (in_len
< sizeof(struct fuse_out_header
)) {
239 fuse_log(FUSE_LOG_ERR
, "%s: elem %d too short for out_header\n",
240 __func__
, elem
->index
);
244 if (in_len
< tosend_len
) {
245 fuse_log(FUSE_LOG_ERR
, "%s: elem %d too small for data len %zd\n",
246 __func__
, elem
->index
, tosend_len
);
251 copy_iov(iov
, count
, in_sg
, in_num
, tosend_len
);
253 pthread_rwlock_rdlock(&qi
->virtio_dev
->vu_dispatch_rwlock
);
254 pthread_mutex_lock(&qi
->vq_lock
);
255 vu_queue_push(dev
, q
, elem
, tosend_len
);
256 vu_queue_notify(dev
, q
);
257 pthread_mutex_unlock(&qi
->vq_lock
);
258 pthread_rwlock_unlock(&qi
->virtio_dev
->vu_dispatch_rwlock
);
260 req
->reply_sent
= true;
267 * Callback from fuse_send_data_iov_* when it's virtio and the buffer
268 * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK
269 * We need send the iov and then the buffer.
270 * Return 0 on success
272 int virtio_send_data_iov(struct fuse_session
*se
, struct fuse_chan
*ch
,
273 struct iovec
*iov
, int count
, struct fuse_bufvec
*buf
,
276 FVRequest
*req
= container_of(ch
, FVRequest
, ch
);
277 struct fv_QueueInfo
*qi
= ch
->qi
;
278 VuDev
*dev
= &se
->virtio_dev
->dev
;
279 VuVirtq
*q
= vu_get_queue(dev
, qi
->qidx
);
280 VuVirtqElement
*elem
= &req
->elem
;
284 assert(iov
[0].iov_len
>= sizeof(struct fuse_out_header
));
286 struct fuse_out_header
*out
= iov
[0].iov_base
;
287 /* TODO: Endianness! */
289 size_t iov_len
= iov_size(iov
, count
);
290 size_t tosend_len
= iov_len
+ len
;
292 out
->len
= tosend_len
;
294 fuse_log(FUSE_LOG_DEBUG
, "%s: count=%d len=%zd iov_len=%zd\n", __func__
,
295 count
, len
, iov_len
);
297 /* unique == 0 is notification which we don't support */
300 assert(!req
->reply_sent
);
302 /* The 'in' part of the elem is to qemu */
303 unsigned int in_num
= elem
->in_num
;
304 struct iovec
*in_sg
= elem
->in_sg
;
305 size_t in_len
= iov_size(in_sg
, in_num
);
306 fuse_log(FUSE_LOG_DEBUG
, "%s: elem %d: with %d in desc of length %zd\n",
307 __func__
, elem
->index
, in_num
, in_len
);
310 * The elem should have room for a 'fuse_out_header' (out from fuse)
311 * plus the data based on the len in the header.
313 if (in_len
< sizeof(struct fuse_out_header
)) {
314 fuse_log(FUSE_LOG_ERR
, "%s: elem %d too short for out_header\n",
315 __func__
, elem
->index
);
319 if (in_len
< tosend_len
) {
320 fuse_log(FUSE_LOG_ERR
, "%s: elem %d too small for data len %zd\n",
321 __func__
, elem
->index
, tosend_len
);
326 /* TODO: Limit to 'len' */
328 /* First copy the header data from iov->in_sg */
329 copy_iov(iov
, count
, in_sg
, in_num
, iov_len
);
332 * Build a copy of the the in_sg iov so we can skip bits in it,
333 * including changing the offsets
335 struct iovec
*in_sg_cpy
= calloc(sizeof(struct iovec
), in_num
);
337 memcpy(in_sg_cpy
, in_sg
, sizeof(struct iovec
) * in_num
);
338 /* These get updated as we skip */
339 struct iovec
*in_sg_ptr
= in_sg_cpy
;
340 int in_sg_cpy_count
= in_num
;
342 /* skip over parts of in_sg that contained the header iov */
343 size_t skip_size
= iov_len
;
345 size_t in_sg_left
= 0;
347 while (skip_size
!= 0 && in_sg_cpy_count
) {
348 if (skip_size
>= in_sg_ptr
[0].iov_len
) {
349 skip_size
-= in_sg_ptr
[0].iov_len
;
353 in_sg_ptr
[0].iov_len
-= skip_size
;
354 in_sg_ptr
[0].iov_base
+= skip_size
;
360 for (i
= 0, in_sg_left
= 0; i
< in_sg_cpy_count
; i
++) {
361 in_sg_left
+= in_sg_ptr
[i
].iov_len
;
363 fuse_log(FUSE_LOG_DEBUG
,
364 "%s: after skip skip_size=%zd in_sg_cpy_count=%d "
366 __func__
, skip_size
, in_sg_cpy_count
, in_sg_left
);
367 ret
= preadv(buf
->buf
[0].fd
, in_sg_ptr
, in_sg_cpy_count
,
372 fuse_log(FUSE_LOG_DEBUG
, "%s: preadv failed (%m) len=%zd\n",
377 fuse_log(FUSE_LOG_DEBUG
, "%s: preadv ret=%d len=%zd\n", __func__
,
379 if (ret
< len
&& ret
) {
380 fuse_log(FUSE_LOG_DEBUG
, "%s: ret < len\n", __func__
);
381 /* Skip over this much next time around */
383 buf
->buf
[0].pos
+= ret
;
386 /* Lets do another read */
391 fuse_log(FUSE_LOG_DEBUG
, "%s: !ret in_sg_left=%zd\n", __func__
,
396 fuse_log(FUSE_LOG_DEBUG
, "%s: ret!=len\n", __func__
);
403 } while (in_sg_left
);
406 /* Need to fix out->len on EOF */
408 struct fuse_out_header
*out_sg
= in_sg
[0].iov_base
;
411 out_sg
->len
= tosend_len
;
416 pthread_rwlock_rdlock(&qi
->virtio_dev
->vu_dispatch_rwlock
);
417 pthread_mutex_lock(&qi
->vq_lock
);
418 vu_queue_push(dev
, q
, elem
, tosend_len
);
419 vu_queue_notify(dev
, q
);
420 pthread_mutex_unlock(&qi
->vq_lock
);
421 pthread_rwlock_unlock(&qi
->virtio_dev
->vu_dispatch_rwlock
);
425 req
->reply_sent
= true;
431 static __thread
bool clone_fs_called
;
433 /* Process one FVRequest in a thread pool */
434 static void fv_queue_worker(gpointer data
, gpointer user_data
)
436 struct fv_QueueInfo
*qi
= user_data
;
437 struct fuse_session
*se
= qi
->virtio_dev
->se
;
438 struct VuDev
*dev
= &qi
->virtio_dev
->dev
;
439 FVRequest
*req
= data
;
440 VuVirtqElement
*elem
= &req
->elem
;
441 struct fuse_buf fbuf
= {};
442 bool allocated_bufv
= false;
443 struct fuse_bufvec bufv
;
444 struct fuse_bufvec
*pbufv
;
446 assert(se
->bufsize
> sizeof(struct fuse_in_header
));
448 if (!clone_fs_called
) {
451 /* unshare FS for xattr operation */
452 ret
= unshare(CLONE_FS
);
453 /* should not fail */
456 clone_fs_called
= true;
460 * An element contains one request and the space to send our response
461 * They're spread over multiple descriptors in a scatter/gather set
462 * and we can't trust the guest to keep them still; so copy in/out.
464 fbuf
.mem
= malloc(se
->bufsize
);
467 fuse_mutex_init(&req
->ch
.lock
);
471 /* The 'out' part of the elem is from qemu */
472 unsigned int out_num
= elem
->out_num
;
473 struct iovec
*out_sg
= elem
->out_sg
;
474 size_t out_len
= iov_size(out_sg
, out_num
);
475 fuse_log(FUSE_LOG_DEBUG
,
476 "%s: elem %d: with %d out desc of length %zd\n",
477 __func__
, elem
->index
, out_num
, out_len
);
480 * The elem should contain a 'fuse_in_header' (in to fuse)
481 * plus the data based on the len in the header.
483 if (out_len
< sizeof(struct fuse_in_header
)) {
484 fuse_log(FUSE_LOG_ERR
, "%s: elem %d too short for in_header\n",
485 __func__
, elem
->index
);
486 assert(0); /* TODO */
488 if (out_len
> se
->bufsize
) {
489 fuse_log(FUSE_LOG_ERR
, "%s: elem %d too large for buffer\n", __func__
,
491 assert(0); /* TODO */
493 /* Copy just the first element and look at it */
494 copy_from_iov(&fbuf
, 1, out_sg
);
496 pbufv
= NULL
; /* Compiler thinks an unitialised path */
498 out_sg
[0].iov_len
== sizeof(struct fuse_in_header
) &&
499 ((struct fuse_in_header
*)fbuf
.mem
)->opcode
== FUSE_WRITE
&&
500 out_sg
[1].iov_len
== sizeof(struct fuse_write_in
)) {
502 * For a write we don't actually need to copy the
503 * data, we can just do it straight out of guest memory
504 * but we must still copy the headers in case the guest
505 * was nasty and changed them while we were using them.
507 fuse_log(FUSE_LOG_DEBUG
, "%s: Write special case\n", __func__
);
509 /* copy the fuse_write_in header afte rthe fuse_in_header */
510 fbuf
.mem
+= out_sg
->iov_len
;
511 copy_from_iov(&fbuf
, 1, out_sg
+ 1);
512 fbuf
.mem
-= out_sg
->iov_len
;
513 fbuf
.size
= out_sg
[0].iov_len
+ out_sg
[1].iov_len
;
515 /* Allocate the bufv, with space for the rest of the iov */
516 pbufv
= malloc(sizeof(struct fuse_bufvec
) +
517 sizeof(struct fuse_buf
) * (out_num
- 2));
519 fuse_log(FUSE_LOG_ERR
, "%s: pbufv malloc failed\n",
524 allocated_bufv
= true;
526 pbufv
->buf
[0] = fbuf
;
528 size_t iovindex
, pbufvindex
;
529 iovindex
= 2; /* 2 headers, separate iovs */
530 pbufvindex
= 1; /* 2 headers, 1 fusebuf */
532 for (; iovindex
< out_num
; iovindex
++, pbufvindex
++) {
534 pbufv
->buf
[pbufvindex
].pos
= ~0; /* Dummy */
535 pbufv
->buf
[pbufvindex
].flags
= 0;
536 pbufv
->buf
[pbufvindex
].mem
= out_sg
[iovindex
].iov_base
;
537 pbufv
->buf
[pbufvindex
].size
= out_sg
[iovindex
].iov_len
;
540 /* Normal (non fast write) path */
542 /* Copy the rest of the buffer */
543 fbuf
.mem
+= out_sg
->iov_len
;
544 copy_from_iov(&fbuf
, out_num
- 1, out_sg
+ 1);
545 fbuf
.mem
-= out_sg
->iov_len
;
548 /* TODO! Endianness of header */
550 /* TODO: Add checks for fuse_session_exited */
557 fuse_session_process_buf_int(se
, pbufv
, &req
->ch
);
560 if (allocated_bufv
) {
564 /* If the request has no reply, still recycle the virtqueue element */
565 if (!req
->reply_sent
) {
566 struct VuVirtq
*q
= vu_get_queue(dev
, qi
->qidx
);
568 fuse_log(FUSE_LOG_DEBUG
, "%s: elem %d no reply sent\n", __func__
,
571 pthread_rwlock_rdlock(&qi
->virtio_dev
->vu_dispatch_rwlock
);
572 pthread_mutex_lock(&qi
->vq_lock
);
573 vu_queue_push(dev
, q
, elem
, 0);
574 vu_queue_notify(dev
, q
);
575 pthread_mutex_unlock(&qi
->vq_lock
);
576 pthread_rwlock_unlock(&qi
->virtio_dev
->vu_dispatch_rwlock
);
579 pthread_mutex_destroy(&req
->ch
.lock
);
584 /* Thread function for individual queues, created when a queue is 'started' */
585 static void *fv_queue_thread(void *opaque
)
587 struct fv_QueueInfo
*qi
= opaque
;
588 struct VuDev
*dev
= &qi
->virtio_dev
->dev
;
589 struct VuVirtq
*q
= vu_get_queue(dev
, qi
->qidx
);
590 struct fuse_session
*se
= qi
->virtio_dev
->se
;
593 pool
= g_thread_pool_new(fv_queue_worker
, qi
, se
->thread_pool_size
, FALSE
,
596 fuse_log(FUSE_LOG_ERR
, "%s: g_thread_pool_new failed\n", __func__
);
600 fuse_log(FUSE_LOG_INFO
, "%s: Start for queue %d kick_fd %d\n", __func__
,
601 qi
->qidx
, qi
->kick_fd
);
606 pf
[0].fd
= qi
->kick_fd
;
607 pf
[0].events
= POLLIN
;
609 pf
[1].fd
= qi
->kill_fd
;
610 pf
[1].events
= POLLIN
;
613 fuse_log(FUSE_LOG_DEBUG
, "%s: Waiting for Queue %d event\n", __func__
,
615 int poll_res
= ppoll(pf
, 2, NULL
, NULL
);
617 if (poll_res
== -1) {
618 if (errno
== EINTR
) {
619 fuse_log(FUSE_LOG_INFO
, "%s: ppoll interrupted, going around\n",
623 fuse_log(FUSE_LOG_ERR
, "fv_queue_thread ppoll: %m\n");
626 assert(poll_res
>= 1);
627 if (pf
[0].revents
& (POLLERR
| POLLHUP
| POLLNVAL
)) {
628 fuse_log(FUSE_LOG_ERR
, "%s: Unexpected poll revents %x Queue %d\n",
629 __func__
, pf
[0].revents
, qi
->qidx
);
632 if (pf
[1].revents
& (POLLERR
| POLLHUP
| POLLNVAL
)) {
633 fuse_log(FUSE_LOG_ERR
,
634 "%s: Unexpected poll revents %x Queue %d killfd\n",
635 __func__
, pf
[1].revents
, qi
->qidx
);
639 fuse_log(FUSE_LOG_INFO
, "%s: kill event on queue %d - quitting\n",
643 assert(pf
[0].revents
& POLLIN
);
644 fuse_log(FUSE_LOG_DEBUG
, "%s: Got queue event on Queue %d\n", __func__
,
648 if (eventfd_read(qi
->kick_fd
, &evalue
)) {
649 fuse_log(FUSE_LOG_ERR
, "Eventfd_read for queue: %m\n");
652 /* Mutual exclusion with virtio_loop() */
653 ret
= pthread_rwlock_rdlock(&qi
->virtio_dev
->vu_dispatch_rwlock
);
654 assert(ret
== 0); /* there is no possible error case */
655 pthread_mutex_lock(&qi
->vq_lock
);
656 /* out is from guest, in is too guest */
657 unsigned int in_bytes
, out_bytes
;
658 vu_queue_get_avail_bytes(dev
, q
, &in_bytes
, &out_bytes
, ~0, ~0);
660 fuse_log(FUSE_LOG_DEBUG
,
661 "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n",
662 __func__
, qi
->qidx
, (size_t)evalue
, in_bytes
, out_bytes
);
665 FVRequest
*req
= vu_queue_pop(dev
, q
, sizeof(FVRequest
));
670 req
->reply_sent
= false;
672 g_thread_pool_push(pool
, req
, NULL
);
675 pthread_mutex_unlock(&qi
->vq_lock
);
676 pthread_rwlock_unlock(&qi
->virtio_dev
->vu_dispatch_rwlock
);
679 g_thread_pool_free(pool
, FALSE
, TRUE
);
684 static void fv_queue_cleanup_thread(struct fv_VuDev
*vud
, int qidx
)
687 struct fv_QueueInfo
*ourqi
;
689 assert(qidx
< vud
->nqueues
);
690 ourqi
= vud
->qi
[qidx
];
692 /* Kill the thread */
693 if (eventfd_write(ourqi
->kill_fd
, 1)) {
694 fuse_log(FUSE_LOG_ERR
, "Eventfd_write for queue %d: %s\n",
695 qidx
, strerror(errno
));
697 ret
= pthread_join(ourqi
->thread
, NULL
);
699 fuse_log(FUSE_LOG_ERR
, "%s: Failed to join thread idx %d err %d\n",
700 __func__
, qidx
, ret
);
702 pthread_mutex_destroy(&ourqi
->vq_lock
);
703 close(ourqi
->kill_fd
);
706 vud
->qi
[qidx
] = NULL
;
709 /* Callback from libvhost-user on start or stop of a queue */
710 static void fv_queue_set_started(VuDev
*dev
, int qidx
, bool started
)
712 struct fv_VuDev
*vud
= container_of(dev
, struct fv_VuDev
, dev
);
713 struct fv_QueueInfo
*ourqi
;
715 fuse_log(FUSE_LOG_INFO
, "%s: qidx=%d started=%d\n", __func__
, qidx
,
720 * Ignore additional request queues for now. passthrough_ll.c must be
721 * audited for thread-safety issues first. It was written with a
722 * well-behaved client in mind and may not protect against all types of
726 fuse_log(FUSE_LOG_ERR
,
727 "%s: multiple request queues not yet implemented, please only "
728 "configure 1 request queue\n",
734 /* Fire up a thread to watch this queue */
735 if (qidx
>= vud
->nqueues
) {
736 vud
->qi
= realloc(vud
->qi
, (qidx
+ 1) * sizeof(vud
->qi
[0]));
738 memset(vud
->qi
+ vud
->nqueues
, 0,
739 sizeof(vud
->qi
[0]) * (1 + (qidx
- vud
->nqueues
)));
740 vud
->nqueues
= qidx
+ 1;
742 if (!vud
->qi
[qidx
]) {
743 vud
->qi
[qidx
] = calloc(sizeof(struct fv_QueueInfo
), 1);
744 assert(vud
->qi
[qidx
]);
745 vud
->qi
[qidx
]->virtio_dev
= vud
;
746 vud
->qi
[qidx
]->qidx
= qidx
;
748 /* Shouldn't have been started */
749 assert(vud
->qi
[qidx
]->kick_fd
== -1);
751 ourqi
= vud
->qi
[qidx
];
752 ourqi
->kick_fd
= dev
->vq
[qidx
].kick_fd
;
754 ourqi
->kill_fd
= eventfd(0, EFD_CLOEXEC
| EFD_SEMAPHORE
);
755 assert(ourqi
->kill_fd
!= -1);
756 pthread_mutex_init(&ourqi
->vq_lock
, NULL
);
758 if (pthread_create(&ourqi
->thread
, NULL
, fv_queue_thread
, ourqi
)) {
759 fuse_log(FUSE_LOG_ERR
, "%s: Failed to create thread for queue %d\n",
764 fv_queue_cleanup_thread(vud
, qidx
);
768 static bool fv_queue_order(VuDev
*dev
, int qidx
)
773 static const VuDevIface fv_iface
= {
774 .get_features
= fv_get_features
,
775 .set_features
= fv_set_features
,
777 /* Don't need process message, we've not got any at vhost-user level */
778 .queue_set_started
= fv_queue_set_started
,
780 .queue_is_processed_in_order
= fv_queue_order
,
784 * Main loop; this mostly deals with events on the vhost-user
785 * socket itself, and not actual fuse data.
787 int virtio_loop(struct fuse_session
*se
)
789 fuse_log(FUSE_LOG_INFO
, "%s: Entry\n", __func__
);
791 while (!fuse_session_exited(se
)) {
795 pf
[0].fd
= se
->vu_socketfd
;
796 pf
[0].events
= POLLIN
;
799 fuse_log(FUSE_LOG_DEBUG
, "%s: Waiting for VU event\n", __func__
);
800 int poll_res
= ppoll(pf
, 1, NULL
, NULL
);
802 if (poll_res
== -1) {
803 if (errno
== EINTR
) {
804 fuse_log(FUSE_LOG_INFO
, "%s: ppoll interrupted, going around\n",
808 fuse_log(FUSE_LOG_ERR
, "virtio_loop ppoll: %m\n");
811 assert(poll_res
== 1);
812 if (pf
[0].revents
& (POLLERR
| POLLHUP
| POLLNVAL
)) {
813 fuse_log(FUSE_LOG_ERR
, "%s: Unexpected poll revents %x\n", __func__
,
817 assert(pf
[0].revents
& POLLIN
);
818 fuse_log(FUSE_LOG_DEBUG
, "%s: Got VU event\n", __func__
);
819 /* Mutual exclusion with fv_queue_thread() */
820 ret
= pthread_rwlock_wrlock(&se
->virtio_dev
->vu_dispatch_rwlock
);
821 assert(ret
== 0); /* there is no possible error case */
823 ok
= vu_dispatch(&se
->virtio_dev
->dev
);
825 pthread_rwlock_unlock(&se
->virtio_dev
->vu_dispatch_rwlock
);
828 fuse_log(FUSE_LOG_ERR
, "%s: vu_dispatch failed\n", __func__
);
834 * Make sure all fv_queue_thread()s quit on exit, as we're about to
835 * free virtio dev and fuse session, no one should access them anymore.
837 for (int i
= 0; i
< se
->virtio_dev
->nqueues
; i
++) {
838 if (!se
->virtio_dev
->qi
[i
]) {
842 fuse_log(FUSE_LOG_INFO
, "%s: Stopping queue %d thread\n", __func__
, i
);
843 fv_queue_cleanup_thread(se
->virtio_dev
, i
);
846 fuse_log(FUSE_LOG_INFO
, "%s: Exit\n", __func__
);
/* Replace, in place, every occurrence of 'old' in the string 's' by 'new'. */
static void strreplace(char *s, char old, char new)
{
    char *p = s;

    while (*p != '\0') {
        if (*p == old) {
            *p = new;
        }
        p++;
    }
}
860 static bool fv_socket_lock(struct fuse_session
*se
)
862 g_autofree gchar
*sk_name
= NULL
;
863 g_autofree gchar
*pidfile
= NULL
;
864 g_autofree gchar
*dir
= NULL
;
865 Error
*local_err
= NULL
;
867 dir
= qemu_get_local_state_pathname("run/virtiofsd");
869 if (g_mkdir_with_parents(dir
, S_IRWXU
) < 0) {
870 fuse_log(FUSE_LOG_ERR
, "%s: Failed to create directory %s: %s",
871 __func__
, dir
, strerror(errno
));
875 sk_name
= g_strdup(se
->vu_socket_path
);
876 strreplace(sk_name
, '/', '.');
877 pidfile
= g_strdup_printf("%s/%s.pid", dir
, sk_name
);
879 if (!qemu_write_pidfile(pidfile
, &local_err
)) {
880 error_report_err(local_err
);
887 static int fv_create_listen_socket(struct fuse_session
*se
)
889 struct sockaddr_un un
;
892 /* Nothing to do if fd is already initialized */
893 if (se
->vu_listen_fd
>= 0) {
897 if (strlen(se
->vu_socket_path
) >= sizeof(un
.sun_path
)) {
898 fuse_log(FUSE_LOG_ERR
, "Socket path too long\n");
902 if (!strlen(se
->vu_socket_path
)) {
903 fuse_log(FUSE_LOG_ERR
, "Socket path is empty\n");
907 /* Check the vu_socket_path is already used */
908 if (!fv_socket_lock(se
)) {
913 * Create the Unix socket to communicate with qemu
914 * based on QEMU's vhost-user-bridge
916 unlink(se
->vu_socket_path
);
917 strcpy(un
.sun_path
, se
->vu_socket_path
);
918 size_t addr_len
= sizeof(un
);
920 int listen_sock
= socket(AF_UNIX
, SOCK_STREAM
, 0);
921 if (listen_sock
== -1) {
922 fuse_log(FUSE_LOG_ERR
, "vhost socket creation: %m\n");
925 un
.sun_family
= AF_UNIX
;
928 * Unfortunately bind doesn't let you set the mask on the socket,
929 * so set umask appropriately and restore it later.
931 if (se
->vu_socket_group
) {
932 old_umask
= umask(S_IROTH
| S_IWOTH
| S_IXOTH
);
934 old_umask
= umask(S_IRGRP
| S_IWGRP
| S_IXGRP
|
935 S_IROTH
| S_IWOTH
| S_IXOTH
);
937 if (bind(listen_sock
, (struct sockaddr
*)&un
, addr_len
) == -1) {
938 fuse_log(FUSE_LOG_ERR
, "vhost socket bind: %m\n");
943 if (se
->vu_socket_group
) {
944 struct group
*g
= getgrnam(se
->vu_socket_group
);
946 if (!chown(se
->vu_socket_path
, -1, g
->gr_gid
)) {
947 fuse_log(FUSE_LOG_WARNING
,
948 "vhost socket failed to set group to %s (%d)\n",
949 se
->vu_socket_group
, g
->gr_gid
);
955 if (listen(listen_sock
, 1) == -1) {
956 fuse_log(FUSE_LOG_ERR
, "vhost socket listen: %m\n");
961 se
->vu_listen_fd
= listen_sock
;
965 int virtio_session_mount(struct fuse_session
*se
)
970 * Test that unshare(CLONE_FS) works. fv_queue_worker() will need it. It's
971 * an unprivileged system call but some Docker/Moby versions are known to
972 * reject it via seccomp when CAP_SYS_ADMIN is not given.
974 * Note that the program is single-threaded here so this syscall has no
975 * visible effect and is safe to make.
977 ret
= unshare(CLONE_FS
);
978 if (ret
== -1 && errno
== EPERM
) {
979 fuse_log(FUSE_LOG_ERR
, "unshare(CLONE_FS) failed with EPERM. If "
980 "running in a container please check that the container "
981 "runtime seccomp policy allows unshare.\n");
985 ret
= fv_create_listen_socket(se
);
992 fuse_log(FUSE_LOG_INFO
, "%s: Waiting for vhost-user socket connection...\n",
994 int data_sock
= accept(se
->vu_listen_fd
, NULL
, NULL
);
995 if (data_sock
== -1) {
996 fuse_log(FUSE_LOG_ERR
, "vhost socket accept: %m\n");
997 close(se
->vu_listen_fd
);
1000 close(se
->vu_listen_fd
);
1001 se
->vu_listen_fd
= -1;
1002 fuse_log(FUSE_LOG_INFO
, "%s: Received vhost-user socket connection\n",
1005 /* TODO: Some cleanup/deallocation! */
1006 se
->virtio_dev
= calloc(sizeof(struct fv_VuDev
), 1);
1007 if (!se
->virtio_dev
) {
1008 fuse_log(FUSE_LOG_ERR
, "%s: virtio_dev calloc failed\n", __func__
);
1013 se
->vu_socketfd
= data_sock
;
1014 se
->virtio_dev
->se
= se
;
1015 pthread_rwlock_init(&se
->virtio_dev
->vu_dispatch_rwlock
, NULL
);
1016 vu_init(&se
->virtio_dev
->dev
, 2, se
->vu_socketfd
, fv_panic
, NULL
,
1017 fv_set_watch
, fv_remove_watch
, &fv_iface
);
1022 void virtio_session_close(struct fuse_session
*se
)
1024 close(se
->vu_socketfd
);
1026 if (!se
->virtio_dev
) {
1030 free(se
->virtio_dev
->qi
);
1031 pthread_rwlock_destroy(&se
->virtio_dev
->vu_dispatch_rwlock
);
1032 free(se
->virtio_dev
);
1033 se
->virtio_dev
= NULL
;