configure: Improve TCI feature description
[qemu/ar7.git] / tools / virtiofsd / fuse_virtio.c
blobddcefee4272f7f448c1fc9a89c59859c4782590d
/*
 * virtio-fs glue for FUSE
 * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *   Dave Gilbert  <dgilbert@redhat.com>
 *
 * Implements the glue between libfuse and libvhost-user
 *
 * This program can be distributed under the terms of the GNU LGPLv2.
 * See the file COPYING.LIB
 */
14 #include "qemu/osdep.h"
15 #include "qemu/iov.h"
16 #include "qapi/error.h"
17 #include "fuse_i.h"
18 #include "standard-headers/linux/fuse.h"
19 #include "fuse_misc.h"
20 #include "fuse_opt.h"
21 #include "fuse_virtio.h"
23 #include <sys/eventfd.h>
24 #include <sys/socket.h>
25 #include <sys/un.h>
26 #include <grp.h>
28 #include "libvhost-user.h"
struct fv_VuDev;

/* Book-keeping for one started virtqueue and the thread servicing it. */
struct fv_QueueInfo {
    /* Thread created to run fv_queue_thread() for this queue */
    pthread_t thread;
    /*
     * Serialises access to the VuVirtq so that fv_queue_thread() and
     * fv_queue_worker() do not race on it.
     */
    pthread_mutex_t vq_lock;

    /* Owning device */
    struct fv_VuDev *virtio_dev;

    /* Queue index; identical to this entry's position in fv_VuDev.qi[] */
    int qidx;
    /* Kick fd copied from the libvhost-user queue, polled for new work */
    int kick_fd;
    /* Eventfd written to when the queue thread must terminate */
    int kill_fd;
};
47 /* A FUSE request */
48 typedef struct {
49 VuVirtqElement elem;
50 struct fuse_chan ch;
52 /* Used to complete requests that involve no reply */
53 bool reply_sent;
54 } FVRequest;
57 * We pass the dev element into libvhost-user
58 * and then use it to get back to the outer
59 * container for other data.
61 struct fv_VuDev {
62 VuDev dev;
63 struct fuse_session *se;
66 * Either handle virtqueues or vhost-user protocol messages. Don't do
67 * both at the same time since that could lead to race conditions if
68 * virtqueues or memory tables change while another thread is accessing
69 * them.
71 * The assumptions are:
72 * 1. fv_queue_thread() reads/writes to virtqueues and only reads VuDev.
73 * 2. virtio_loop() reads/writes virtqueues and VuDev.
75 pthread_rwlock_t vu_dispatch_rwlock;
78 * The following pair of fields are only accessed in the main
79 * virtio_loop
81 size_t nqueues;
82 struct fv_QueueInfo **qi;
/* Device configuration layout, taken from the virtio-fs specification. */
struct virtio_fs_config {
    /* Tag field, 36 bytes */
    char tag[36];
    /* Number of queues */
    uint32_t num_queues;
};
91 /* Callback from libvhost-user */
92 static uint64_t fv_get_features(VuDev *dev)
94 return 1ULL << VIRTIO_F_VERSION_1;
97 /* Callback from libvhost-user */
98 static void fv_set_features(VuDev *dev, uint64_t features)
103 * Callback from libvhost-user if there's a new fd we're supposed to listen
104 * to, typically a queue kick?
106 static void fv_set_watch(VuDev *dev, int fd, int condition, vu_watch_cb cb,
107 void *data)
109 fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
113 * Callback from libvhost-user if we're no longer supposed to listen on an fd
115 static void fv_remove_watch(VuDev *dev, int fd)
117 fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
120 /* Callback from libvhost-user to panic */
121 static void fv_panic(VuDev *dev, const char *err)
123 fuse_log(FUSE_LOG_ERR, "%s: libvhost-user: %s\n", __func__, err);
124 /* TODO: Allow reconnects?? */
125 exit(EXIT_FAILURE);
129 * Copy from an iovec into a fuse_buf (memory only)
130 * Caller must ensure there is space
132 static void copy_from_iov(struct fuse_buf *buf, size_t out_num,
133 const struct iovec *out_sg)
135 void *dest = buf->mem;
137 while (out_num) {
138 size_t onelen = out_sg->iov_len;
139 memcpy(dest, out_sg->iov_base, onelen);
140 dest += onelen;
141 out_sg++;
142 out_num--;
/*
 * Copy exactly 'to_copy' bytes from the scatter list 'src_iov' into the
 * scatter list 'dst_iov'.  Entry boundaries of the two lists need not line
 * up.  The caller must already have verified that both lists are long
 * enough; running out of entries trips an assertion.
 */
static void copy_iov(struct iovec *src_iov, int src_count,
                     struct iovec *dst_iov, int dst_count, size_t to_copy)
{
    /* Bytes already written into the current destination entry */
    size_t dst_used = 0;

    /* One pass of this loop consumes (at most) one source entry */
    for (; to_copy != 0; src_iov++, src_count--) {
        assert(src_count);
        const char *src_base = src_iov->iov_base;
        size_t src_done = 0;
        size_t src_todo =
            src_iov->iov_len > to_copy ? to_copy : src_iov->iov_len;

        /* Spread this source entry over as many dst entries as needed */
        while (src_todo != 0) {
            assert(dst_count);
            size_t room = dst_iov->iov_len - dst_used;
            size_t chunk = room > src_todo ? src_todo : room;

            memcpy((char *)dst_iov->iov_base + dst_used, src_base + src_done,
                   chunk);
            src_todo -= chunk;
            to_copy -= chunk;
            src_done += chunk;
            dst_used += chunk;

            assert(dst_used <= dst_iov->iov_len);
            if (dst_used == dst_iov->iov_len) {
                /* Current destination entry is full, move to the next */
                dst_used = 0;
                dst_iov++;
                dst_count--;
            }
        }
    }
}
191 * Called back by ll whenever it wants to send a reply/message back
192 * The 1st element of the iov starts with the fuse_out_header
193 * 'unique'==0 means it's a notify message.
195 int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch,
196 struct iovec *iov, int count)
198 FVRequest *req = container_of(ch, FVRequest, ch);
199 struct fv_QueueInfo *qi = ch->qi;
200 VuDev *dev = &se->virtio_dev->dev;
201 VuVirtq *q = vu_get_queue(dev, qi->qidx);
202 VuVirtqElement *elem = &req->elem;
203 int ret = 0;
205 assert(count >= 1);
206 assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
208 struct fuse_out_header *out = iov[0].iov_base;
209 /* TODO: Endianness! */
211 size_t tosend_len = iov_size(iov, count);
213 /* unique == 0 is notification, which we don't support */
214 assert(out->unique);
215 assert(!req->reply_sent);
217 /* The 'in' part of the elem is to qemu */
218 unsigned int in_num = elem->in_num;
219 struct iovec *in_sg = elem->in_sg;
220 size_t in_len = iov_size(in_sg, in_num);
221 fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
222 __func__, elem->index, in_num, in_len);
225 * The elem should have room for a 'fuse_out_header' (out from fuse)
226 * plus the data based on the len in the header.
228 if (in_len < sizeof(struct fuse_out_header)) {
229 fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
230 __func__, elem->index);
231 ret = -E2BIG;
232 goto err;
234 if (in_len < tosend_len) {
235 fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
236 __func__, elem->index, tosend_len);
237 ret = -E2BIG;
238 goto err;
241 copy_iov(iov, count, in_sg, in_num, tosend_len);
243 pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
244 pthread_mutex_lock(&qi->vq_lock);
245 vu_queue_push(dev, q, elem, tosend_len);
246 vu_queue_notify(dev, q);
247 pthread_mutex_unlock(&qi->vq_lock);
248 pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
250 req->reply_sent = true;
252 err:
253 return ret;
257 * Callback from fuse_send_data_iov_* when it's virtio and the buffer
258 * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK
259 * We need send the iov and then the buffer.
260 * Return 0 on success
262 int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
263 struct iovec *iov, int count, struct fuse_bufvec *buf,
264 size_t len)
266 FVRequest *req = container_of(ch, FVRequest, ch);
267 struct fv_QueueInfo *qi = ch->qi;
268 VuDev *dev = &se->virtio_dev->dev;
269 VuVirtq *q = vu_get_queue(dev, qi->qidx);
270 VuVirtqElement *elem = &req->elem;
271 int ret = 0;
273 assert(count >= 1);
274 assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
276 struct fuse_out_header *out = iov[0].iov_base;
277 /* TODO: Endianness! */
279 size_t iov_len = iov_size(iov, count);
280 size_t tosend_len = iov_len + len;
282 out->len = tosend_len;
284 fuse_log(FUSE_LOG_DEBUG, "%s: count=%d len=%zd iov_len=%zd\n", __func__,
285 count, len, iov_len);
287 /* unique == 0 is notification which we don't support */
288 assert(out->unique);
290 assert(!req->reply_sent);
292 /* The 'in' part of the elem is to qemu */
293 unsigned int in_num = elem->in_num;
294 struct iovec *in_sg = elem->in_sg;
295 size_t in_len = iov_size(in_sg, in_num);
296 fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
297 __func__, elem->index, in_num, in_len);
300 * The elem should have room for a 'fuse_out_header' (out from fuse)
301 * plus the data based on the len in the header.
303 if (in_len < sizeof(struct fuse_out_header)) {
304 fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
305 __func__, elem->index);
306 ret = E2BIG;
307 goto err;
309 if (in_len < tosend_len) {
310 fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
311 __func__, elem->index, tosend_len);
312 ret = E2BIG;
313 goto err;
316 /* TODO: Limit to 'len' */
318 /* First copy the header data from iov->in_sg */
319 copy_iov(iov, count, in_sg, in_num, iov_len);
322 * Build a copy of the the in_sg iov so we can skip bits in it,
323 * including changing the offsets
325 struct iovec *in_sg_cpy = calloc(sizeof(struct iovec), in_num);
326 assert(in_sg_cpy);
327 memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num);
328 /* These get updated as we skip */
329 struct iovec *in_sg_ptr = in_sg_cpy;
330 int in_sg_cpy_count = in_num;
332 /* skip over parts of in_sg that contained the header iov */
333 size_t skip_size = iov_len;
335 size_t in_sg_left = 0;
336 do {
337 while (skip_size != 0 && in_sg_cpy_count) {
338 if (skip_size >= in_sg_ptr[0].iov_len) {
339 skip_size -= in_sg_ptr[0].iov_len;
340 in_sg_ptr++;
341 in_sg_cpy_count--;
342 } else {
343 in_sg_ptr[0].iov_len -= skip_size;
344 in_sg_ptr[0].iov_base += skip_size;
345 break;
349 int i;
350 for (i = 0, in_sg_left = 0; i < in_sg_cpy_count; i++) {
351 in_sg_left += in_sg_ptr[i].iov_len;
353 fuse_log(FUSE_LOG_DEBUG,
354 "%s: after skip skip_size=%zd in_sg_cpy_count=%d "
355 "in_sg_left=%zd\n",
356 __func__, skip_size, in_sg_cpy_count, in_sg_left);
357 ret = preadv(buf->buf[0].fd, in_sg_ptr, in_sg_cpy_count,
358 buf->buf[0].pos);
360 if (ret == -1) {
361 ret = errno;
362 fuse_log(FUSE_LOG_DEBUG, "%s: preadv failed (%m) len=%zd\n",
363 __func__, len);
364 free(in_sg_cpy);
365 goto err;
367 fuse_log(FUSE_LOG_DEBUG, "%s: preadv ret=%d len=%zd\n", __func__,
368 ret, len);
369 if (ret < len && ret) {
370 fuse_log(FUSE_LOG_DEBUG, "%s: ret < len\n", __func__);
371 /* Skip over this much next time around */
372 skip_size = ret;
373 buf->buf[0].pos += ret;
374 len -= ret;
376 /* Lets do another read */
377 continue;
379 if (!ret) {
380 /* EOF case? */
381 fuse_log(FUSE_LOG_DEBUG, "%s: !ret in_sg_left=%zd\n", __func__,
382 in_sg_left);
383 break;
385 if (ret != len) {
386 fuse_log(FUSE_LOG_DEBUG, "%s: ret!=len\n", __func__);
387 ret = EIO;
388 free(in_sg_cpy);
389 goto err;
391 in_sg_left -= ret;
392 len -= ret;
393 } while (in_sg_left);
394 free(in_sg_cpy);
396 /* Need to fix out->len on EOF */
397 if (len) {
398 struct fuse_out_header *out_sg = in_sg[0].iov_base;
400 tosend_len -= len;
401 out_sg->len = tosend_len;
404 ret = 0;
406 pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
407 pthread_mutex_lock(&qi->vq_lock);
408 vu_queue_push(dev, q, elem, tosend_len);
409 vu_queue_notify(dev, q);
410 pthread_mutex_unlock(&qi->vq_lock);
411 pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
413 err:
414 if (ret == 0) {
415 req->reply_sent = true;
418 return ret;
421 static __thread bool clone_fs_called;
423 /* Process one FVRequest in a thread pool */
424 static void fv_queue_worker(gpointer data, gpointer user_data)
426 struct fv_QueueInfo *qi = user_data;
427 struct fuse_session *se = qi->virtio_dev->se;
428 struct VuDev *dev = &qi->virtio_dev->dev;
429 FVRequest *req = data;
430 VuVirtqElement *elem = &req->elem;
431 struct fuse_buf fbuf = {};
432 bool allocated_bufv = false;
433 struct fuse_bufvec bufv;
434 struct fuse_bufvec *pbufv;
436 assert(se->bufsize > sizeof(struct fuse_in_header));
438 if (!clone_fs_called) {
439 int ret;
441 /* unshare FS for xattr operation */
442 ret = unshare(CLONE_FS);
443 /* should not fail */
444 assert(ret == 0);
446 clone_fs_called = true;
450 * An element contains one request and the space to send our response
451 * They're spread over multiple descriptors in a scatter/gather set
452 * and we can't trust the guest to keep them still; so copy in/out.
454 fbuf.mem = malloc(se->bufsize);
455 assert(fbuf.mem);
457 fuse_mutex_init(&req->ch.lock);
458 req->ch.fd = -1;
459 req->ch.qi = qi;
461 /* The 'out' part of the elem is from qemu */
462 unsigned int out_num = elem->out_num;
463 struct iovec *out_sg = elem->out_sg;
464 size_t out_len = iov_size(out_sg, out_num);
465 fuse_log(FUSE_LOG_DEBUG,
466 "%s: elem %d: with %d out desc of length %zd\n",
467 __func__, elem->index, out_num, out_len);
470 * The elem should contain a 'fuse_in_header' (in to fuse)
471 * plus the data based on the len in the header.
473 if (out_len < sizeof(struct fuse_in_header)) {
474 fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n",
475 __func__, elem->index);
476 assert(0); /* TODO */
478 if (out_len > se->bufsize) {
479 fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n", __func__,
480 elem->index);
481 assert(0); /* TODO */
483 /* Copy just the first element and look at it */
484 copy_from_iov(&fbuf, 1, out_sg);
486 pbufv = NULL; /* Compiler thinks an unitialised path */
487 if (out_num > 2 &&
488 out_sg[0].iov_len == sizeof(struct fuse_in_header) &&
489 ((struct fuse_in_header *)fbuf.mem)->opcode == FUSE_WRITE &&
490 out_sg[1].iov_len == sizeof(struct fuse_write_in)) {
492 * For a write we don't actually need to copy the
493 * data, we can just do it straight out of guest memory
494 * but we must still copy the headers in case the guest
495 * was nasty and changed them while we were using them.
497 fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__);
499 /* copy the fuse_write_in header afte rthe fuse_in_header */
500 fbuf.mem += out_sg->iov_len;
501 copy_from_iov(&fbuf, 1, out_sg + 1);
502 fbuf.mem -= out_sg->iov_len;
503 fbuf.size = out_sg[0].iov_len + out_sg[1].iov_len;
505 /* Allocate the bufv, with space for the rest of the iov */
506 pbufv = malloc(sizeof(struct fuse_bufvec) +
507 sizeof(struct fuse_buf) * (out_num - 2));
508 if (!pbufv) {
509 fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n",
510 __func__);
511 goto out;
514 allocated_bufv = true;
515 pbufv->count = 1;
516 pbufv->buf[0] = fbuf;
518 size_t iovindex, pbufvindex;
519 iovindex = 2; /* 2 headers, separate iovs */
520 pbufvindex = 1; /* 2 headers, 1 fusebuf */
522 for (; iovindex < out_num; iovindex++, pbufvindex++) {
523 pbufv->count++;
524 pbufv->buf[pbufvindex].pos = ~0; /* Dummy */
525 pbufv->buf[pbufvindex].flags = 0;
526 pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base;
527 pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len;
529 } else {
530 /* Normal (non fast write) path */
532 /* Copy the rest of the buffer */
533 fbuf.mem += out_sg->iov_len;
534 copy_from_iov(&fbuf, out_num - 1, out_sg + 1);
535 fbuf.mem -= out_sg->iov_len;
536 fbuf.size = out_len;
538 /* TODO! Endianness of header */
540 /* TODO: Add checks for fuse_session_exited */
541 bufv.buf[0] = fbuf;
542 bufv.count = 1;
543 pbufv = &bufv;
545 pbufv->idx = 0;
546 pbufv->off = 0;
547 fuse_session_process_buf_int(se, pbufv, &req->ch);
549 out:
550 if (allocated_bufv) {
551 free(pbufv);
554 /* If the request has no reply, still recycle the virtqueue element */
555 if (!req->reply_sent) {
556 struct VuVirtq *q = vu_get_queue(dev, qi->qidx);
558 fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", __func__,
559 elem->index);
561 pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
562 pthread_mutex_lock(&qi->vq_lock);
563 vu_queue_push(dev, q, elem, 0);
564 vu_queue_notify(dev, q);
565 pthread_mutex_unlock(&qi->vq_lock);
566 pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
569 pthread_mutex_destroy(&req->ch.lock);
570 free(fbuf.mem);
571 free(req);
574 /* Thread function for individual queues, created when a queue is 'started' */
575 static void *fv_queue_thread(void *opaque)
577 struct fv_QueueInfo *qi = opaque;
578 struct VuDev *dev = &qi->virtio_dev->dev;
579 struct VuVirtq *q = vu_get_queue(dev, qi->qidx);
580 struct fuse_session *se = qi->virtio_dev->se;
581 GThreadPool *pool = NULL;
582 GList *req_list = NULL;
584 if (se->thread_pool_size) {
585 fuse_log(FUSE_LOG_DEBUG, "%s: Creating thread pool for Queue %d\n",
586 __func__, qi->qidx);
587 pool = g_thread_pool_new(fv_queue_worker, qi, se->thread_pool_size,
588 FALSE, NULL);
589 if (!pool) {
590 fuse_log(FUSE_LOG_ERR, "%s: g_thread_pool_new failed\n", __func__);
591 return NULL;
595 fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__,
596 qi->qidx, qi->kick_fd);
597 while (1) {
598 struct pollfd pf[2];
599 int ret;
601 pf[0].fd = qi->kick_fd;
602 pf[0].events = POLLIN;
603 pf[0].revents = 0;
604 pf[1].fd = qi->kill_fd;
605 pf[1].events = POLLIN;
606 pf[1].revents = 0;
608 fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for Queue %d event\n", __func__,
609 qi->qidx);
610 int poll_res = ppoll(pf, 2, NULL, NULL);
612 if (poll_res == -1) {
613 if (errno == EINTR) {
614 fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
615 __func__);
616 continue;
618 fuse_log(FUSE_LOG_ERR, "fv_queue_thread ppoll: %m\n");
619 break;
621 assert(poll_res >= 1);
622 if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) {
623 fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x Queue %d\n",
624 __func__, pf[0].revents, qi->qidx);
625 break;
627 if (pf[1].revents & (POLLERR | POLLHUP | POLLNVAL)) {
628 fuse_log(FUSE_LOG_ERR,
629 "%s: Unexpected poll revents %x Queue %d killfd\n",
630 __func__, pf[1].revents, qi->qidx);
631 break;
633 if (pf[1].revents) {
634 fuse_log(FUSE_LOG_INFO, "%s: kill event on queue %d - quitting\n",
635 __func__, qi->qidx);
636 break;
638 assert(pf[0].revents & POLLIN);
639 fuse_log(FUSE_LOG_DEBUG, "%s: Got queue event on Queue %d\n", __func__,
640 qi->qidx);
642 eventfd_t evalue;
643 if (eventfd_read(qi->kick_fd, &evalue)) {
644 fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n");
645 break;
647 /* Mutual exclusion with virtio_loop() */
648 ret = pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
649 assert(ret == 0); /* there is no possible error case */
650 pthread_mutex_lock(&qi->vq_lock);
651 /* out is from guest, in is too guest */
652 unsigned int in_bytes, out_bytes;
653 vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0);
655 fuse_log(FUSE_LOG_DEBUG,
656 "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n",
657 __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes);
659 while (1) {
660 FVRequest *req = vu_queue_pop(dev, q, sizeof(FVRequest));
661 if (!req) {
662 break;
665 req->reply_sent = false;
667 if (!se->thread_pool_size) {
668 req_list = g_list_prepend(req_list, req);
669 } else {
670 g_thread_pool_push(pool, req, NULL);
674 pthread_mutex_unlock(&qi->vq_lock);
675 pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
677 /* Process all the requests. */
678 if (!se->thread_pool_size && req_list != NULL) {
679 g_list_foreach(req_list, fv_queue_worker, qi);
680 g_list_free(req_list);
681 req_list = NULL;
685 if (pool) {
686 g_thread_pool_free(pool, FALSE, TRUE);
689 return NULL;
692 static void fv_queue_cleanup_thread(struct fv_VuDev *vud, int qidx)
694 int ret;
695 struct fv_QueueInfo *ourqi;
697 assert(qidx < vud->nqueues);
698 ourqi = vud->qi[qidx];
700 /* Kill the thread */
701 if (eventfd_write(ourqi->kill_fd, 1)) {
702 fuse_log(FUSE_LOG_ERR, "Eventfd_write for queue %d: %s\n",
703 qidx, strerror(errno));
705 ret = pthread_join(ourqi->thread, NULL);
706 if (ret) {
707 fuse_log(FUSE_LOG_ERR, "%s: Failed to join thread idx %d err %d\n",
708 __func__, qidx, ret);
710 pthread_mutex_destroy(&ourqi->vq_lock);
711 close(ourqi->kill_fd);
712 ourqi->kick_fd = -1;
713 free(vud->qi[qidx]);
714 vud->qi[qidx] = NULL;
717 /* Callback from libvhost-user on start or stop of a queue */
718 static void fv_queue_set_started(VuDev *dev, int qidx, bool started)
720 struct fv_VuDev *vud = container_of(dev, struct fv_VuDev, dev);
721 struct fv_QueueInfo *ourqi;
723 fuse_log(FUSE_LOG_INFO, "%s: qidx=%d started=%d\n", __func__, qidx,
724 started);
725 assert(qidx >= 0);
728 * Ignore additional request queues for now. passthrough_ll.c must be
729 * audited for thread-safety issues first. It was written with a
730 * well-behaved client in mind and may not protect against all types of
731 * races yet.
733 if (qidx > 1) {
734 fuse_log(FUSE_LOG_ERR,
735 "%s: multiple request queues not yet implemented, please only "
736 "configure 1 request queue\n",
737 __func__);
738 exit(EXIT_FAILURE);
741 if (started) {
742 /* Fire up a thread to watch this queue */
743 if (qidx >= vud->nqueues) {
744 vud->qi = realloc(vud->qi, (qidx + 1) * sizeof(vud->qi[0]));
745 assert(vud->qi);
746 memset(vud->qi + vud->nqueues, 0,
747 sizeof(vud->qi[0]) * (1 + (qidx - vud->nqueues)));
748 vud->nqueues = qidx + 1;
750 if (!vud->qi[qidx]) {
751 vud->qi[qidx] = calloc(sizeof(struct fv_QueueInfo), 1);
752 assert(vud->qi[qidx]);
753 vud->qi[qidx]->virtio_dev = vud;
754 vud->qi[qidx]->qidx = qidx;
755 } else {
756 /* Shouldn't have been started */
757 assert(vud->qi[qidx]->kick_fd == -1);
759 ourqi = vud->qi[qidx];
760 ourqi->kick_fd = dev->vq[qidx].kick_fd;
762 ourqi->kill_fd = eventfd(0, EFD_CLOEXEC | EFD_SEMAPHORE);
763 assert(ourqi->kill_fd != -1);
764 pthread_mutex_init(&ourqi->vq_lock, NULL);
766 if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) {
767 fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n",
768 __func__, qidx);
769 assert(0);
771 } else {
772 fv_queue_cleanup_thread(vud, qidx);
776 static bool fv_queue_order(VuDev *dev, int qidx)
778 return false;
781 static const VuDevIface fv_iface = {
782 .get_features = fv_get_features,
783 .set_features = fv_set_features,
785 /* Don't need process message, we've not got any at vhost-user level */
786 .queue_set_started = fv_queue_set_started,
788 .queue_is_processed_in_order = fv_queue_order,
792 * Main loop; this mostly deals with events on the vhost-user
793 * socket itself, and not actual fuse data.
795 int virtio_loop(struct fuse_session *se)
797 fuse_log(FUSE_LOG_INFO, "%s: Entry\n", __func__);
799 while (!fuse_session_exited(se)) {
800 struct pollfd pf[1];
801 bool ok;
802 int ret;
803 pf[0].fd = se->vu_socketfd;
804 pf[0].events = POLLIN;
805 pf[0].revents = 0;
807 fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for VU event\n", __func__);
808 int poll_res = ppoll(pf, 1, NULL, NULL);
810 if (poll_res == -1) {
811 if (errno == EINTR) {
812 fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
813 __func__);
814 continue;
816 fuse_log(FUSE_LOG_ERR, "virtio_loop ppoll: %m\n");
817 break;
819 assert(poll_res == 1);
820 if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) {
821 fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x\n", __func__,
822 pf[0].revents);
823 break;
825 assert(pf[0].revents & POLLIN);
826 fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__);
827 /* Mutual exclusion with fv_queue_thread() */
828 ret = pthread_rwlock_wrlock(&se->virtio_dev->vu_dispatch_rwlock);
829 assert(ret == 0); /* there is no possible error case */
831 ok = vu_dispatch(&se->virtio_dev->dev);
833 pthread_rwlock_unlock(&se->virtio_dev->vu_dispatch_rwlock);
835 if (!ok) {
836 fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__);
837 break;
842 * Make sure all fv_queue_thread()s quit on exit, as we're about to
843 * free virtio dev and fuse session, no one should access them anymore.
845 for (int i = 0; i < se->virtio_dev->nqueues; i++) {
846 if (!se->virtio_dev->qi[i]) {
847 continue;
850 fuse_log(FUSE_LOG_INFO, "%s: Stopping queue %d thread\n", __func__, i);
851 fv_queue_cleanup_thread(se->virtio_dev, i);
854 fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__);
856 return 0;
/* Replace, in place, every occurrence of 'old' in string 's' with 'new'. */
static void strreplace(char *s, char old, char new)
{
    while (*s != '\0') {
        if (*s == old) {
            *s = new;
        }
        s++;
    }
}
868 static bool fv_socket_lock(struct fuse_session *se)
870 g_autofree gchar *sk_name = NULL;
871 g_autofree gchar *pidfile = NULL;
872 g_autofree gchar *dir = NULL;
873 Error *local_err = NULL;
875 dir = qemu_get_local_state_pathname("run/virtiofsd");
877 if (g_mkdir_with_parents(dir, S_IRWXU) < 0) {
878 fuse_log(FUSE_LOG_ERR, "%s: Failed to create directory %s: %s",
879 __func__, dir, strerror(errno));
880 return false;
883 sk_name = g_strdup(se->vu_socket_path);
884 strreplace(sk_name, '/', '.');
885 pidfile = g_strdup_printf("%s/%s.pid", dir, sk_name);
887 if (!qemu_write_pidfile(pidfile, &local_err)) {
888 error_report_err(local_err);
889 return false;
892 return true;
895 static int fv_create_listen_socket(struct fuse_session *se)
897 struct sockaddr_un un;
898 mode_t old_umask;
900 /* Nothing to do if fd is already initialized */
901 if (se->vu_listen_fd >= 0) {
902 return 0;
905 if (strlen(se->vu_socket_path) >= sizeof(un.sun_path)) {
906 fuse_log(FUSE_LOG_ERR, "Socket path too long\n");
907 return -1;
910 if (!strlen(se->vu_socket_path)) {
911 fuse_log(FUSE_LOG_ERR, "Socket path is empty\n");
912 return -1;
915 /* Check the vu_socket_path is already used */
916 if (!fv_socket_lock(se)) {
917 return -1;
921 * Create the Unix socket to communicate with qemu
922 * based on QEMU's vhost-user-bridge
924 unlink(se->vu_socket_path);
925 strcpy(un.sun_path, se->vu_socket_path);
926 size_t addr_len = sizeof(un);
928 int listen_sock = socket(AF_UNIX, SOCK_STREAM, 0);
929 if (listen_sock == -1) {
930 fuse_log(FUSE_LOG_ERR, "vhost socket creation: %m\n");
931 return -1;
933 un.sun_family = AF_UNIX;
936 * Unfortunately bind doesn't let you set the mask on the socket,
937 * so set umask appropriately and restore it later.
939 if (se->vu_socket_group) {
940 old_umask = umask(S_IROTH | S_IWOTH | S_IXOTH);
941 } else {
942 old_umask = umask(S_IRGRP | S_IWGRP | S_IXGRP |
943 S_IROTH | S_IWOTH | S_IXOTH);
945 if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) {
946 fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n");
947 close(listen_sock);
948 umask(old_umask);
949 return -1;
951 if (se->vu_socket_group) {
952 struct group *g = getgrnam(se->vu_socket_group);
953 if (g) {
954 if (!chown(se->vu_socket_path, -1, g->gr_gid)) {
955 fuse_log(FUSE_LOG_WARNING,
956 "vhost socket failed to set group to %s (%d)\n",
957 se->vu_socket_group, g->gr_gid);
961 umask(old_umask);
963 if (listen(listen_sock, 1) == -1) {
964 fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n");
965 close(listen_sock);
966 return -1;
969 se->vu_listen_fd = listen_sock;
970 return 0;
973 int virtio_session_mount(struct fuse_session *se)
975 int ret;
978 * Test that unshare(CLONE_FS) works. fv_queue_worker() will need it. It's
979 * an unprivileged system call but some Docker/Moby versions are known to
980 * reject it via seccomp when CAP_SYS_ADMIN is not given.
982 * Note that the program is single-threaded here so this syscall has no
983 * visible effect and is safe to make.
985 ret = unshare(CLONE_FS);
986 if (ret == -1 && errno == EPERM) {
987 fuse_log(FUSE_LOG_ERR, "unshare(CLONE_FS) failed with EPERM. If "
988 "running in a container please check that the container "
989 "runtime seccomp policy allows unshare.\n");
990 return -1;
993 ret = fv_create_listen_socket(se);
994 if (ret < 0) {
995 return ret;
998 se->fd = -1;
1000 fuse_log(FUSE_LOG_INFO, "%s: Waiting for vhost-user socket connection...\n",
1001 __func__);
1002 int data_sock = accept(se->vu_listen_fd, NULL, NULL);
1003 if (data_sock == -1) {
1004 fuse_log(FUSE_LOG_ERR, "vhost socket accept: %m\n");
1005 close(se->vu_listen_fd);
1006 return -1;
1008 close(se->vu_listen_fd);
1009 se->vu_listen_fd = -1;
1010 fuse_log(FUSE_LOG_INFO, "%s: Received vhost-user socket connection\n",
1011 __func__);
1013 /* TODO: Some cleanup/deallocation! */
1014 se->virtio_dev = calloc(sizeof(struct fv_VuDev), 1);
1015 if (!se->virtio_dev) {
1016 fuse_log(FUSE_LOG_ERR, "%s: virtio_dev calloc failed\n", __func__);
1017 close(data_sock);
1018 return -1;
1021 se->vu_socketfd = data_sock;
1022 se->virtio_dev->se = se;
1023 pthread_rwlock_init(&se->virtio_dev->vu_dispatch_rwlock, NULL);
1024 if (!vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, NULL,
1025 fv_set_watch, fv_remove_watch, &fv_iface)) {
1026 fuse_log(FUSE_LOG_ERR, "%s: vu_init failed\n", __func__);
1027 return -1;
1030 return 0;
1033 void virtio_session_close(struct fuse_session *se)
1035 close(se->vu_socketfd);
1037 if (!se->virtio_dev) {
1038 return;
1041 free(se->virtio_dev->qi);
1042 pthread_rwlock_destroy(&se->virtio_dev->vu_dispatch_rwlock);
1043 free(se->virtio_dev);
1044 se->virtio_dev = NULL;