tests/9pfs: Factor out do_version() helper
[qemu/ar7.git] / tools / virtiofsd / fuse_virtio.c
blob324936948d3daf79bd7fe42a09a82b9b3371e3a8
/*
 * virtio-fs glue for FUSE
 * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *   Dave Gilbert <dgilbert@redhat.com>
 *
 * Implements the glue between libfuse and libvhost-user
 *
 * This program can be distributed under the terms of the GNU LGPLv2.
 * See the file COPYING.LIB
 */
14 #include "qemu/osdep.h"
15 #include "qemu/iov.h"
16 #include "qapi/error.h"
17 #include "fuse_i.h"
18 #include "standard-headers/linux/fuse.h"
19 #include "fuse_misc.h"
20 #include "fuse_opt.h"
21 #include "fuse_virtio.h"
23 #include <assert.h>
24 #include <errno.h>
25 #include <glib.h>
26 #include <stdint.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/eventfd.h>
31 #include <sys/socket.h>
32 #include <sys/types.h>
33 #include <sys/un.h>
34 #include <sys/types.h>
35 #include <grp.h>
36 #include <unistd.h>
38 #include "contrib/libvhost-user/libvhost-user.h"
struct fv_VuDev;

/*
 * State kept for one started virtqueue.  An instance is handed to
 * fv_queue_thread() as its argument and to fv_queue_worker() as the
 * thread pool's user data.
 */
struct fv_QueueInfo {
    /* Thread created for this queue by fv_queue_set_started() */
    pthread_t thread;
    /*
     * Guards the VuVirtq so that fv_queue_thread() and
     * fv_queue_worker() cannot race on it.
     */
    pthread_mutex_t vq_lock;

    /* Device this queue belongs to */
    struct fv_VuDev *virtio_dev;

    /* Queue index; identical to this entry's position in the qi array */
    int qidx;
    /* Kick descriptor polled by the queue thread; set to -1 on cleanup */
    int kick_fd;
    /* Eventfd written to ask the queue thread to exit */
    int kill_fd;
};
57 /* A FUSE request */
58 typedef struct {
59 VuVirtqElement elem;
60 struct fuse_chan ch;
62 /* Used to complete requests that involve no reply */
63 bool reply_sent;
64 } FVRequest;
67 * We pass the dev element into libvhost-user
68 * and then use it to get back to the outer
69 * container for other data.
71 struct fv_VuDev {
72 VuDev dev;
73 struct fuse_session *se;
76 * Either handle virtqueues or vhost-user protocol messages. Don't do
77 * both at the same time since that could lead to race conditions if
78 * virtqueues or memory tables change while another thread is accessing
79 * them.
81 * The assumptions are:
82 * 1. fv_queue_thread() reads/writes to virtqueues and only reads VuDev.
83 * 2. virtio_loop() reads/writes virtqueues and VuDev.
85 pthread_rwlock_t vu_dispatch_rwlock;
88 * The following pair of fields are only accessed in the main
89 * virtio_loop
91 size_t nqueues;
92 struct fv_QueueInfo **qi;
/* Device configuration layout, taken from the spec */
struct virtio_fs_config {
    char tag[36];
    uint32_t num_queues;
};
101 /* Callback from libvhost-user */
102 static uint64_t fv_get_features(VuDev *dev)
104 return 1ULL << VIRTIO_F_VERSION_1;
107 /* Callback from libvhost-user */
108 static void fv_set_features(VuDev *dev, uint64_t features)
113 * Callback from libvhost-user if there's a new fd we're supposed to listen
114 * to, typically a queue kick?
116 static void fv_set_watch(VuDev *dev, int fd, int condition, vu_watch_cb cb,
117 void *data)
119 fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
123 * Callback from libvhost-user if we're no longer supposed to listen on an fd
125 static void fv_remove_watch(VuDev *dev, int fd)
127 fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
130 /* Callback from libvhost-user to panic */
131 static void fv_panic(VuDev *dev, const char *err)
133 fuse_log(FUSE_LOG_ERR, "%s: libvhost-user: %s\n", __func__, err);
134 /* TODO: Allow reconnects?? */
135 exit(EXIT_FAILURE);
139 * Copy from an iovec into a fuse_buf (memory only)
140 * Caller must ensure there is space
142 static void copy_from_iov(struct fuse_buf *buf, size_t out_num,
143 const struct iovec *out_sg)
145 void *dest = buf->mem;
147 while (out_num) {
148 size_t onelen = out_sg->iov_len;
149 memcpy(dest, out_sg->iov_base, onelen);
150 dest += onelen;
151 out_sg++;
152 out_num--;
/*
 * Copy exactly 'to_copy' bytes from the scatter list 'src_iov' into the
 * scatter list 'dst_iov'.  Source and destination elements need not line
 * up: one source element may be spread over several destination elements
 * and vice versa.
 *
 * The caller must have checked sizes; running out of elements on either
 * side trips an assert.
 */
static void copy_iov(struct iovec *src_iov, int src_count,
                     struct iovec *dst_iov, int dst_count, size_t to_copy)
{
    /* Bytes already written into the current destination element */
    size_t dst_used = 0;

    /* One pass per source element */
    for (; to_copy != 0; src_iov++, src_count--) {
        const char *src;
        size_t chunk;

        assert(src_count);
        src = src_iov->iov_base;
        /* Never take more from this element than is still wanted */
        chunk = src_iov->iov_len > to_copy ? to_copy : src_iov->iov_len;

        /* Drain this source element into as many dst elements as needed */
        while (chunk != 0) {
            size_t room;
            size_t n;

            assert(dst_count);
            room = dst_iov->iov_len - dst_used;
            n = room > chunk ? chunk : room;

            memcpy((char *)dst_iov->iov_base + dst_used, src, n);
            src += n;
            chunk -= n;
            to_copy -= n;
            dst_used += n;

            assert(dst_used <= dst_iov->iov_len);
            if (dst_used == dst_iov->iov_len) {
                /* Destination element is full, move on to the next */
                dst_used = 0;
                dst_iov++;
                dst_count--;
            }
        }
    }
}
201 * Called back by ll whenever it wants to send a reply/message back
202 * The 1st element of the iov starts with the fuse_out_header
203 * 'unique'==0 means it's a notify message.
205 int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch,
206 struct iovec *iov, int count)
208 FVRequest *req = container_of(ch, FVRequest, ch);
209 struct fv_QueueInfo *qi = ch->qi;
210 VuDev *dev = &se->virtio_dev->dev;
211 VuVirtq *q = vu_get_queue(dev, qi->qidx);
212 VuVirtqElement *elem = &req->elem;
213 int ret = 0;
215 assert(count >= 1);
216 assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
218 struct fuse_out_header *out = iov[0].iov_base;
219 /* TODO: Endianness! */
221 size_t tosend_len = iov_size(iov, count);
223 /* unique == 0 is notification, which we don't support */
224 assert(out->unique);
225 assert(!req->reply_sent);
227 /* The 'in' part of the elem is to qemu */
228 unsigned int in_num = elem->in_num;
229 struct iovec *in_sg = elem->in_sg;
230 size_t in_len = iov_size(in_sg, in_num);
231 fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
232 __func__, elem->index, in_num, in_len);
235 * The elem should have room for a 'fuse_out_header' (out from fuse)
236 * plus the data based on the len in the header.
238 if (in_len < sizeof(struct fuse_out_header)) {
239 fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
240 __func__, elem->index);
241 ret = -E2BIG;
242 goto err;
244 if (in_len < tosend_len) {
245 fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
246 __func__, elem->index, tosend_len);
247 ret = -E2BIG;
248 goto err;
251 copy_iov(iov, count, in_sg, in_num, tosend_len);
253 pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
254 pthread_mutex_lock(&qi->vq_lock);
255 vu_queue_push(dev, q, elem, tosend_len);
256 vu_queue_notify(dev, q);
257 pthread_mutex_unlock(&qi->vq_lock);
258 pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
260 req->reply_sent = true;
262 err:
263 return ret;
267 * Callback from fuse_send_data_iov_* when it's virtio and the buffer
268 * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK
269 * We need send the iov and then the buffer.
270 * Return 0 on success
272 int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
273 struct iovec *iov, int count, struct fuse_bufvec *buf,
274 size_t len)
276 FVRequest *req = container_of(ch, FVRequest, ch);
277 struct fv_QueueInfo *qi = ch->qi;
278 VuDev *dev = &se->virtio_dev->dev;
279 VuVirtq *q = vu_get_queue(dev, qi->qidx);
280 VuVirtqElement *elem = &req->elem;
281 int ret = 0;
283 assert(count >= 1);
284 assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
286 struct fuse_out_header *out = iov[0].iov_base;
287 /* TODO: Endianness! */
289 size_t iov_len = iov_size(iov, count);
290 size_t tosend_len = iov_len + len;
292 out->len = tosend_len;
294 fuse_log(FUSE_LOG_DEBUG, "%s: count=%d len=%zd iov_len=%zd\n", __func__,
295 count, len, iov_len);
297 /* unique == 0 is notification which we don't support */
298 assert(out->unique);
300 assert(!req->reply_sent);
302 /* The 'in' part of the elem is to qemu */
303 unsigned int in_num = elem->in_num;
304 struct iovec *in_sg = elem->in_sg;
305 size_t in_len = iov_size(in_sg, in_num);
306 fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
307 __func__, elem->index, in_num, in_len);
310 * The elem should have room for a 'fuse_out_header' (out from fuse)
311 * plus the data based on the len in the header.
313 if (in_len < sizeof(struct fuse_out_header)) {
314 fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
315 __func__, elem->index);
316 ret = E2BIG;
317 goto err;
319 if (in_len < tosend_len) {
320 fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
321 __func__, elem->index, tosend_len);
322 ret = E2BIG;
323 goto err;
326 /* TODO: Limit to 'len' */
328 /* First copy the header data from iov->in_sg */
329 copy_iov(iov, count, in_sg, in_num, iov_len);
332 * Build a copy of the the in_sg iov so we can skip bits in it,
333 * including changing the offsets
335 struct iovec *in_sg_cpy = calloc(sizeof(struct iovec), in_num);
336 assert(in_sg_cpy);
337 memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num);
338 /* These get updated as we skip */
339 struct iovec *in_sg_ptr = in_sg_cpy;
340 int in_sg_cpy_count = in_num;
342 /* skip over parts of in_sg that contained the header iov */
343 size_t skip_size = iov_len;
345 size_t in_sg_left = 0;
346 do {
347 while (skip_size != 0 && in_sg_cpy_count) {
348 if (skip_size >= in_sg_ptr[0].iov_len) {
349 skip_size -= in_sg_ptr[0].iov_len;
350 in_sg_ptr++;
351 in_sg_cpy_count--;
352 } else {
353 in_sg_ptr[0].iov_len -= skip_size;
354 in_sg_ptr[0].iov_base += skip_size;
355 break;
359 int i;
360 for (i = 0, in_sg_left = 0; i < in_sg_cpy_count; i++) {
361 in_sg_left += in_sg_ptr[i].iov_len;
363 fuse_log(FUSE_LOG_DEBUG,
364 "%s: after skip skip_size=%zd in_sg_cpy_count=%d "
365 "in_sg_left=%zd\n",
366 __func__, skip_size, in_sg_cpy_count, in_sg_left);
367 ret = preadv(buf->buf[0].fd, in_sg_ptr, in_sg_cpy_count,
368 buf->buf[0].pos);
370 if (ret == -1) {
371 ret = errno;
372 fuse_log(FUSE_LOG_DEBUG, "%s: preadv failed (%m) len=%zd\n",
373 __func__, len);
374 free(in_sg_cpy);
375 goto err;
377 fuse_log(FUSE_LOG_DEBUG, "%s: preadv ret=%d len=%zd\n", __func__,
378 ret, len);
379 if (ret < len && ret) {
380 fuse_log(FUSE_LOG_DEBUG, "%s: ret < len\n", __func__);
381 /* Skip over this much next time around */
382 skip_size = ret;
383 buf->buf[0].pos += ret;
384 len -= ret;
386 /* Lets do another read */
387 continue;
389 if (!ret) {
390 /* EOF case? */
391 fuse_log(FUSE_LOG_DEBUG, "%s: !ret in_sg_left=%zd\n", __func__,
392 in_sg_left);
393 break;
395 if (ret != len) {
396 fuse_log(FUSE_LOG_DEBUG, "%s: ret!=len\n", __func__);
397 ret = EIO;
398 free(in_sg_cpy);
399 goto err;
401 in_sg_left -= ret;
402 len -= ret;
403 } while (in_sg_left);
404 free(in_sg_cpy);
406 /* Need to fix out->len on EOF */
407 if (len) {
408 struct fuse_out_header *out_sg = in_sg[0].iov_base;
410 tosend_len -= len;
411 out_sg->len = tosend_len;
414 ret = 0;
416 pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
417 pthread_mutex_lock(&qi->vq_lock);
418 vu_queue_push(dev, q, elem, tosend_len);
419 vu_queue_notify(dev, q);
420 pthread_mutex_unlock(&qi->vq_lock);
421 pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
423 err:
424 if (ret == 0) {
425 req->reply_sent = true;
428 return ret;
431 static __thread bool clone_fs_called;
433 /* Process one FVRequest in a thread pool */
434 static void fv_queue_worker(gpointer data, gpointer user_data)
436 struct fv_QueueInfo *qi = user_data;
437 struct fuse_session *se = qi->virtio_dev->se;
438 struct VuDev *dev = &qi->virtio_dev->dev;
439 FVRequest *req = data;
440 VuVirtqElement *elem = &req->elem;
441 struct fuse_buf fbuf = {};
442 bool allocated_bufv = false;
443 struct fuse_bufvec bufv;
444 struct fuse_bufvec *pbufv;
446 assert(se->bufsize > sizeof(struct fuse_in_header));
448 if (!clone_fs_called) {
449 int ret;
451 /* unshare FS for xattr operation */
452 ret = unshare(CLONE_FS);
453 /* should not fail */
454 assert(ret == 0);
456 clone_fs_called = true;
460 * An element contains one request and the space to send our response
461 * They're spread over multiple descriptors in a scatter/gather set
462 * and we can't trust the guest to keep them still; so copy in/out.
464 fbuf.mem = malloc(se->bufsize);
465 assert(fbuf.mem);
467 fuse_mutex_init(&req->ch.lock);
468 req->ch.fd = -1;
469 req->ch.qi = qi;
471 /* The 'out' part of the elem is from qemu */
472 unsigned int out_num = elem->out_num;
473 struct iovec *out_sg = elem->out_sg;
474 size_t out_len = iov_size(out_sg, out_num);
475 fuse_log(FUSE_LOG_DEBUG,
476 "%s: elem %d: with %d out desc of length %zd\n",
477 __func__, elem->index, out_num, out_len);
480 * The elem should contain a 'fuse_in_header' (in to fuse)
481 * plus the data based on the len in the header.
483 if (out_len < sizeof(struct fuse_in_header)) {
484 fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n",
485 __func__, elem->index);
486 assert(0); /* TODO */
488 if (out_len > se->bufsize) {
489 fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n", __func__,
490 elem->index);
491 assert(0); /* TODO */
493 /* Copy just the first element and look at it */
494 copy_from_iov(&fbuf, 1, out_sg);
496 pbufv = NULL; /* Compiler thinks an unitialised path */
497 if (out_num > 2 &&
498 out_sg[0].iov_len == sizeof(struct fuse_in_header) &&
499 ((struct fuse_in_header *)fbuf.mem)->opcode == FUSE_WRITE &&
500 out_sg[1].iov_len == sizeof(struct fuse_write_in)) {
502 * For a write we don't actually need to copy the
503 * data, we can just do it straight out of guest memory
504 * but we must still copy the headers in case the guest
505 * was nasty and changed them while we were using them.
507 fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__);
509 /* copy the fuse_write_in header afte rthe fuse_in_header */
510 fbuf.mem += out_sg->iov_len;
511 copy_from_iov(&fbuf, 1, out_sg + 1);
512 fbuf.mem -= out_sg->iov_len;
513 fbuf.size = out_sg[0].iov_len + out_sg[1].iov_len;
515 /* Allocate the bufv, with space for the rest of the iov */
516 pbufv = malloc(sizeof(struct fuse_bufvec) +
517 sizeof(struct fuse_buf) * (out_num - 2));
518 if (!pbufv) {
519 fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n",
520 __func__);
521 goto out;
524 allocated_bufv = true;
525 pbufv->count = 1;
526 pbufv->buf[0] = fbuf;
528 size_t iovindex, pbufvindex;
529 iovindex = 2; /* 2 headers, separate iovs */
530 pbufvindex = 1; /* 2 headers, 1 fusebuf */
532 for (; iovindex < out_num; iovindex++, pbufvindex++) {
533 pbufv->count++;
534 pbufv->buf[pbufvindex].pos = ~0; /* Dummy */
535 pbufv->buf[pbufvindex].flags = 0;
536 pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base;
537 pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len;
539 } else {
540 /* Normal (non fast write) path */
542 /* Copy the rest of the buffer */
543 fbuf.mem += out_sg->iov_len;
544 copy_from_iov(&fbuf, out_num - 1, out_sg + 1);
545 fbuf.mem -= out_sg->iov_len;
546 fbuf.size = out_len;
548 /* TODO! Endianness of header */
550 /* TODO: Add checks for fuse_session_exited */
551 bufv.buf[0] = fbuf;
552 bufv.count = 1;
553 pbufv = &bufv;
555 pbufv->idx = 0;
556 pbufv->off = 0;
557 fuse_session_process_buf_int(se, pbufv, &req->ch);
559 out:
560 if (allocated_bufv) {
561 free(pbufv);
564 /* If the request has no reply, still recycle the virtqueue element */
565 if (!req->reply_sent) {
566 struct VuVirtq *q = vu_get_queue(dev, qi->qidx);
568 fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", __func__,
569 elem->index);
571 pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
572 pthread_mutex_lock(&qi->vq_lock);
573 vu_queue_push(dev, q, elem, 0);
574 vu_queue_notify(dev, q);
575 pthread_mutex_unlock(&qi->vq_lock);
576 pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
579 pthread_mutex_destroy(&req->ch.lock);
580 free(fbuf.mem);
581 free(req);
584 /* Thread function for individual queues, created when a queue is 'started' */
585 static void *fv_queue_thread(void *opaque)
587 struct fv_QueueInfo *qi = opaque;
588 struct VuDev *dev = &qi->virtio_dev->dev;
589 struct VuVirtq *q = vu_get_queue(dev, qi->qidx);
590 struct fuse_session *se = qi->virtio_dev->se;
591 GThreadPool *pool;
593 pool = g_thread_pool_new(fv_queue_worker, qi, se->thread_pool_size, FALSE,
594 NULL);
595 if (!pool) {
596 fuse_log(FUSE_LOG_ERR, "%s: g_thread_pool_new failed\n", __func__);
597 return NULL;
600 fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__,
601 qi->qidx, qi->kick_fd);
602 while (1) {
603 struct pollfd pf[2];
604 int ret;
606 pf[0].fd = qi->kick_fd;
607 pf[0].events = POLLIN;
608 pf[0].revents = 0;
609 pf[1].fd = qi->kill_fd;
610 pf[1].events = POLLIN;
611 pf[1].revents = 0;
613 fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for Queue %d event\n", __func__,
614 qi->qidx);
615 int poll_res = ppoll(pf, 2, NULL, NULL);
617 if (poll_res == -1) {
618 if (errno == EINTR) {
619 fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
620 __func__);
621 continue;
623 fuse_log(FUSE_LOG_ERR, "fv_queue_thread ppoll: %m\n");
624 break;
626 assert(poll_res >= 1);
627 if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) {
628 fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x Queue %d\n",
629 __func__, pf[0].revents, qi->qidx);
630 break;
632 if (pf[1].revents & (POLLERR | POLLHUP | POLLNVAL)) {
633 fuse_log(FUSE_LOG_ERR,
634 "%s: Unexpected poll revents %x Queue %d killfd\n",
635 __func__, pf[1].revents, qi->qidx);
636 break;
638 if (pf[1].revents) {
639 fuse_log(FUSE_LOG_INFO, "%s: kill event on queue %d - quitting\n",
640 __func__, qi->qidx);
641 break;
643 assert(pf[0].revents & POLLIN);
644 fuse_log(FUSE_LOG_DEBUG, "%s: Got queue event on Queue %d\n", __func__,
645 qi->qidx);
647 eventfd_t evalue;
648 if (eventfd_read(qi->kick_fd, &evalue)) {
649 fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n");
650 break;
652 /* Mutual exclusion with virtio_loop() */
653 ret = pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
654 assert(ret == 0); /* there is no possible error case */
655 pthread_mutex_lock(&qi->vq_lock);
656 /* out is from guest, in is too guest */
657 unsigned int in_bytes, out_bytes;
658 vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0);
660 fuse_log(FUSE_LOG_DEBUG,
661 "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n",
662 __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes);
664 while (1) {
665 FVRequest *req = vu_queue_pop(dev, q, sizeof(FVRequest));
666 if (!req) {
667 break;
670 req->reply_sent = false;
672 g_thread_pool_push(pool, req, NULL);
675 pthread_mutex_unlock(&qi->vq_lock);
676 pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
679 g_thread_pool_free(pool, FALSE, TRUE);
681 return NULL;
684 static void fv_queue_cleanup_thread(struct fv_VuDev *vud, int qidx)
686 int ret;
687 struct fv_QueueInfo *ourqi;
689 assert(qidx < vud->nqueues);
690 ourqi = vud->qi[qidx];
692 /* Kill the thread */
693 if (eventfd_write(ourqi->kill_fd, 1)) {
694 fuse_log(FUSE_LOG_ERR, "Eventfd_write for queue %d: %s\n",
695 qidx, strerror(errno));
697 ret = pthread_join(ourqi->thread, NULL);
698 if (ret) {
699 fuse_log(FUSE_LOG_ERR, "%s: Failed to join thread idx %d err %d\n",
700 __func__, qidx, ret);
702 pthread_mutex_destroy(&ourqi->vq_lock);
703 close(ourqi->kill_fd);
704 ourqi->kick_fd = -1;
705 free(vud->qi[qidx]);
706 vud->qi[qidx] = NULL;
709 /* Callback from libvhost-user on start or stop of a queue */
710 static void fv_queue_set_started(VuDev *dev, int qidx, bool started)
712 struct fv_VuDev *vud = container_of(dev, struct fv_VuDev, dev);
713 struct fv_QueueInfo *ourqi;
715 fuse_log(FUSE_LOG_INFO, "%s: qidx=%d started=%d\n", __func__, qidx,
716 started);
717 assert(qidx >= 0);
720 * Ignore additional request queues for now. passthrough_ll.c must be
721 * audited for thread-safety issues first. It was written with a
722 * well-behaved client in mind and may not protect against all types of
723 * races yet.
725 if (qidx > 1) {
726 fuse_log(FUSE_LOG_ERR,
727 "%s: multiple request queues not yet implemented, please only "
728 "configure 1 request queue\n",
729 __func__);
730 exit(EXIT_FAILURE);
733 if (started) {
734 /* Fire up a thread to watch this queue */
735 if (qidx >= vud->nqueues) {
736 vud->qi = realloc(vud->qi, (qidx + 1) * sizeof(vud->qi[0]));
737 assert(vud->qi);
738 memset(vud->qi + vud->nqueues, 0,
739 sizeof(vud->qi[0]) * (1 + (qidx - vud->nqueues)));
740 vud->nqueues = qidx + 1;
742 if (!vud->qi[qidx]) {
743 vud->qi[qidx] = calloc(sizeof(struct fv_QueueInfo), 1);
744 assert(vud->qi[qidx]);
745 vud->qi[qidx]->virtio_dev = vud;
746 vud->qi[qidx]->qidx = qidx;
747 } else {
748 /* Shouldn't have been started */
749 assert(vud->qi[qidx]->kick_fd == -1);
751 ourqi = vud->qi[qidx];
752 ourqi->kick_fd = dev->vq[qidx].kick_fd;
754 ourqi->kill_fd = eventfd(0, EFD_CLOEXEC | EFD_SEMAPHORE);
755 assert(ourqi->kill_fd != -1);
756 pthread_mutex_init(&ourqi->vq_lock, NULL);
758 if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) {
759 fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n",
760 __func__, qidx);
761 assert(0);
763 } else {
764 fv_queue_cleanup_thread(vud, qidx);
768 static bool fv_queue_order(VuDev *dev, int qidx)
770 return false;
773 static const VuDevIface fv_iface = {
774 .get_features = fv_get_features,
775 .set_features = fv_set_features,
777 /* Don't need process message, we've not got any at vhost-user level */
778 .queue_set_started = fv_queue_set_started,
780 .queue_is_processed_in_order = fv_queue_order,
784 * Main loop; this mostly deals with events on the vhost-user
785 * socket itself, and not actual fuse data.
787 int virtio_loop(struct fuse_session *se)
789 fuse_log(FUSE_LOG_INFO, "%s: Entry\n", __func__);
791 while (!fuse_session_exited(se)) {
792 struct pollfd pf[1];
793 bool ok;
794 int ret;
795 pf[0].fd = se->vu_socketfd;
796 pf[0].events = POLLIN;
797 pf[0].revents = 0;
799 fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for VU event\n", __func__);
800 int poll_res = ppoll(pf, 1, NULL, NULL);
802 if (poll_res == -1) {
803 if (errno == EINTR) {
804 fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
805 __func__);
806 continue;
808 fuse_log(FUSE_LOG_ERR, "virtio_loop ppoll: %m\n");
809 break;
811 assert(poll_res == 1);
812 if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) {
813 fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x\n", __func__,
814 pf[0].revents);
815 break;
817 assert(pf[0].revents & POLLIN);
818 fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__);
819 /* Mutual exclusion with fv_queue_thread() */
820 ret = pthread_rwlock_wrlock(&se->virtio_dev->vu_dispatch_rwlock);
821 assert(ret == 0); /* there is no possible error case */
823 ok = vu_dispatch(&se->virtio_dev->dev);
825 pthread_rwlock_unlock(&se->virtio_dev->vu_dispatch_rwlock);
827 if (!ok) {
828 fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__);
829 break;
834 * Make sure all fv_queue_thread()s quit on exit, as we're about to
835 * free virtio dev and fuse session, no one should access them anymore.
837 for (int i = 0; i < se->virtio_dev->nqueues; i++) {
838 if (!se->virtio_dev->qi[i]) {
839 continue;
842 fuse_log(FUSE_LOG_INFO, "%s: Stopping queue %d thread\n", __func__, i);
843 fv_queue_cleanup_thread(se->virtio_dev, i);
846 fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__);
848 return 0;
/* Replace every occurrence of 'old' in the NUL-terminated string 's' with 'new' */
static void strreplace(char *s, char old, char new)
{
    char *p = s;

    while (*p != '\0') {
        if (*p == old) {
            *p = new;
        }
        p++;
    }
}
860 static bool fv_socket_lock(struct fuse_session *se)
862 g_autofree gchar *sk_name = NULL;
863 g_autofree gchar *pidfile = NULL;
864 g_autofree gchar *dir = NULL;
865 Error *local_err = NULL;
867 dir = qemu_get_local_state_pathname("run/virtiofsd");
869 if (g_mkdir_with_parents(dir, S_IRWXU) < 0) {
870 fuse_log(FUSE_LOG_ERR, "%s: Failed to create directory %s: %s",
871 __func__, dir, strerror(errno));
872 return false;
875 sk_name = g_strdup(se->vu_socket_path);
876 strreplace(sk_name, '/', '.');
877 pidfile = g_strdup_printf("%s/%s.pid", dir, sk_name);
879 if (!qemu_write_pidfile(pidfile, &local_err)) {
880 error_report_err(local_err);
881 return false;
884 return true;
887 static int fv_create_listen_socket(struct fuse_session *se)
889 struct sockaddr_un un;
890 mode_t old_umask;
892 /* Nothing to do if fd is already initialized */
893 if (se->vu_listen_fd >= 0) {
894 return 0;
897 if (strlen(se->vu_socket_path) >= sizeof(un.sun_path)) {
898 fuse_log(FUSE_LOG_ERR, "Socket path too long\n");
899 return -1;
902 if (!strlen(se->vu_socket_path)) {
903 fuse_log(FUSE_LOG_ERR, "Socket path is empty\n");
904 return -1;
907 /* Check the vu_socket_path is already used */
908 if (!fv_socket_lock(se)) {
909 return -1;
913 * Create the Unix socket to communicate with qemu
914 * based on QEMU's vhost-user-bridge
916 unlink(se->vu_socket_path);
917 strcpy(un.sun_path, se->vu_socket_path);
918 size_t addr_len = sizeof(un);
920 int listen_sock = socket(AF_UNIX, SOCK_STREAM, 0);
921 if (listen_sock == -1) {
922 fuse_log(FUSE_LOG_ERR, "vhost socket creation: %m\n");
923 return -1;
925 un.sun_family = AF_UNIX;
928 * Unfortunately bind doesn't let you set the mask on the socket,
929 * so set umask appropriately and restore it later.
931 if (se->vu_socket_group) {
932 old_umask = umask(S_IROTH | S_IWOTH | S_IXOTH);
933 } else {
934 old_umask = umask(S_IRGRP | S_IWGRP | S_IXGRP |
935 S_IROTH | S_IWOTH | S_IXOTH);
937 if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) {
938 fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n");
939 close(listen_sock);
940 umask(old_umask);
941 return -1;
943 if (se->vu_socket_group) {
944 struct group *g = getgrnam(se->vu_socket_group);
945 if (g) {
946 if (!chown(se->vu_socket_path, -1, g->gr_gid)) {
947 fuse_log(FUSE_LOG_WARNING,
948 "vhost socket failed to set group to %s (%d)\n",
949 se->vu_socket_group, g->gr_gid);
953 umask(old_umask);
955 if (listen(listen_sock, 1) == -1) {
956 fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n");
957 close(listen_sock);
958 return -1;
961 se->vu_listen_fd = listen_sock;
962 return 0;
965 int virtio_session_mount(struct fuse_session *se)
967 int ret;
970 * Test that unshare(CLONE_FS) works. fv_queue_worker() will need it. It's
971 * an unprivileged system call but some Docker/Moby versions are known to
972 * reject it via seccomp when CAP_SYS_ADMIN is not given.
974 * Note that the program is single-threaded here so this syscall has no
975 * visible effect and is safe to make.
977 ret = unshare(CLONE_FS);
978 if (ret == -1 && errno == EPERM) {
979 fuse_log(FUSE_LOG_ERR, "unshare(CLONE_FS) failed with EPERM. If "
980 "running in a container please check that the container "
981 "runtime seccomp policy allows unshare.\n");
982 return -1;
985 ret = fv_create_listen_socket(se);
986 if (ret < 0) {
987 return ret;
990 se->fd = -1;
992 fuse_log(FUSE_LOG_INFO, "%s: Waiting for vhost-user socket connection...\n",
993 __func__);
994 int data_sock = accept(se->vu_listen_fd, NULL, NULL);
995 if (data_sock == -1) {
996 fuse_log(FUSE_LOG_ERR, "vhost socket accept: %m\n");
997 close(se->vu_listen_fd);
998 return -1;
1000 close(se->vu_listen_fd);
1001 se->vu_listen_fd = -1;
1002 fuse_log(FUSE_LOG_INFO, "%s: Received vhost-user socket connection\n",
1003 __func__);
1005 /* TODO: Some cleanup/deallocation! */
1006 se->virtio_dev = calloc(sizeof(struct fv_VuDev), 1);
1007 if (!se->virtio_dev) {
1008 fuse_log(FUSE_LOG_ERR, "%s: virtio_dev calloc failed\n", __func__);
1009 close(data_sock);
1010 return -1;
1013 se->vu_socketfd = data_sock;
1014 se->virtio_dev->se = se;
1015 pthread_rwlock_init(&se->virtio_dev->vu_dispatch_rwlock, NULL);
1016 vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, NULL,
1017 fv_set_watch, fv_remove_watch, &fv_iface);
1019 return 0;
1022 void virtio_session_close(struct fuse_session *se)
1024 close(se->vu_socketfd);
1026 if (!se->virtio_dev) {
1027 return;
1030 free(se->virtio_dev->qi);
1031 pthread_rwlock_destroy(&se->virtio_dev->vu_dispatch_rwlock);
1032 free(se->virtio_dev);
1033 se->virtio_dev = NULL;