virtiofsd: Fix check of chown()'s return value
[qemu/ar7.git] / tools / virtiofsd / fuse_virtio.c
blob9efdbd8ffd0a05600a451c2b82e0073b7e7406de
/*
 * virtio-fs glue for FUSE
 * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *   Dave Gilbert  <dgilbert@redhat.com>
 *
 * Implements the glue between libfuse and libvhost-user
 *
 * This program can be distributed under the terms of the GNU LGPLv2.
 * See the file COPYING.LIB
 */
14 #include "qemu/osdep.h"
15 #include "qemu/iov.h"
16 #include "qapi/error.h"
17 #include "fuse_i.h"
18 #include "standard-headers/linux/fuse.h"
19 #include "fuse_misc.h"
20 #include "fuse_opt.h"
21 #include "fuse_virtio.h"
23 #include <sys/eventfd.h>
24 #include <sys/socket.h>
25 #include <sys/un.h>
26 #include <grp.h>
28 #include "libvhost-user.h"
/* Defined further down; each queue keeps a back-pointer to its device. */
struct fv_VuDev;

/* Per-virtqueue state used by the thread that services the queue. */
struct fv_QueueInfo {
    /* Thread started for this queue by fv_queue_set_started() */
    pthread_t thread;

    /*
     * Guards the VuVirtq so that fv_queue_thread() and fv_queue_worker()
     * do not race on it.
     */
    pthread_mutex_t vq_lock;

    /* Device this queue belongs to */
    struct fv_VuDev *virtio_dev;

    /* Index of this queue; matches its slot in the device's qi array */
    int qidx;
    /* Polled by the queue thread for queue events */
    int kick_fd;
    /* eventfd written to make the queue thread quit */
    int kill_fd;
};
47 /* A FUSE request */
48 typedef struct {
49 VuVirtqElement elem;
50 struct fuse_chan ch;
52 /* Used to complete requests that involve no reply */
53 bool reply_sent;
54 } FVRequest;
57 * We pass the dev element into libvhost-user
58 * and then use it to get back to the outer
59 * container for other data.
61 struct fv_VuDev {
62 VuDev dev;
63 struct fuse_session *se;
66 * Either handle virtqueues or vhost-user protocol messages. Don't do
67 * both at the same time since that could lead to race conditions if
68 * virtqueues or memory tables change while another thread is accessing
69 * them.
71 * The assumptions are:
72 * 1. fv_queue_thread() reads/writes to virtqueues and only reads VuDev.
73 * 2. virtio_loop() reads/writes virtqueues and VuDev.
75 pthread_rwlock_t vu_dispatch_rwlock;
78 * The following pair of fields are only accessed in the main
79 * virtio_loop
81 size_t nqueues;
82 struct fv_QueueInfo **qi;
/* Device configuration layout, taken from the spec */
struct virtio_fs_config {
    char tag[36];
    uint32_t num_queues;
};
91 /* Callback from libvhost-user */
92 static uint64_t fv_get_features(VuDev *dev)
94 return 1ULL << VIRTIO_F_VERSION_1;
97 /* Callback from libvhost-user */
98 static void fv_set_features(VuDev *dev, uint64_t features)
103 * Callback from libvhost-user if there's a new fd we're supposed to listen
104 * to, typically a queue kick?
106 static void fv_set_watch(VuDev *dev, int fd, int condition, vu_watch_cb cb,
107 void *data)
109 fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
113 * Callback from libvhost-user if we're no longer supposed to listen on an fd
115 static void fv_remove_watch(VuDev *dev, int fd)
117 fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
120 /* Callback from libvhost-user to panic */
121 static void fv_panic(VuDev *dev, const char *err)
123 fuse_log(FUSE_LOG_ERR, "%s: libvhost-user: %s\n", __func__, err);
124 /* TODO: Allow reconnects?? */
125 exit(EXIT_FAILURE);
129 * Copy from an iovec into a fuse_buf (memory only)
130 * Caller must ensure there is space
132 static size_t copy_from_iov(struct fuse_buf *buf, size_t out_num,
133 const struct iovec *out_sg,
134 size_t max)
136 void *dest = buf->mem;
137 size_t copied = 0;
139 while (out_num && max) {
140 size_t onelen = out_sg->iov_len;
141 onelen = MIN(onelen, max);
142 memcpy(dest, out_sg->iov_base, onelen);
143 dest += onelen;
144 copied += onelen;
145 out_sg++;
146 out_num--;
147 max -= onelen;
150 return copied;
/*
 * Locate the position 'skip' bytes into an iovec array.
 *
 * On return '*sg_1stindex' is the index of the first entry to read data
 * from and '*sg_1stskip' is the number of bytes to skip inside that entry.
 * If the array holds exactly 'skip' bytes, the index is 'sg_size' and the
 * in-entry skip is 0.
 *
 * Returns true if there are at least 'skip' bytes in the iovec.
 */
static bool skip_iov(const struct iovec *sg, size_t sg_size,
                     size_t skip,
                     size_t *sg_1stindex, size_t *sg_1stskip)
{
    size_t idx = 0;

    /* Consume whole entries while they fit entirely inside 'skip' */
    while (idx < sg_size && sg[idx].iov_len <= skip) {
        skip -= sg[idx].iov_len;
        idx++;
    }

    *sg_1stindex = idx;

    if (idx < sg_size) {
        /* Target lies inside entry 'idx' */
        *sg_1stskip = skip;
        return true;
    }

    /* Ran off the end: only OK if nothing was left to skip */
    *sg_1stskip = 0;
    return skip == 0;
}
/*
 * Copy 'to_copy' bytes from one iovec array to another.
 *
 * Source and destination entries need not line up; one source entry may
 * be spread over several destination entries and vice versa.
 * The caller must have checked that both sides are large enough.
 */
static void copy_iov(struct iovec *src_iov, int src_count,
                     struct iovec *dst_iov, int dst_count, size_t to_copy)
{
    /* How much of the current destination entry is already filled */
    size_t dst_used = 0;

    /* Each pass of the outer loop drains (part of) one source entry */
    for (; to_copy; src_iov++, src_count--) {
        assert(src_count);
        const char *from = src_iov->iov_base;
        size_t pending = src_iov->iov_len;

        if (pending > to_copy) {
            pending = to_copy;
        }

        /* Spread this source entry over as many dst entries as needed */
        while (pending) {
            assert(dst_count);
            size_t room = dst_iov->iov_len - dst_used;
            size_t chunk = room > pending ? pending : room;

            memcpy((char *)dst_iov->iov_base + dst_used, from, chunk);
            from += chunk;
            pending -= chunk;
            to_copy -= chunk;
            dst_used += chunk;

            assert(dst_used <= dst_iov->iov_len);
            if (dst_used == dst_iov->iov_len) {
                /* Destination entry is full, move on to the next */
                dst_used = 0;
                dst_iov++;
                dst_count--;
            }
        }
    }
}
228 * pthread_rwlock_rdlock() and pthread_rwlock_wrlock can fail if
229 * a deadlock condition is detected or the current thread already
230 * owns the lock. They can also fail, like pthread_rwlock_unlock(),
231 * if the mutex wasn't properly initialized. None of these are ever
232 * expected to happen.
234 static void vu_dispatch_rdlock(struct fv_VuDev *vud)
236 int ret = pthread_rwlock_rdlock(&vud->vu_dispatch_rwlock);
237 assert(ret == 0);
240 static void vu_dispatch_wrlock(struct fv_VuDev *vud)
242 int ret = pthread_rwlock_wrlock(&vud->vu_dispatch_rwlock);
243 assert(ret == 0);
246 static void vu_dispatch_unlock(struct fv_VuDev *vud)
248 int ret = pthread_rwlock_unlock(&vud->vu_dispatch_rwlock);
249 assert(ret == 0);
253 * Called back by ll whenever it wants to send a reply/message back
254 * The 1st element of the iov starts with the fuse_out_header
255 * 'unique'==0 means it's a notify message.
257 int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch,
258 struct iovec *iov, int count)
260 FVRequest *req = container_of(ch, FVRequest, ch);
261 struct fv_QueueInfo *qi = ch->qi;
262 VuDev *dev = &se->virtio_dev->dev;
263 VuVirtq *q = vu_get_queue(dev, qi->qidx);
264 VuVirtqElement *elem = &req->elem;
265 int ret = 0;
267 assert(count >= 1);
268 assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
270 struct fuse_out_header *out = iov[0].iov_base;
271 /* TODO: Endianness! */
273 size_t tosend_len = iov_size(iov, count);
275 /* unique == 0 is notification, which we don't support */
276 assert(out->unique);
277 assert(!req->reply_sent);
279 /* The 'in' part of the elem is to qemu */
280 unsigned int in_num = elem->in_num;
281 struct iovec *in_sg = elem->in_sg;
282 size_t in_len = iov_size(in_sg, in_num);
283 fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
284 __func__, elem->index, in_num, in_len);
287 * The elem should have room for a 'fuse_out_header' (out from fuse)
288 * plus the data based on the len in the header.
290 if (in_len < sizeof(struct fuse_out_header)) {
291 fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
292 __func__, elem->index);
293 ret = -E2BIG;
294 goto err;
296 if (in_len < tosend_len) {
297 fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
298 __func__, elem->index, tosend_len);
299 ret = -E2BIG;
300 goto err;
303 copy_iov(iov, count, in_sg, in_num, tosend_len);
305 vu_dispatch_rdlock(qi->virtio_dev);
306 pthread_mutex_lock(&qi->vq_lock);
307 vu_queue_push(dev, q, elem, tosend_len);
308 vu_queue_notify(dev, q);
309 pthread_mutex_unlock(&qi->vq_lock);
310 vu_dispatch_unlock(qi->virtio_dev);
312 req->reply_sent = true;
314 err:
315 return ret;
319 * Callback from fuse_send_data_iov_* when it's virtio and the buffer
320 * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK
321 * We need send the iov and then the buffer.
322 * Return 0 on success
324 int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
325 struct iovec *iov, int count, struct fuse_bufvec *buf,
326 size_t len)
328 FVRequest *req = container_of(ch, FVRequest, ch);
329 struct fv_QueueInfo *qi = ch->qi;
330 VuDev *dev = &se->virtio_dev->dev;
331 VuVirtq *q = vu_get_queue(dev, qi->qidx);
332 VuVirtqElement *elem = &req->elem;
333 int ret = 0;
334 g_autofree struct iovec *in_sg_cpy = NULL;
336 assert(count >= 1);
337 assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
339 struct fuse_out_header *out = iov[0].iov_base;
340 /* TODO: Endianness! */
342 size_t iov_len = iov_size(iov, count);
343 size_t tosend_len = iov_len + len;
345 out->len = tosend_len;
347 fuse_log(FUSE_LOG_DEBUG, "%s: count=%d len=%zd iov_len=%zd\n", __func__,
348 count, len, iov_len);
350 /* unique == 0 is notification which we don't support */
351 assert(out->unique);
353 assert(!req->reply_sent);
355 /* The 'in' part of the elem is to qemu */
356 unsigned int in_num = elem->in_num;
357 struct iovec *in_sg = elem->in_sg;
358 size_t in_len = iov_size(in_sg, in_num);
359 fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
360 __func__, elem->index, in_num, in_len);
363 * The elem should have room for a 'fuse_out_header' (out from fuse)
364 * plus the data based on the len in the header.
366 if (in_len < sizeof(struct fuse_out_header)) {
367 fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
368 __func__, elem->index);
369 ret = E2BIG;
370 goto err;
372 if (in_len < tosend_len) {
373 fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
374 __func__, elem->index, tosend_len);
375 ret = E2BIG;
376 goto err;
379 /* TODO: Limit to 'len' */
381 /* First copy the header data from iov->in_sg */
382 copy_iov(iov, count, in_sg, in_num, iov_len);
385 * Build a copy of the the in_sg iov so we can skip bits in it,
386 * including changing the offsets
388 in_sg_cpy = g_new(struct iovec, in_num);
389 memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num);
390 /* These get updated as we skip */
391 struct iovec *in_sg_ptr = in_sg_cpy;
392 int in_sg_cpy_count = in_num;
394 /* skip over parts of in_sg that contained the header iov */
395 size_t skip_size = iov_len;
397 size_t in_sg_left = 0;
398 do {
399 while (skip_size != 0 && in_sg_cpy_count) {
400 if (skip_size >= in_sg_ptr[0].iov_len) {
401 skip_size -= in_sg_ptr[0].iov_len;
402 in_sg_ptr++;
403 in_sg_cpy_count--;
404 } else {
405 in_sg_ptr[0].iov_len -= skip_size;
406 in_sg_ptr[0].iov_base += skip_size;
407 break;
411 int i;
412 for (i = 0, in_sg_left = 0; i < in_sg_cpy_count; i++) {
413 in_sg_left += in_sg_ptr[i].iov_len;
415 fuse_log(FUSE_LOG_DEBUG,
416 "%s: after skip skip_size=%zd in_sg_cpy_count=%d "
417 "in_sg_left=%zd\n",
418 __func__, skip_size, in_sg_cpy_count, in_sg_left);
419 ret = preadv(buf->buf[0].fd, in_sg_ptr, in_sg_cpy_count,
420 buf->buf[0].pos);
422 if (ret == -1) {
423 ret = errno;
424 fuse_log(FUSE_LOG_DEBUG, "%s: preadv failed (%m) len=%zd\n",
425 __func__, len);
426 goto err;
428 fuse_log(FUSE_LOG_DEBUG, "%s: preadv ret=%d len=%zd\n", __func__,
429 ret, len);
430 if (ret < len && ret) {
431 fuse_log(FUSE_LOG_DEBUG, "%s: ret < len\n", __func__);
432 /* Skip over this much next time around */
433 skip_size = ret;
434 buf->buf[0].pos += ret;
435 len -= ret;
437 /* Lets do another read */
438 continue;
440 if (!ret) {
441 /* EOF case? */
442 fuse_log(FUSE_LOG_DEBUG, "%s: !ret in_sg_left=%zd\n", __func__,
443 in_sg_left);
444 break;
446 if (ret != len) {
447 fuse_log(FUSE_LOG_DEBUG, "%s: ret!=len\n", __func__);
448 ret = EIO;
449 goto err;
451 in_sg_left -= ret;
452 len -= ret;
453 } while (in_sg_left);
455 /* Need to fix out->len on EOF */
456 if (len) {
457 struct fuse_out_header *out_sg = in_sg[0].iov_base;
459 tosend_len -= len;
460 out_sg->len = tosend_len;
463 ret = 0;
465 vu_dispatch_rdlock(qi->virtio_dev);
466 pthread_mutex_lock(&qi->vq_lock);
467 vu_queue_push(dev, q, elem, tosend_len);
468 vu_queue_notify(dev, q);
469 pthread_mutex_unlock(&qi->vq_lock);
470 vu_dispatch_unlock(qi->virtio_dev);
472 err:
473 if (ret == 0) {
474 req->reply_sent = true;
477 return ret;
480 static __thread bool clone_fs_called;
482 /* Process one FVRequest in a thread pool */
483 static void fv_queue_worker(gpointer data, gpointer user_data)
485 struct fv_QueueInfo *qi = user_data;
486 struct fuse_session *se = qi->virtio_dev->se;
487 struct VuDev *dev = &qi->virtio_dev->dev;
488 FVRequest *req = data;
489 VuVirtqElement *elem = &req->elem;
490 struct fuse_buf fbuf = {};
491 bool allocated_bufv = false;
492 struct fuse_bufvec bufv;
493 struct fuse_bufvec *pbufv;
494 struct fuse_in_header inh;
496 assert(se->bufsize > sizeof(struct fuse_in_header));
498 if (!clone_fs_called) {
499 int ret;
501 /* unshare FS for xattr operation */
502 ret = unshare(CLONE_FS);
503 /* should not fail */
504 assert(ret == 0);
506 clone_fs_called = true;
510 * An element contains one request and the space to send our response
511 * They're spread over multiple descriptors in a scatter/gather set
512 * and we can't trust the guest to keep them still; so copy in/out.
514 fbuf.mem = g_malloc(se->bufsize);
516 fuse_mutex_init(&req->ch.lock);
517 req->ch.fd = -1;
518 req->ch.qi = qi;
520 /* The 'out' part of the elem is from qemu */
521 unsigned int out_num = elem->out_num;
522 struct iovec *out_sg = elem->out_sg;
523 size_t out_len = iov_size(out_sg, out_num);
524 fuse_log(FUSE_LOG_DEBUG,
525 "%s: elem %d: with %d out desc of length %zd\n",
526 __func__, elem->index, out_num, out_len);
529 * The elem should contain a 'fuse_in_header' (in to fuse)
530 * plus the data based on the len in the header.
532 if (out_len < sizeof(struct fuse_in_header)) {
533 fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n",
534 __func__, elem->index);
535 assert(0); /* TODO */
537 if (out_len > se->bufsize) {
538 fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n", __func__,
539 elem->index);
540 assert(0); /* TODO */
542 /* Copy just the fuse_in_header and look at it */
543 copy_from_iov(&fbuf, out_num, out_sg,
544 sizeof(struct fuse_in_header));
545 memcpy(&inh, fbuf.mem, sizeof(struct fuse_in_header));
547 pbufv = NULL; /* Compiler thinks an unitialised path */
548 if (inh.opcode == FUSE_WRITE &&
549 out_len >= (sizeof(struct fuse_in_header) +
550 sizeof(struct fuse_write_in))) {
552 * For a write we don't actually need to copy the
553 * data, we can just do it straight out of guest memory
554 * but we must still copy the headers in case the guest
555 * was nasty and changed them while we were using them.
557 fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__);
559 fbuf.size = copy_from_iov(&fbuf, out_num, out_sg,
560 sizeof(struct fuse_in_header) +
561 sizeof(struct fuse_write_in));
562 /* That copy reread the in_header, make sure we use the original */
563 memcpy(fbuf.mem, &inh, sizeof(struct fuse_in_header));
565 /* Allocate the bufv, with space for the rest of the iov */
566 pbufv = g_try_malloc(sizeof(struct fuse_bufvec) +
567 sizeof(struct fuse_buf) * out_num);
568 if (!pbufv) {
569 fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n",
570 __func__);
571 goto out;
574 allocated_bufv = true;
575 pbufv->count = 1;
576 pbufv->buf[0] = fbuf;
578 size_t iovindex, pbufvindex, iov_bytes_skip;
579 pbufvindex = 1; /* 2 headers, 1 fusebuf */
581 if (!skip_iov(out_sg, out_num,
582 sizeof(struct fuse_in_header) +
583 sizeof(struct fuse_write_in),
584 &iovindex, &iov_bytes_skip)) {
585 fuse_log(FUSE_LOG_ERR, "%s: skip failed\n",
586 __func__);
587 goto out;
590 for (; iovindex < out_num; iovindex++, pbufvindex++) {
591 pbufv->count++;
592 pbufv->buf[pbufvindex].pos = ~0; /* Dummy */
593 pbufv->buf[pbufvindex].flags = 0;
594 pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base;
595 pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len;
597 if (iov_bytes_skip) {
598 pbufv->buf[pbufvindex].mem += iov_bytes_skip;
599 pbufv->buf[pbufvindex].size -= iov_bytes_skip;
600 iov_bytes_skip = 0;
603 } else {
604 /* Normal (non fast write) path */
606 copy_from_iov(&fbuf, out_num, out_sg, se->bufsize);
607 /* That copy reread the in_header, make sure we use the original */
608 memcpy(fbuf.mem, &inh, sizeof(struct fuse_in_header));
609 fbuf.size = out_len;
611 /* TODO! Endianness of header */
613 /* TODO: Add checks for fuse_session_exited */
614 bufv.buf[0] = fbuf;
615 bufv.count = 1;
616 pbufv = &bufv;
618 pbufv->idx = 0;
619 pbufv->off = 0;
620 fuse_session_process_buf_int(se, pbufv, &req->ch);
622 out:
623 if (allocated_bufv) {
624 g_free(pbufv);
627 /* If the request has no reply, still recycle the virtqueue element */
628 if (!req->reply_sent) {
629 struct VuVirtq *q = vu_get_queue(dev, qi->qidx);
631 fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", __func__,
632 elem->index);
634 vu_dispatch_rdlock(qi->virtio_dev);
635 pthread_mutex_lock(&qi->vq_lock);
636 vu_queue_push(dev, q, elem, 0);
637 vu_queue_notify(dev, q);
638 pthread_mutex_unlock(&qi->vq_lock);
639 vu_dispatch_unlock(qi->virtio_dev);
642 pthread_mutex_destroy(&req->ch.lock);
643 g_free(fbuf.mem);
644 free(req);
647 /* Thread function for individual queues, created when a queue is 'started' */
648 static void *fv_queue_thread(void *opaque)
650 struct fv_QueueInfo *qi = opaque;
651 struct VuDev *dev = &qi->virtio_dev->dev;
652 struct VuVirtq *q = vu_get_queue(dev, qi->qidx);
653 struct fuse_session *se = qi->virtio_dev->se;
654 GThreadPool *pool = NULL;
655 GList *req_list = NULL;
657 if (se->thread_pool_size) {
658 fuse_log(FUSE_LOG_DEBUG, "%s: Creating thread pool for Queue %d\n",
659 __func__, qi->qidx);
660 pool = g_thread_pool_new(fv_queue_worker, qi, se->thread_pool_size,
661 FALSE, NULL);
662 if (!pool) {
663 fuse_log(FUSE_LOG_ERR, "%s: g_thread_pool_new failed\n", __func__);
664 return NULL;
668 fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__,
669 qi->qidx, qi->kick_fd);
670 while (1) {
671 struct pollfd pf[2];
673 pf[0].fd = qi->kick_fd;
674 pf[0].events = POLLIN;
675 pf[0].revents = 0;
676 pf[1].fd = qi->kill_fd;
677 pf[1].events = POLLIN;
678 pf[1].revents = 0;
680 fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for Queue %d event\n", __func__,
681 qi->qidx);
682 int poll_res = ppoll(pf, 2, NULL, NULL);
684 if (poll_res == -1) {
685 if (errno == EINTR) {
686 fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
687 __func__);
688 continue;
690 fuse_log(FUSE_LOG_ERR, "fv_queue_thread ppoll: %m\n");
691 break;
693 assert(poll_res >= 1);
694 if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) {
695 fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x Queue %d\n",
696 __func__, pf[0].revents, qi->qidx);
697 break;
699 if (pf[1].revents & (POLLERR | POLLHUP | POLLNVAL)) {
700 fuse_log(FUSE_LOG_ERR,
701 "%s: Unexpected poll revents %x Queue %d killfd\n",
702 __func__, pf[1].revents, qi->qidx);
703 break;
705 if (pf[1].revents) {
706 fuse_log(FUSE_LOG_INFO, "%s: kill event on queue %d - quitting\n",
707 __func__, qi->qidx);
708 break;
710 assert(pf[0].revents & POLLIN);
711 fuse_log(FUSE_LOG_DEBUG, "%s: Got queue event on Queue %d\n", __func__,
712 qi->qidx);
714 eventfd_t evalue;
715 if (eventfd_read(qi->kick_fd, &evalue)) {
716 fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n");
717 break;
719 /* Mutual exclusion with virtio_loop() */
720 vu_dispatch_rdlock(qi->virtio_dev);
721 pthread_mutex_lock(&qi->vq_lock);
722 /* out is from guest, in is too guest */
723 unsigned int in_bytes, out_bytes;
724 vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0);
726 fuse_log(FUSE_LOG_DEBUG,
727 "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n",
728 __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes);
730 while (1) {
731 FVRequest *req = vu_queue_pop(dev, q, sizeof(FVRequest));
732 if (!req) {
733 break;
736 req->reply_sent = false;
738 if (!se->thread_pool_size) {
739 req_list = g_list_prepend(req_list, req);
740 } else {
741 g_thread_pool_push(pool, req, NULL);
745 pthread_mutex_unlock(&qi->vq_lock);
746 vu_dispatch_unlock(qi->virtio_dev);
748 /* Process all the requests. */
749 if (!se->thread_pool_size && req_list != NULL) {
750 g_list_foreach(req_list, fv_queue_worker, qi);
751 g_list_free(req_list);
752 req_list = NULL;
756 if (pool) {
757 g_thread_pool_free(pool, FALSE, TRUE);
760 return NULL;
763 static void fv_queue_cleanup_thread(struct fv_VuDev *vud, int qidx)
765 int ret;
766 struct fv_QueueInfo *ourqi;
768 assert(qidx < vud->nqueues);
769 ourqi = vud->qi[qidx];
771 /* Kill the thread */
772 if (eventfd_write(ourqi->kill_fd, 1)) {
773 fuse_log(FUSE_LOG_ERR, "Eventfd_write for queue %d: %s\n",
774 qidx, strerror(errno));
776 ret = pthread_join(ourqi->thread, NULL);
777 if (ret) {
778 fuse_log(FUSE_LOG_ERR, "%s: Failed to join thread idx %d err %d\n",
779 __func__, qidx, ret);
781 pthread_mutex_destroy(&ourqi->vq_lock);
782 close(ourqi->kill_fd);
783 ourqi->kick_fd = -1;
784 g_free(vud->qi[qidx]);
785 vud->qi[qidx] = NULL;
788 /* Callback from libvhost-user on start or stop of a queue */
789 static void fv_queue_set_started(VuDev *dev, int qidx, bool started)
791 struct fv_VuDev *vud = container_of(dev, struct fv_VuDev, dev);
792 struct fv_QueueInfo *ourqi;
794 fuse_log(FUSE_LOG_INFO, "%s: qidx=%d started=%d\n", __func__, qidx,
795 started);
796 assert(qidx >= 0);
799 * Ignore additional request queues for now. passthrough_ll.c must be
800 * audited for thread-safety issues first. It was written with a
801 * well-behaved client in mind and may not protect against all types of
802 * races yet.
804 if (qidx > 1) {
805 fuse_log(FUSE_LOG_ERR,
806 "%s: multiple request queues not yet implemented, please only "
807 "configure 1 request queue\n",
808 __func__);
809 exit(EXIT_FAILURE);
812 if (started) {
813 /* Fire up a thread to watch this queue */
814 if (qidx >= vud->nqueues) {
815 vud->qi = g_realloc_n(vud->qi, qidx + 1, sizeof(vud->qi[0]));
816 memset(vud->qi + vud->nqueues, 0,
817 sizeof(vud->qi[0]) * (1 + (qidx - vud->nqueues)));
818 vud->nqueues = qidx + 1;
820 if (!vud->qi[qidx]) {
821 vud->qi[qidx] = g_new0(struct fv_QueueInfo, 1);
822 vud->qi[qidx]->virtio_dev = vud;
823 vud->qi[qidx]->qidx = qidx;
824 } else {
825 /* Shouldn't have been started */
826 assert(vud->qi[qidx]->kick_fd == -1);
828 ourqi = vud->qi[qidx];
829 ourqi->kick_fd = dev->vq[qidx].kick_fd;
831 ourqi->kill_fd = eventfd(0, EFD_CLOEXEC | EFD_SEMAPHORE);
832 assert(ourqi->kill_fd != -1);
833 pthread_mutex_init(&ourqi->vq_lock, NULL);
835 if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) {
836 fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n",
837 __func__, qidx);
838 assert(0);
840 } else {
842 * Temporarily drop write-lock taken in virtio_loop() so that
843 * the queue thread doesn't block in virtio_send_msg().
845 vu_dispatch_unlock(vud);
846 fv_queue_cleanup_thread(vud, qidx);
847 vu_dispatch_wrlock(vud);
851 static bool fv_queue_order(VuDev *dev, int qidx)
853 return false;
856 static const VuDevIface fv_iface = {
857 .get_features = fv_get_features,
858 .set_features = fv_set_features,
860 /* Don't need process message, we've not got any at vhost-user level */
861 .queue_set_started = fv_queue_set_started,
863 .queue_is_processed_in_order = fv_queue_order,
867 * Main loop; this mostly deals with events on the vhost-user
868 * socket itself, and not actual fuse data.
870 int virtio_loop(struct fuse_session *se)
872 fuse_log(FUSE_LOG_INFO, "%s: Entry\n", __func__);
874 while (!fuse_session_exited(se)) {
875 struct pollfd pf[1];
876 bool ok;
877 pf[0].fd = se->vu_socketfd;
878 pf[0].events = POLLIN;
879 pf[0].revents = 0;
881 fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for VU event\n", __func__);
882 int poll_res = ppoll(pf, 1, NULL, NULL);
884 if (poll_res == -1) {
885 if (errno == EINTR) {
886 fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
887 __func__);
888 continue;
890 fuse_log(FUSE_LOG_ERR, "virtio_loop ppoll: %m\n");
891 break;
893 assert(poll_res == 1);
894 if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) {
895 fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x\n", __func__,
896 pf[0].revents);
897 break;
899 assert(pf[0].revents & POLLIN);
900 fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__);
901 /* Mutual exclusion with fv_queue_thread() */
902 vu_dispatch_wrlock(se->virtio_dev);
904 ok = vu_dispatch(&se->virtio_dev->dev);
906 vu_dispatch_unlock(se->virtio_dev);
908 if (!ok) {
909 fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__);
910 break;
915 * Make sure all fv_queue_thread()s quit on exit, as we're about to
916 * free virtio dev and fuse session, no one should access them anymore.
918 for (int i = 0; i < se->virtio_dev->nqueues; i++) {
919 if (!se->virtio_dev->qi[i]) {
920 continue;
923 fuse_log(FUSE_LOG_INFO, "%s: Stopping queue %d thread\n", __func__, i);
924 fv_queue_cleanup_thread(se->virtio_dev, i);
927 fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__);
929 return 0;
/* Replace, in place, every occurrence of 'old' in string 's' with 'new'. */
static void strreplace(char *s, char old, char new)
{
    char *p = s;

    while (*p != '\0') {
        if (*p == old) {
            *p = new;
        }
        p++;
    }
}
941 static bool fv_socket_lock(struct fuse_session *se)
943 g_autofree gchar *sk_name = NULL;
944 g_autofree gchar *pidfile = NULL;
945 g_autofree gchar *dir = NULL;
946 Error *local_err = NULL;
948 dir = qemu_get_local_state_pathname("run/virtiofsd");
950 if (g_mkdir_with_parents(dir, S_IRWXU) < 0) {
951 fuse_log(FUSE_LOG_ERR, "%s: Failed to create directory %s: %s",
952 __func__, dir, strerror(errno));
953 return false;
956 sk_name = g_strdup(se->vu_socket_path);
957 strreplace(sk_name, '/', '.');
958 pidfile = g_strdup_printf("%s/%s.pid", dir, sk_name);
960 if (!qemu_write_pidfile(pidfile, &local_err)) {
961 error_report_err(local_err);
962 return false;
965 return true;
968 static int fv_create_listen_socket(struct fuse_session *se)
970 struct sockaddr_un un;
971 mode_t old_umask;
973 /* Nothing to do if fd is already initialized */
974 if (se->vu_listen_fd >= 0) {
975 return 0;
978 if (strlen(se->vu_socket_path) >= sizeof(un.sun_path)) {
979 fuse_log(FUSE_LOG_ERR, "Socket path too long\n");
980 return -1;
983 if (!strlen(se->vu_socket_path)) {
984 fuse_log(FUSE_LOG_ERR, "Socket path is empty\n");
985 return -1;
988 /* Check the vu_socket_path is already used */
989 if (!fv_socket_lock(se)) {
990 return -1;
994 * Create the Unix socket to communicate with qemu
995 * based on QEMU's vhost-user-bridge
997 unlink(se->vu_socket_path);
998 strcpy(un.sun_path, se->vu_socket_path);
999 size_t addr_len = sizeof(un);
1001 int listen_sock = socket(AF_UNIX, SOCK_STREAM, 0);
1002 if (listen_sock == -1) {
1003 fuse_log(FUSE_LOG_ERR, "vhost socket creation: %m\n");
1004 return -1;
1006 un.sun_family = AF_UNIX;
1009 * Unfortunately bind doesn't let you set the mask on the socket,
1010 * so set umask appropriately and restore it later.
1012 if (se->vu_socket_group) {
1013 old_umask = umask(S_IROTH | S_IWOTH | S_IXOTH);
1014 } else {
1015 old_umask = umask(S_IRGRP | S_IWGRP | S_IXGRP |
1016 S_IROTH | S_IWOTH | S_IXOTH);
1018 if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) {
1019 fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n");
1020 close(listen_sock);
1021 umask(old_umask);
1022 return -1;
1024 if (se->vu_socket_group) {
1025 struct group *g = getgrnam(se->vu_socket_group);
1026 if (g) {
1027 if (chown(se->vu_socket_path, -1, g->gr_gid) == -1) {
1028 fuse_log(FUSE_LOG_WARNING,
1029 "vhost socket failed to set group to %s (%d): %m\n",
1030 se->vu_socket_group, g->gr_gid);
1034 umask(old_umask);
1036 if (listen(listen_sock, 1) == -1) {
1037 fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n");
1038 close(listen_sock);
1039 return -1;
1042 se->vu_listen_fd = listen_sock;
1043 return 0;
1046 int virtio_session_mount(struct fuse_session *se)
1048 int ret;
1051 * Test that unshare(CLONE_FS) works. fv_queue_worker() will need it. It's
1052 * an unprivileged system call but some Docker/Moby versions are known to
1053 * reject it via seccomp when CAP_SYS_ADMIN is not given.
1055 * Note that the program is single-threaded here so this syscall has no
1056 * visible effect and is safe to make.
1058 ret = unshare(CLONE_FS);
1059 if (ret == -1 && errno == EPERM) {
1060 fuse_log(FUSE_LOG_ERR, "unshare(CLONE_FS) failed with EPERM. If "
1061 "running in a container please check that the container "
1062 "runtime seccomp policy allows unshare.\n");
1063 return -1;
1066 ret = fv_create_listen_socket(se);
1067 if (ret < 0) {
1068 return ret;
1071 se->fd = -1;
1073 fuse_log(FUSE_LOG_INFO, "%s: Waiting for vhost-user socket connection...\n",
1074 __func__);
1075 int data_sock = accept(se->vu_listen_fd, NULL, NULL);
1076 if (data_sock == -1) {
1077 fuse_log(FUSE_LOG_ERR, "vhost socket accept: %m\n");
1078 close(se->vu_listen_fd);
1079 return -1;
1081 close(se->vu_listen_fd);
1082 se->vu_listen_fd = -1;
1083 fuse_log(FUSE_LOG_INFO, "%s: Received vhost-user socket connection\n",
1084 __func__);
1086 /* TODO: Some cleanup/deallocation! */
1087 se->virtio_dev = g_new0(struct fv_VuDev, 1);
1089 se->vu_socketfd = data_sock;
1090 se->virtio_dev->se = se;
1091 pthread_rwlock_init(&se->virtio_dev->vu_dispatch_rwlock, NULL);
1092 if (!vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, NULL,
1093 fv_set_watch, fv_remove_watch, &fv_iface)) {
1094 fuse_log(FUSE_LOG_ERR, "%s: vu_init failed\n", __func__);
1095 return -1;
1098 return 0;
1101 void virtio_session_close(struct fuse_session *se)
1103 close(se->vu_socketfd);
1105 if (!se->virtio_dev) {
1106 return;
1109 g_free(se->virtio_dev->qi);
1110 pthread_rwlock_destroy(&se->virtio_dev->vu_dispatch_rwlock);
1111 g_free(se->virtio_dev);
1112 se->virtio_dev = NULL;