virtiofsd: add --fd=FDNUM fd passing option
[qemu.git] / tools / virtiofsd / fuse_virtio.c
blob635f87756a2b0071cfc785a30398c04b6668177e
/*
 * virtio-fs glue for FUSE
 * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *   Dave Gilbert <dgilbert@redhat.com>
 *
 * Implements the glue between libfuse and libvhost-user
 *
 * This program can be distributed under the terms of the GNU LGPLv2.
 * See the file COPYING.LIB
 */
14 #include "qemu/osdep.h"
15 #include "qemu/iov.h"
16 #include "fuse_virtio.h"
17 #include "fuse_i.h"
18 #include "standard-headers/linux/fuse.h"
19 #include "fuse_misc.h"
20 #include "fuse_opt.h"
22 #include <assert.h>
23 #include <errno.h>
24 #include <stdint.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <sys/eventfd.h>
29 #include <sys/socket.h>
30 #include <sys/types.h>
31 #include <sys/un.h>
32 #include <unistd.h>
34 #include "contrib/libvhost-user/libvhost-user.h"
36 struct fv_VuDev;
37 struct fv_QueueInfo {
38 pthread_t thread;
39 struct fv_VuDev *virtio_dev;
41 /* Our queue index, corresponds to array position */
42 int qidx;
43 int kick_fd;
45 /* The element for the command currently being processed */
46 VuVirtqElement *qe;
47 bool reply_sent;
51 * We pass the dev element into libvhost-user
52 * and then use it to get back to the outer
53 * container for other data.
55 struct fv_VuDev {
56 VuDev dev;
57 struct fuse_session *se;
60 * The following pair of fields are only accessed in the main
61 * virtio_loop
63 size_t nqueues;
64 struct fv_QueueInfo **qi;
/* Device configuration layout, taken from the spec */
struct virtio_fs_config {
    /* Tag field, 36 bytes */
    char tag[36];
    /* Queue count field */
    uint32_t num_queues;
};
73 /* Callback from libvhost-user */
74 static uint64_t fv_get_features(VuDev *dev)
76 return 1ULL << VIRTIO_F_VERSION_1;
79 /* Callback from libvhost-user */
80 static void fv_set_features(VuDev *dev, uint64_t features)
85 * Callback from libvhost-user if there's a new fd we're supposed to listen
86 * to, typically a queue kick?
88 static void fv_set_watch(VuDev *dev, int fd, int condition, vu_watch_cb cb,
89 void *data)
91 fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
95 * Callback from libvhost-user if we're no longer supposed to listen on an fd
97 static void fv_remove_watch(VuDev *dev, int fd)
99 fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
102 /* Callback from libvhost-user to panic */
103 static void fv_panic(VuDev *dev, const char *err)
105 fuse_log(FUSE_LOG_ERR, "%s: libvhost-user: %s\n", __func__, err);
106 /* TODO: Allow reconnects?? */
107 exit(EXIT_FAILURE);
111 * Copy from an iovec into a fuse_buf (memory only)
112 * Caller must ensure there is space
114 static void copy_from_iov(struct fuse_buf *buf, size_t out_num,
115 const struct iovec *out_sg)
117 void *dest = buf->mem;
119 while (out_num) {
120 size_t onelen = out_sg->iov_len;
121 memcpy(dest, out_sg->iov_base, onelen);
122 dest += onelen;
123 out_sg++;
124 out_num--;
/*
 * Copy 'to_copy' bytes from one scatter/gather list into another.
 *
 * src_iov/src_count: source vector and its number of entries
 * dst_iov/dst_count: destination vector and its number of entries
 * to_copy:           total number of bytes to transfer
 *
 * Source and destination element boundaries need not line up.  The caller
 * must have checked that both vectors are large enough; running out of
 * entries on either side trips an assert.
 */
static void copy_iov(struct iovec *src_iov, int src_count,
                     struct iovec *dst_iov, int dst_count, size_t to_copy)
{
    /* Bytes already written into the current destination element */
    size_t dst_used = 0;

    /* Each pass of the outer loop consumes one source element */
    for (; to_copy > 0; src_iov++, src_count--) {
        assert(src_count);

        size_t src_used = 0;
        size_t src_remaining = src_iov->iov_len;
        if (src_remaining > to_copy) {
            src_remaining = to_copy;
        }

        /* One source element may spill across several destinations */
        while (src_remaining > 0) {
            assert(dst_count);

            size_t chunk = dst_iov->iov_len - dst_used;
            if (chunk > src_remaining) {
                chunk = src_remaining;
            }

            memcpy((char *)dst_iov->iov_base + dst_used,
                   (const char *)src_iov->iov_base + src_used, chunk);
            src_remaining -= chunk;
            to_copy -= chunk;
            src_used += chunk;
            dst_used += chunk;

            assert(dst_used <= dst_iov->iov_len);
            if (dst_used == dst_iov->iov_len) {
                /* Current destination element is full, move to the next */
                dst_used = 0;
                dst_iov++;
                dst_count--;
            }
        }
    }
}
173 * Called back by ll whenever it wants to send a reply/message back
174 * The 1st element of the iov starts with the fuse_out_header
175 * 'unique'==0 means it's a notify message.
177 int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch,
178 struct iovec *iov, int count)
180 VuVirtqElement *elem;
181 VuVirtq *q;
182 int ret = 0;
184 assert(count >= 1);
185 assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
187 struct fuse_out_header *out = iov[0].iov_base;
188 /* TODO: Endianness! */
190 size_t tosend_len = iov_size(iov, count);
192 /* unique == 0 is notification, which we don't support */
193 assert(out->unique);
194 /* For virtio we always have ch */
195 assert(ch);
196 assert(!ch->qi->reply_sent);
197 elem = ch->qi->qe;
198 q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx];
200 /* The 'in' part of the elem is to qemu */
201 unsigned int in_num = elem->in_num;
202 struct iovec *in_sg = elem->in_sg;
203 size_t in_len = iov_size(in_sg, in_num);
204 fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
205 __func__, elem->index, in_num, in_len);
208 * The elem should have room for a 'fuse_out_header' (out from fuse)
209 * plus the data based on the len in the header.
211 if (in_len < sizeof(struct fuse_out_header)) {
212 fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
213 __func__, elem->index);
214 ret = -E2BIG;
215 goto err;
217 if (in_len < tosend_len) {
218 fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
219 __func__, elem->index, tosend_len);
220 ret = -E2BIG;
221 goto err;
224 copy_iov(iov, count, in_sg, in_num, tosend_len);
225 vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len);
226 vu_queue_notify(&se->virtio_dev->dev, q);
227 ch->qi->reply_sent = true;
229 err:
230 return ret;
234 * Callback from fuse_send_data_iov_* when it's virtio and the buffer
235 * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK
236 * We need send the iov and then the buffer.
237 * Return 0 on success
239 int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
240 struct iovec *iov, int count, struct fuse_bufvec *buf,
241 size_t len)
243 int ret = 0;
244 VuVirtqElement *elem;
245 VuVirtq *q;
247 assert(count >= 1);
248 assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
250 struct fuse_out_header *out = iov[0].iov_base;
251 /* TODO: Endianness! */
253 size_t iov_len = iov_size(iov, count);
254 size_t tosend_len = iov_len + len;
256 out->len = tosend_len;
258 fuse_log(FUSE_LOG_DEBUG, "%s: count=%d len=%zd iov_len=%zd\n", __func__,
259 count, len, iov_len);
261 /* unique == 0 is notification which we don't support */
262 assert(out->unique);
264 /* For virtio we always have ch */
265 assert(ch);
266 assert(!ch->qi->reply_sent);
267 elem = ch->qi->qe;
268 q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx];
270 /* The 'in' part of the elem is to qemu */
271 unsigned int in_num = elem->in_num;
272 struct iovec *in_sg = elem->in_sg;
273 size_t in_len = iov_size(in_sg, in_num);
274 fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
275 __func__, elem->index, in_num, in_len);
278 * The elem should have room for a 'fuse_out_header' (out from fuse)
279 * plus the data based on the len in the header.
281 if (in_len < sizeof(struct fuse_out_header)) {
282 fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
283 __func__, elem->index);
284 ret = E2BIG;
285 goto err;
287 if (in_len < tosend_len) {
288 fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
289 __func__, elem->index, tosend_len);
290 ret = E2BIG;
291 goto err;
294 /* TODO: Limit to 'len' */
296 /* First copy the header data from iov->in_sg */
297 copy_iov(iov, count, in_sg, in_num, iov_len);
300 * Build a copy of the the in_sg iov so we can skip bits in it,
301 * including changing the offsets
303 struct iovec *in_sg_cpy = calloc(sizeof(struct iovec), in_num);
304 assert(in_sg_cpy);
305 memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num);
306 /* These get updated as we skip */
307 struct iovec *in_sg_ptr = in_sg_cpy;
308 int in_sg_cpy_count = in_num;
310 /* skip over parts of in_sg that contained the header iov */
311 size_t skip_size = iov_len;
313 size_t in_sg_left = 0;
314 do {
315 while (skip_size != 0 && in_sg_cpy_count) {
316 if (skip_size >= in_sg_ptr[0].iov_len) {
317 skip_size -= in_sg_ptr[0].iov_len;
318 in_sg_ptr++;
319 in_sg_cpy_count--;
320 } else {
321 in_sg_ptr[0].iov_len -= skip_size;
322 in_sg_ptr[0].iov_base += skip_size;
323 break;
327 int i;
328 for (i = 0, in_sg_left = 0; i < in_sg_cpy_count; i++) {
329 in_sg_left += in_sg_ptr[i].iov_len;
331 fuse_log(FUSE_LOG_DEBUG,
332 "%s: after skip skip_size=%zd in_sg_cpy_count=%d "
333 "in_sg_left=%zd\n",
334 __func__, skip_size, in_sg_cpy_count, in_sg_left);
335 ret = preadv(buf->buf[0].fd, in_sg_ptr, in_sg_cpy_count,
336 buf->buf[0].pos);
338 if (ret == -1) {
339 ret = errno;
340 fuse_log(FUSE_LOG_DEBUG, "%s: preadv failed (%m) len=%zd\n",
341 __func__, len);
342 free(in_sg_cpy);
343 goto err;
345 fuse_log(FUSE_LOG_DEBUG, "%s: preadv ret=%d len=%zd\n", __func__,
346 ret, len);
347 if (ret < len && ret) {
348 fuse_log(FUSE_LOG_DEBUG, "%s: ret < len\n", __func__);
349 /* Skip over this much next time around */
350 skip_size = ret;
351 buf->buf[0].pos += ret;
352 len -= ret;
354 /* Lets do another read */
355 continue;
357 if (!ret) {
358 /* EOF case? */
359 fuse_log(FUSE_LOG_DEBUG, "%s: !ret in_sg_left=%zd\n", __func__,
360 in_sg_left);
361 break;
363 if (ret != len) {
364 fuse_log(FUSE_LOG_DEBUG, "%s: ret!=len\n", __func__);
365 ret = EIO;
366 free(in_sg_cpy);
367 goto err;
369 in_sg_left -= ret;
370 len -= ret;
371 } while (in_sg_left);
372 free(in_sg_cpy);
374 /* Need to fix out->len on EOF */
375 if (len) {
376 struct fuse_out_header *out_sg = in_sg[0].iov_base;
378 tosend_len -= len;
379 out_sg->len = tosend_len;
382 ret = 0;
384 vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len);
385 vu_queue_notify(&se->virtio_dev->dev, q);
387 err:
388 if (ret == 0) {
389 ch->qi->reply_sent = true;
392 return ret;
395 /* Thread function for individual queues, created when a queue is 'started' */
396 static void *fv_queue_thread(void *opaque)
398 struct fv_QueueInfo *qi = opaque;
399 struct VuDev *dev = &qi->virtio_dev->dev;
400 struct VuVirtq *q = vu_get_queue(dev, qi->qidx);
401 struct fuse_session *se = qi->virtio_dev->se;
402 struct fuse_chan ch;
403 struct fuse_buf fbuf;
405 fbuf.mem = NULL;
406 fbuf.flags = 0;
408 fuse_mutex_init(&ch.lock);
409 ch.fd = (int)0xdaff0d111;
410 ch.qi = qi;
412 fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__,
413 qi->qidx, qi->kick_fd);
414 while (1) {
415 struct pollfd pf[1];
416 pf[0].fd = qi->kick_fd;
417 pf[0].events = POLLIN;
418 pf[0].revents = 0;
420 fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for Queue %d event\n", __func__,
421 qi->qidx);
422 int poll_res = ppoll(pf, 1, NULL, NULL);
424 if (poll_res == -1) {
425 if (errno == EINTR) {
426 fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
427 __func__);
428 continue;
430 fuse_log(FUSE_LOG_ERR, "fv_queue_thread ppoll: %m\n");
431 break;
433 assert(poll_res == 1);
434 if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) {
435 fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x Queue %d\n",
436 __func__, pf[0].revents, qi->qidx);
437 break;
439 assert(pf[0].revents & POLLIN);
440 fuse_log(FUSE_LOG_DEBUG, "%s: Got queue event on Queue %d\n", __func__,
441 qi->qidx);
443 eventfd_t evalue;
444 if (eventfd_read(qi->kick_fd, &evalue)) {
445 fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n");
446 break;
448 /* out is from guest, in is too guest */
449 unsigned int in_bytes, out_bytes;
450 vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0);
452 fuse_log(FUSE_LOG_DEBUG,
453 "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n",
454 __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes);
456 while (1) {
458 * An element contains one request and the space to send our
459 * response They're spread over multiple descriptors in a
460 * scatter/gather set and we can't trust the guest to keep them
461 * still; so copy in/out.
463 VuVirtqElement *elem = vu_queue_pop(dev, q, sizeof(VuVirtqElement));
464 if (!elem) {
465 break;
468 qi->qe = elem;
469 qi->reply_sent = false;
471 if (!fbuf.mem) {
472 fbuf.mem = malloc(se->bufsize);
473 assert(fbuf.mem);
474 assert(se->bufsize > sizeof(struct fuse_in_header));
476 /* The 'out' part of the elem is from qemu */
477 unsigned int out_num = elem->out_num;
478 struct iovec *out_sg = elem->out_sg;
479 size_t out_len = iov_size(out_sg, out_num);
480 fuse_log(FUSE_LOG_DEBUG,
481 "%s: elem %d: with %d out desc of length %zd\n", __func__,
482 elem->index, out_num, out_len);
485 * The elem should contain a 'fuse_in_header' (in to fuse)
486 * plus the data based on the len in the header.
488 if (out_len < sizeof(struct fuse_in_header)) {
489 fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n",
490 __func__, elem->index);
491 assert(0); /* TODO */
493 if (out_len > se->bufsize) {
494 fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n",
495 __func__, elem->index);
496 assert(0); /* TODO */
498 copy_from_iov(&fbuf, out_num, out_sg);
499 fbuf.size = out_len;
501 /* TODO! Endianness of header */
503 /* TODO: Add checks for fuse_session_exited */
504 fuse_session_process_buf_int(se, &fbuf, &ch);
506 if (!qi->reply_sent) {
507 fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n",
508 __func__, elem->index);
509 /* I think we've still got to recycle the element */
510 vu_queue_push(dev, q, elem, 0);
511 vu_queue_notify(dev, q);
513 qi->qe = NULL;
514 free(elem);
515 elem = NULL;
518 pthread_mutex_destroy(&ch.lock);
519 free(fbuf.mem);
521 return NULL;
524 /* Callback from libvhost-user on start or stop of a queue */
525 static void fv_queue_set_started(VuDev *dev, int qidx, bool started)
527 struct fv_VuDev *vud = container_of(dev, struct fv_VuDev, dev);
528 struct fv_QueueInfo *ourqi;
530 fuse_log(FUSE_LOG_INFO, "%s: qidx=%d started=%d\n", __func__, qidx,
531 started);
532 assert(qidx >= 0);
535 * Ignore additional request queues for now. passthrough_ll.c must be
536 * audited for thread-safety issues first. It was written with a
537 * well-behaved client in mind and may not protect against all types of
538 * races yet.
540 if (qidx > 1) {
541 fuse_log(FUSE_LOG_ERR,
542 "%s: multiple request queues not yet implemented, please only "
543 "configure 1 request queue\n",
544 __func__);
545 exit(EXIT_FAILURE);
548 if (started) {
549 /* Fire up a thread to watch this queue */
550 if (qidx >= vud->nqueues) {
551 vud->qi = realloc(vud->qi, (qidx + 1) * sizeof(vud->qi[0]));
552 assert(vud->qi);
553 memset(vud->qi + vud->nqueues, 0,
554 sizeof(vud->qi[0]) * (1 + (qidx - vud->nqueues)));
555 vud->nqueues = qidx + 1;
557 if (!vud->qi[qidx]) {
558 vud->qi[qidx] = calloc(sizeof(struct fv_QueueInfo), 1);
559 assert(vud->qi[qidx]);
560 vud->qi[qidx]->virtio_dev = vud;
561 vud->qi[qidx]->qidx = qidx;
562 } else {
563 /* Shouldn't have been started */
564 assert(vud->qi[qidx]->kick_fd == -1);
566 ourqi = vud->qi[qidx];
567 ourqi->kick_fd = dev->vq[qidx].kick_fd;
568 if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) {
569 fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n",
570 __func__, qidx);
571 assert(0);
573 } else {
574 /* TODO: Kill the thread */
575 assert(qidx < vud->nqueues);
576 ourqi = vud->qi[qidx];
577 ourqi->kick_fd = -1;
581 static bool fv_queue_order(VuDev *dev, int qidx)
583 return false;
586 static const VuDevIface fv_iface = {
587 .get_features = fv_get_features,
588 .set_features = fv_set_features,
590 /* Don't need process message, we've not got any at vhost-user level */
591 .queue_set_started = fv_queue_set_started,
593 .queue_is_processed_in_order = fv_queue_order,
597 * Main loop; this mostly deals with events on the vhost-user
598 * socket itself, and not actual fuse data.
600 int virtio_loop(struct fuse_session *se)
602 fuse_log(FUSE_LOG_INFO, "%s: Entry\n", __func__);
604 while (!fuse_session_exited(se)) {
605 struct pollfd pf[1];
606 pf[0].fd = se->vu_socketfd;
607 pf[0].events = POLLIN;
608 pf[0].revents = 0;
610 fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for VU event\n", __func__);
611 int poll_res = ppoll(pf, 1, NULL, NULL);
613 if (poll_res == -1) {
614 if (errno == EINTR) {
615 fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
616 __func__);
617 continue;
619 fuse_log(FUSE_LOG_ERR, "virtio_loop ppoll: %m\n");
620 break;
622 assert(poll_res == 1);
623 if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) {
624 fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x\n", __func__,
625 pf[0].revents);
626 break;
628 assert(pf[0].revents & POLLIN);
629 fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__);
630 if (!vu_dispatch(&se->virtio_dev->dev)) {
631 fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__);
632 break;
636 fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__);
638 return 0;
641 static int fv_create_listen_socket(struct fuse_session *se)
643 struct sockaddr_un un;
644 mode_t old_umask;
646 /* Nothing to do if fd is already initialized */
647 if (se->vu_listen_fd >= 0) {
648 return 0;
651 if (strlen(se->vu_socket_path) >= sizeof(un.sun_path)) {
652 fuse_log(FUSE_LOG_ERR, "Socket path too long\n");
653 return -1;
657 * Create the Unix socket to communicate with qemu
658 * based on QEMU's vhost-user-bridge
660 unlink(se->vu_socket_path);
661 strcpy(un.sun_path, se->vu_socket_path);
662 size_t addr_len = sizeof(un);
664 int listen_sock = socket(AF_UNIX, SOCK_STREAM, 0);
665 if (listen_sock == -1) {
666 fuse_log(FUSE_LOG_ERR, "vhost socket creation: %m\n");
667 return -1;
669 un.sun_family = AF_UNIX;
672 * Unfortunately bind doesn't let you set the mask on the socket,
673 * so set umask to 077 and restore it later.
675 old_umask = umask(0077);
676 if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) {
677 fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n");
678 umask(old_umask);
679 return -1;
681 umask(old_umask);
683 if (listen(listen_sock, 1) == -1) {
684 fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n");
685 return -1;
688 se->vu_listen_fd = listen_sock;
689 return 0;
692 int virtio_session_mount(struct fuse_session *se)
694 int ret;
696 ret = fv_create_listen_socket(se);
697 if (ret < 0) {
698 return ret;
701 se->fd = -1;
703 fuse_log(FUSE_LOG_INFO, "%s: Waiting for vhost-user socket connection...\n",
704 __func__);
705 int data_sock = accept(se->vu_listen_fd, NULL, NULL);
706 if (data_sock == -1) {
707 fuse_log(FUSE_LOG_ERR, "vhost socket accept: %m\n");
708 close(se->vu_listen_fd);
709 return -1;
711 close(se->vu_listen_fd);
712 se->vu_listen_fd = -1;
713 fuse_log(FUSE_LOG_INFO, "%s: Received vhost-user socket connection\n",
714 __func__);
716 /* TODO: Some cleanup/deallocation! */
717 se->virtio_dev = calloc(sizeof(struct fv_VuDev), 1);
718 if (!se->virtio_dev) {
719 fuse_log(FUSE_LOG_ERR, "%s: virtio_dev calloc failed\n", __func__);
720 close(data_sock);
721 return -1;
724 se->vu_socketfd = data_sock;
725 se->virtio_dev->se = se;
726 vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, fv_set_watch,
727 fv_remove_watch, &fv_iface);
729 return 0;