fuzz: avoid double-fetches by default
[qemu/ar7.git] / tools / virtiofsd / passthrough_ll.c
blob97485b22b4114228e622a0b256d9a9bfedd0ca9e
/*
 * FUSE: Filesystem in Userspace
 * Copyright (C) 2001-2007  Miklos Szeredi <miklos@szeredi.hu>
 *
 * This program can be distributed under the terms of the GNU GPLv2.
 * See the file COPYING.
 */

/*
 *
 * This file system mirrors the existing file system hierarchy of the
 * system, starting at the root file system. This is implemented by
 * just "passing through" all requests to the corresponding user-space
 * libc functions. In contrast to passthrough.c and passthrough_fh.c,
 * this implementation uses the low-level API. Its performance should
 * be the least bad among the three, but many operations are not
 * implemented. In particular, it is not possible to remove files (or
 * directories) because the code necessary to defer actual removal
 * until the file is not opened anymore would make the example much
 * more complicated.
 *
 * When writeback caching is enabled (-o writeback mount option), it
 * is only possible to write to files for which the mounting user has
 * read permissions. This is because the writeback cache requires the
 * kernel to be able to issue read requests for all files (which the
 * passthrough filesystem cannot satisfy if it can't read the file in
 * the underlying filesystem).
 *
 * Compile with:
 *
 *     gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
 * passthrough_ll
 *
 * ## Source code ##
 * \include passthrough_ll.c
 */
38 #include "qemu/osdep.h"
39 #include "qemu/timer.h"
40 #include "fuse_virtio.h"
41 #include "fuse_log.h"
42 #include "fuse_lowlevel.h"
43 #include "standard-headers/linux/fuse.h"
44 #include <assert.h>
45 #include <cap-ng.h>
46 #include <dirent.h>
47 #include <errno.h>
48 #include <glib.h>
49 #include <inttypes.h>
50 #include <limits.h>
51 #include <pthread.h>
52 #include <stdbool.h>
53 #include <stddef.h>
54 #include <stdio.h>
55 #include <stdlib.h>
56 #include <string.h>
57 #include <sys/file.h>
58 #include <sys/mount.h>
59 #include <sys/prctl.h>
60 #include <sys/resource.h>
61 #include <sys/syscall.h>
62 #include <sys/types.h>
63 #include <sys/wait.h>
64 #include <sys/xattr.h>
65 #include <syslog.h>
66 #include <unistd.h>
68 #include "qemu/cutils.h"
69 #include "passthrough_helpers.h"
70 #include "passthrough_seccomp.h"
/* Keep track of inode posix locks for each owner. */
struct lo_inode_plock {
    uint64_t lock_owner;
    int fd; /* fd for OFD locks */
};
/*
 * One slot of a struct lo_map.  While in_use is true the union holds the
 * mapped object (inode, dirp or fd, depending on which map owns the slot);
 * while it is false the union holds the index of the next free slot, or -1
 * at the end of the free list.
 */
struct lo_map_elem {
    union {
        struct lo_inode *inode;
        struct lo_dirp *dirp;
        int fd;
        ssize_t freelist;
    };
    bool in_use;
};
/* Maps FUSE fh or ino values to internal objects */
struct lo_map {
    struct lo_map_elem *elems; /* slot array, indexed by fh/ino value */
    size_t nelems;             /* number of slots in elems */
    ssize_t freelist;          /* index of the first free slot, or -1 */
};
/* Hash key identifying a host inode: inode number, device and mount ID. */
struct lo_key {
    ino_t ino;
    dev_t dev;
    uint64_t mnt_id;
};
101 struct lo_inode {
102 int fd;
105 * Atomic reference count for this object. The nlookup field holds a
106 * reference and release it when nlookup reaches 0.
108 gint refcount;
110 struct lo_key key;
113 * This counter keeps the inode alive during the FUSE session.
114 * Incremented when the FUSE inode number is sent in a reply
115 * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is
116 * released by requests like FUSE_FORGET, FUSE_RMDIR, FUSE_RENAME, etc.
118 * Note that this value is untrusted because the client can manipulate
119 * it arbitrarily using FUSE_FORGET requests.
121 * Protected by lo->mutex.
123 uint64_t nlookup;
125 fuse_ino_t fuse_ino;
126 pthread_mutex_t plock_mutex;
127 GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
129 mode_t filetype;
/* Effective credentials saved by lo_change_cred() for lo_restore_cred(). */
struct lo_cred {
    uid_t euid;
    gid_t egid;
};
/* Values of lo_data.cache, selected by the cache=... options. */
enum {
    CACHE_NONE,
    CACHE_AUTO,
    CACHE_ALWAYS,
};
/* Values of lo_data.sandbox, selected by the sandbox=... options. */
enum {
    SANDBOX_NAMESPACE,
    SANDBOX_CHROOT,
};
/* One entry of lo_data.xattr_map_list. */
typedef struct xattr_map_entry {
    char *key;
    char *prepend;
    unsigned int flags;
} XattrMapEntry;
154 struct lo_data {
155 pthread_mutex_t mutex;
156 int sandbox;
157 int debug;
158 int writeback;
159 int flock;
160 int posix_lock;
161 int xattr;
162 char *xattrmap;
163 char *source;
164 char *modcaps;
165 double timeout;
166 int cache;
167 int timeout_set;
168 int readdirplus_set;
169 int readdirplus_clear;
170 int allow_direct_io;
171 int announce_submounts;
172 bool use_statx;
173 struct lo_inode root;
174 GHashTable *inodes; /* protected by lo->mutex */
175 struct lo_map ino_map; /* protected by lo->mutex */
176 struct lo_map dirp_map; /* protected by lo->mutex */
177 struct lo_map fd_map; /* protected by lo->mutex */
178 XattrMapEntry *xattr_map_list;
179 size_t xattr_map_nentries;
181 /* An O_PATH file descriptor to /proc/self/fd/ */
182 int proc_self_fd;
185 static const struct fuse_opt lo_opts[] = {
186 { "sandbox=namespace",
187 offsetof(struct lo_data, sandbox),
188 SANDBOX_NAMESPACE },
189 { "sandbox=chroot",
190 offsetof(struct lo_data, sandbox),
191 SANDBOX_CHROOT },
192 { "writeback", offsetof(struct lo_data, writeback), 1 },
193 { "no_writeback", offsetof(struct lo_data, writeback), 0 },
194 { "source=%s", offsetof(struct lo_data, source), 0 },
195 { "flock", offsetof(struct lo_data, flock), 1 },
196 { "no_flock", offsetof(struct lo_data, flock), 0 },
197 { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
198 { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
199 { "xattr", offsetof(struct lo_data, xattr), 1 },
200 { "no_xattr", offsetof(struct lo_data, xattr), 0 },
201 { "xattrmap=%s", offsetof(struct lo_data, xattrmap), 0 },
202 { "modcaps=%s", offsetof(struct lo_data, modcaps), 0 },
203 { "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
204 { "timeout=", offsetof(struct lo_data, timeout_set), 1 },
205 { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
206 { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
207 { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
208 { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
209 { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
210 { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 },
211 { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 },
212 { "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 },
213 FUSE_OPT_END
static bool use_syslog = false;
static int current_log_level;
static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
                                 uint64_t n);

/* Saved capng state shared by all threads; see load_capng(). */
static struct {
    pthread_mutex_t mutex;
    void *saved;
} cap;
/* That we loaded cap-ng in the current thread from the saved */
static __thread bool cap_loaded = 0;

static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
                                uint64_t mnt_id);
/* Returns non-zero if `name` is exactly "." or "..", zero otherwise. */
static int is_dot_or_dotdot(const char *name)
{
    return name[0] == '.' &&
           (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'));
}
/* Is `path` a single path component that is not "." or ".."? */
static int is_safe_path_component(const char *path)
{
    if (strchr(path, '/')) {
        return 0;
    }

    return !is_dot_or_dotdot(path);
}
246 static struct lo_data *lo_data(fuse_req_t req)
248 return (struct lo_data *)fuse_req_userdata(req);
252 * Load capng's state from our saved state if the current thread
253 * hadn't previously been loaded.
254 * returns 0 on success
256 static int load_capng(void)
258 if (!cap_loaded) {
259 pthread_mutex_lock(&cap.mutex);
260 capng_restore_state(&cap.saved);
262 * restore_state free's the saved copy
263 * so make another.
265 cap.saved = capng_save_state();
266 if (!cap.saved) {
267 pthread_mutex_unlock(&cap.mutex);
268 fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
269 return -EINVAL;
271 pthread_mutex_unlock(&cap.mutex);
274 * We want to use the loaded state for our pid,
275 * not the original
277 capng_setpid(syscall(SYS_gettid));
278 cap_loaded = true;
280 return 0;
284 * Helpers for dropping and regaining effective capabilities. Returns 0
285 * on success, error otherwise
287 static int drop_effective_cap(const char *cap_name, bool *cap_dropped)
289 int cap, ret;
291 cap = capng_name_to_capability(cap_name);
292 if (cap < 0) {
293 ret = errno;
294 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
295 cap_name, strerror(errno));
296 goto out;
299 if (load_capng()) {
300 ret = errno;
301 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
302 goto out;
305 /* We dont have this capability in effective set already. */
306 if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) {
307 ret = 0;
308 goto out;
311 if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) {
312 ret = errno;
313 fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n");
314 goto out;
317 if (capng_apply(CAPNG_SELECT_CAPS)) {
318 ret = errno;
319 fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n");
320 goto out;
323 ret = 0;
324 if (cap_dropped) {
325 *cap_dropped = true;
328 out:
329 return ret;
332 static int gain_effective_cap(const char *cap_name)
334 int cap;
335 int ret = 0;
337 cap = capng_name_to_capability(cap_name);
338 if (cap < 0) {
339 ret = errno;
340 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
341 cap_name, strerror(errno));
342 goto out;
345 if (load_capng()) {
346 ret = errno;
347 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
348 goto out;
351 if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) {
352 ret = errno;
353 fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n");
354 goto out;
357 if (capng_apply(CAPNG_SELECT_CAPS)) {
358 ret = errno;
359 fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n");
360 goto out;
362 ret = 0;
364 out:
365 return ret;
368 static void lo_map_init(struct lo_map *map)
370 map->elems = NULL;
371 map->nelems = 0;
372 map->freelist = -1;
375 static void lo_map_destroy(struct lo_map *map)
377 free(map->elems);
380 static int lo_map_grow(struct lo_map *map, size_t new_nelems)
382 struct lo_map_elem *new_elems;
383 size_t i;
385 if (new_nelems <= map->nelems) {
386 return 1;
389 new_elems = realloc(map->elems, sizeof(map->elems[0]) * new_nelems);
390 if (!new_elems) {
391 return 0;
394 for (i = map->nelems; i < new_nelems; i++) {
395 new_elems[i].freelist = i + 1;
396 new_elems[i].in_use = false;
398 new_elems[new_nelems - 1].freelist = -1;
400 map->elems = new_elems;
401 map->freelist = map->nelems;
402 map->nelems = new_nelems;
403 return 1;
406 static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map)
408 struct lo_map_elem *elem;
410 if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) {
411 return NULL;
414 elem = &map->elems[map->freelist];
415 map->freelist = elem->freelist;
417 elem->in_use = true;
419 return elem;
422 static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key)
424 ssize_t *prev;
426 if (!lo_map_grow(map, key + 1)) {
427 return NULL;
430 for (prev = &map->freelist; *prev != -1;
431 prev = &map->elems[*prev].freelist) {
432 if (*prev == key) {
433 struct lo_map_elem *elem = &map->elems[key];
435 *prev = elem->freelist;
436 elem->in_use = true;
437 return elem;
440 return NULL;
443 static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key)
445 if (key >= map->nelems) {
446 return NULL;
448 if (!map->elems[key].in_use) {
449 return NULL;
451 return &map->elems[key];
454 static void lo_map_remove(struct lo_map *map, size_t key)
456 struct lo_map_elem *elem;
458 if (key >= map->nelems) {
459 return;
462 elem = &map->elems[key];
463 if (!elem->in_use) {
464 return;
467 elem->in_use = false;
469 elem->freelist = map->freelist;
470 map->freelist = key;
473 /* Assumes lo->mutex is held */
474 static ssize_t lo_add_fd_mapping(fuse_req_t req, int fd)
476 struct lo_map_elem *elem;
478 elem = lo_map_alloc_elem(&lo_data(req)->fd_map);
479 if (!elem) {
480 return -1;
483 elem->fd = fd;
484 return elem - lo_data(req)->fd_map.elems;
487 /* Assumes lo->mutex is held */
488 static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp)
490 struct lo_map_elem *elem;
492 elem = lo_map_alloc_elem(&lo_data(req)->dirp_map);
493 if (!elem) {
494 return -1;
497 elem->dirp = dirp;
498 return elem - lo_data(req)->dirp_map.elems;
501 /* Assumes lo->mutex is held */
502 static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode)
504 struct lo_map_elem *elem;
506 elem = lo_map_alloc_elem(&lo_data(req)->ino_map);
507 if (!elem) {
508 return -1;
511 elem->inode = inode;
512 return elem - lo_data(req)->ino_map.elems;
515 static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
517 struct lo_inode *inode = *inodep;
519 if (!inode) {
520 return;
523 *inodep = NULL;
525 if (g_atomic_int_dec_and_test(&inode->refcount)) {
526 close(inode->fd);
527 free(inode);
531 /* Caller must release refcount using lo_inode_put() */
532 static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino)
534 struct lo_data *lo = lo_data(req);
535 struct lo_map_elem *elem;
537 pthread_mutex_lock(&lo->mutex);
538 elem = lo_map_get(&lo->ino_map, ino);
539 if (elem) {
540 g_atomic_int_inc(&elem->inode->refcount);
542 pthread_mutex_unlock(&lo->mutex);
544 if (!elem) {
545 return NULL;
548 return elem->inode;
552 * TODO Remove this helper and force callers to hold an inode refcount until
553 * they are done with the fd. This will be done in a later patch to make
554 * review easier.
556 static int lo_fd(fuse_req_t req, fuse_ino_t ino)
558 struct lo_inode *inode = lo_inode(req, ino);
559 int fd;
561 if (!inode) {
562 return -1;
565 fd = inode->fd;
566 lo_inode_put(lo_data(req), &inode);
567 return fd;
570 static void lo_init(void *userdata, struct fuse_conn_info *conn)
572 struct lo_data *lo = (struct lo_data *)userdata;
574 if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) {
575 conn->want |= FUSE_CAP_EXPORT_SUPPORT;
578 if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
579 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n");
580 conn->want |= FUSE_CAP_WRITEBACK_CACHE;
582 if (conn->capable & FUSE_CAP_FLOCK_LOCKS) {
583 if (lo->flock) {
584 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
585 conn->want |= FUSE_CAP_FLOCK_LOCKS;
586 } else {
587 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n");
588 conn->want &= ~FUSE_CAP_FLOCK_LOCKS;
592 if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
593 if (lo->posix_lock) {
594 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
595 conn->want |= FUSE_CAP_POSIX_LOCKS;
596 } else {
597 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
598 conn->want &= ~FUSE_CAP_POSIX_LOCKS;
602 if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
603 lo->readdirplus_clear) {
604 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
605 conn->want &= ~FUSE_CAP_READDIRPLUS;
608 if (!(conn->capable & FUSE_CAP_SUBMOUNTS) && lo->announce_submounts) {
609 fuse_log(FUSE_LOG_WARNING, "lo_init: Cannot announce submounts, client "
610 "does not support it\n");
611 lo->announce_submounts = false;
615 static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
616 struct fuse_file_info *fi)
618 int res;
619 struct stat buf;
620 struct lo_data *lo = lo_data(req);
622 (void)fi;
624 res =
625 fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
626 if (res == -1) {
627 return (void)fuse_reply_err(req, errno);
630 fuse_reply_attr(req, &buf, lo->timeout);
633 static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi)
635 struct lo_data *lo = lo_data(req);
636 struct lo_map_elem *elem;
638 pthread_mutex_lock(&lo->mutex);
639 elem = lo_map_get(&lo->fd_map, fi->fh);
640 pthread_mutex_unlock(&lo->mutex);
642 if (!elem) {
643 return -1;
646 return elem->fd;
649 static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
650 int valid, struct fuse_file_info *fi)
652 int saverr;
653 char procname[64];
654 struct lo_data *lo = lo_data(req);
655 struct lo_inode *inode;
656 int ifd;
657 int res;
658 int fd = -1;
660 inode = lo_inode(req, ino);
661 if (!inode) {
662 fuse_reply_err(req, EBADF);
663 return;
666 ifd = inode->fd;
668 /* If fi->fh is invalid we'll report EBADF later */
669 if (fi) {
670 fd = lo_fi_fd(req, fi);
673 if (valid & FUSE_SET_ATTR_MODE) {
674 if (fi) {
675 res = fchmod(fd, attr->st_mode);
676 } else {
677 sprintf(procname, "%i", ifd);
678 res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0);
680 if (res == -1) {
681 goto out_err;
684 if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
685 uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1;
686 gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1;
688 res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
689 if (res == -1) {
690 goto out_err;
693 if (valid & FUSE_SET_ATTR_SIZE) {
694 int truncfd;
696 if (fi) {
697 truncfd = fd;
698 } else {
699 sprintf(procname, "%i", ifd);
700 truncfd = openat(lo->proc_self_fd, procname, O_RDWR);
701 if (truncfd < 0) {
702 goto out_err;
706 res = ftruncate(truncfd, attr->st_size);
707 if (!fi) {
708 saverr = errno;
709 close(truncfd);
710 errno = saverr;
712 if (res == -1) {
713 goto out_err;
716 if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) {
717 struct timespec tv[2];
719 tv[0].tv_sec = 0;
720 tv[1].tv_sec = 0;
721 tv[0].tv_nsec = UTIME_OMIT;
722 tv[1].tv_nsec = UTIME_OMIT;
724 if (valid & FUSE_SET_ATTR_ATIME_NOW) {
725 tv[0].tv_nsec = UTIME_NOW;
726 } else if (valid & FUSE_SET_ATTR_ATIME) {
727 tv[0] = attr->st_atim;
730 if (valid & FUSE_SET_ATTR_MTIME_NOW) {
731 tv[1].tv_nsec = UTIME_NOW;
732 } else if (valid & FUSE_SET_ATTR_MTIME) {
733 tv[1] = attr->st_mtim;
736 if (fi) {
737 res = futimens(fd, tv);
738 } else {
739 sprintf(procname, "%i", inode->fd);
740 res = utimensat(lo->proc_self_fd, procname, tv, 0);
742 if (res == -1) {
743 goto out_err;
746 lo_inode_put(lo, &inode);
748 return lo_getattr(req, ino, fi);
750 out_err:
751 saverr = errno;
752 lo_inode_put(lo, &inode);
753 fuse_reply_err(req, saverr);
756 static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
757 uint64_t mnt_id)
759 struct lo_inode *p;
760 struct lo_key key = {
761 .ino = st->st_ino,
762 .dev = st->st_dev,
763 .mnt_id = mnt_id,
766 pthread_mutex_lock(&lo->mutex);
767 p = g_hash_table_lookup(lo->inodes, &key);
768 if (p) {
769 assert(p->nlookup > 0);
770 p->nlookup++;
771 g_atomic_int_inc(&p->refcount);
773 pthread_mutex_unlock(&lo->mutex);
775 return p;
778 /* value_destroy_func for posix_locks GHashTable */
779 static void posix_locks_value_destroy(gpointer data)
781 struct lo_inode_plock *plock = data;
784 * We had used open() for locks and had only one fd. So
785 * closing this fd should release all OFD locks.
787 close(plock->fd);
788 free(plock);
/*
 * stat `pathname` relative to `dirfd` into *statbuf and report the mount
 * ID in *mnt_id.
 *
 * When statx() with STATX_MNT_ID is compiled in and lo->use_statx is set,
 * statx() is used and its result converted to a struct stat; *mnt_id is 0
 * if the kernel did not fill in the mount ID.  If statx() fails with
 * ENOSYS, lo->use_statx is cleared and fstatat() is used from then on,
 * with *mnt_id always 0.
 *
 * Returns 0 on success, -1 on failure with errno set by the failing call.
 */
static int do_statx(struct lo_data *lo, int dirfd, const char *pathname,
                    struct stat *statbuf, int flags, uint64_t *mnt_id)
{
    int res;

#if defined(CONFIG_STATX) && defined(STATX_MNT_ID)
    if (lo->use_statx) {
        struct statx statxbuf;

        res = statx(dirfd, pathname, flags, STATX_BASIC_STATS | STATX_MNT_ID,
                    &statxbuf);
        if (!res) {
            memset(statbuf, 0, sizeof(*statbuf));
            statbuf->st_dev = makedev(statxbuf.stx_dev_major,
                                      statxbuf.stx_dev_minor);
            statbuf->st_ino = statxbuf.stx_ino;
            statbuf->st_mode = statxbuf.stx_mode;
            statbuf->st_nlink = statxbuf.stx_nlink;
            statbuf->st_uid = statxbuf.stx_uid;
            statbuf->st_gid = statxbuf.stx_gid;
            statbuf->st_rdev = makedev(statxbuf.stx_rdev_major,
                                       statxbuf.stx_rdev_minor);
            statbuf->st_size = statxbuf.stx_size;
            statbuf->st_blksize = statxbuf.stx_blksize;
            statbuf->st_blocks = statxbuf.stx_blocks;
            statbuf->st_atim.tv_sec = statxbuf.stx_atime.tv_sec;
            statbuf->st_atim.tv_nsec = statxbuf.stx_atime.tv_nsec;
            statbuf->st_mtim.tv_sec = statxbuf.stx_mtime.tv_sec;
            statbuf->st_mtim.tv_nsec = statxbuf.stx_mtime.tv_nsec;
            statbuf->st_ctim.tv_sec = statxbuf.stx_ctime.tv_sec;
            statbuf->st_ctim.tv_nsec = statxbuf.stx_ctime.tv_nsec;

            if (statxbuf.stx_mask & STATX_MNT_ID) {
                *mnt_id = statxbuf.stx_mnt_id;
            } else {
                *mnt_id = 0;
            }
            return 0;
        } else if (errno != ENOSYS) {
            return -1;
        }
        lo->use_statx = false;
        /* fallback */
    }
#endif
    res = fstatat(dirfd, pathname, statbuf, flags);
    if (res == -1) {
        return -1;
    }
    *mnt_id = 0;

    return 0;
}
846 * Increments nlookup and caller must release refcount using
847 * lo_inode_put(&parent).
849 static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
850 struct fuse_entry_param *e)
852 int newfd;
853 int res;
854 int saverr;
855 uint64_t mnt_id;
856 struct lo_data *lo = lo_data(req);
857 struct lo_inode *inode = NULL;
858 struct lo_inode *dir = lo_inode(req, parent);
861 * name_to_handle_at() and open_by_handle_at() can reach here with fuse
862 * mount point in guest, but we don't have its inode info in the
863 * ino_map.
865 if (!dir) {
866 return ENOENT;
869 memset(e, 0, sizeof(*e));
870 e->attr_timeout = lo->timeout;
871 e->entry_timeout = lo->timeout;
873 /* Do not allow escaping root directory */
874 if (dir == &lo->root && strcmp(name, "..") == 0) {
875 name = ".";
878 newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
879 if (newfd == -1) {
880 goto out_err;
883 res = do_statx(lo, newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
884 &mnt_id);
885 if (res == -1) {
886 goto out_err;
889 if (S_ISDIR(e->attr.st_mode) && lo->announce_submounts &&
890 (e->attr.st_dev != dir->key.dev || mnt_id != dir->key.mnt_id)) {
891 e->attr_flags |= FUSE_ATTR_SUBMOUNT;
894 inode = lo_find(lo, &e->attr, mnt_id);
895 if (inode) {
896 close(newfd);
897 } else {
898 inode = calloc(1, sizeof(struct lo_inode));
899 if (!inode) {
900 goto out_err;
903 /* cache only filetype */
904 inode->filetype = (e->attr.st_mode & S_IFMT);
907 * One for the caller and one for nlookup (released in
908 * unref_inode_lolocked())
910 g_atomic_int_set(&inode->refcount, 2);
912 inode->nlookup = 1;
913 inode->fd = newfd;
914 inode->key.ino = e->attr.st_ino;
915 inode->key.dev = e->attr.st_dev;
916 inode->key.mnt_id = mnt_id;
917 pthread_mutex_init(&inode->plock_mutex, NULL);
918 inode->posix_locks = g_hash_table_new_full(
919 g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
921 pthread_mutex_lock(&lo->mutex);
922 inode->fuse_ino = lo_add_inode_mapping(req, inode);
923 g_hash_table_insert(lo->inodes, &inode->key, inode);
924 pthread_mutex_unlock(&lo->mutex);
926 e->ino = inode->fuse_ino;
927 lo_inode_put(lo, &inode);
928 lo_inode_put(lo, &dir);
930 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
931 name, (unsigned long long)e->ino);
933 return 0;
935 out_err:
936 saverr = errno;
937 if (newfd != -1) {
938 close(newfd);
940 lo_inode_put(lo, &inode);
941 lo_inode_put(lo, &dir);
942 return saverr;
945 static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
947 struct fuse_entry_param e;
948 int err;
950 fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent,
951 name);
954 * Don't use is_safe_path_component(), allow "." and ".." for NFS export
955 * support.
957 if (strchr(name, '/')) {
958 fuse_reply_err(req, EINVAL);
959 return;
962 err = lo_do_lookup(req, parent, name, &e);
963 if (err) {
964 fuse_reply_err(req, err);
965 } else {
966 fuse_reply_entry(req, &e);
/*
 * On some archs, setres*id is limited to 2^16 but they
 * provide setres*id32 variants that allow 2^32.
 * Others just let setres*id do 2^32 anyway.
 */
#ifdef SYS_setresgid32
#define OURSYS_setresgid SYS_setresgid32
#else
#define OURSYS_setresgid SYS_setresgid
#endif

#ifdef SYS_setresuid32
#define OURSYS_setresuid SYS_setresuid32
#else
#define OURSYS_setresuid SYS_setresuid
#endif
988 * Change to uid/gid of caller so that file is created with
989 * ownership of caller.
990 * TODO: What about selinux context?
992 static int lo_change_cred(fuse_req_t req, struct lo_cred *old)
994 int res;
996 old->euid = geteuid();
997 old->egid = getegid();
999 res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1);
1000 if (res == -1) {
1001 return errno;
1004 res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1);
1005 if (res == -1) {
1006 int errno_save = errno;
1008 syscall(OURSYS_setresgid, -1, old->egid, -1);
1009 return errno_save;
1012 return 0;
1015 /* Regain Privileges */
1016 static void lo_restore_cred(struct lo_cred *old)
1018 int res;
1020 res = syscall(OURSYS_setresuid, -1, old->euid, -1);
1021 if (res == -1) {
1022 fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid);
1023 exit(1);
1026 res = syscall(OURSYS_setresgid, -1, old->egid, -1);
1027 if (res == -1) {
1028 fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid);
1029 exit(1);
1033 static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
1034 const char *name, mode_t mode, dev_t rdev,
1035 const char *link)
1037 int res;
1038 int saverr;
1039 struct lo_data *lo = lo_data(req);
1040 struct lo_inode *dir;
1041 struct fuse_entry_param e;
1042 struct lo_cred old = {};
1044 if (!is_safe_path_component(name)) {
1045 fuse_reply_err(req, EINVAL);
1046 return;
1049 dir = lo_inode(req, parent);
1050 if (!dir) {
1051 fuse_reply_err(req, EBADF);
1052 return;
1055 saverr = lo_change_cred(req, &old);
1056 if (saverr) {
1057 goto out;
1060 res = mknod_wrapper(dir->fd, name, link, mode, rdev);
1062 saverr = errno;
1064 lo_restore_cred(&old);
1066 if (res == -1) {
1067 goto out;
1070 saverr = lo_do_lookup(req, parent, name, &e);
1071 if (saverr) {
1072 goto out;
1075 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1076 name, (unsigned long long)e.ino);
1078 fuse_reply_entry(req, &e);
1079 lo_inode_put(lo, &dir);
1080 return;
1082 out:
1083 lo_inode_put(lo, &dir);
1084 fuse_reply_err(req, saverr);
1087 static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
1088 mode_t mode, dev_t rdev)
1090 lo_mknod_symlink(req, parent, name, mode, rdev, NULL);
1093 static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
1094 mode_t mode)
1096 lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL);
1099 static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent,
1100 const char *name)
1102 lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link);
1105 static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
1106 const char *name)
1108 int res;
1109 struct lo_data *lo = lo_data(req);
1110 struct lo_inode *parent_inode;
1111 struct lo_inode *inode;
1112 struct fuse_entry_param e;
1113 char procname[64];
1114 int saverr;
1116 if (!is_safe_path_component(name)) {
1117 fuse_reply_err(req, EINVAL);
1118 return;
1121 parent_inode = lo_inode(req, parent);
1122 inode = lo_inode(req, ino);
1123 if (!parent_inode || !inode) {
1124 errno = EBADF;
1125 goto out_err;
1128 memset(&e, 0, sizeof(struct fuse_entry_param));
1129 e.attr_timeout = lo->timeout;
1130 e.entry_timeout = lo->timeout;
1132 sprintf(procname, "%i", inode->fd);
1133 res = linkat(lo->proc_self_fd, procname, parent_inode->fd, name,
1134 AT_SYMLINK_FOLLOW);
1135 if (res == -1) {
1136 goto out_err;
1139 res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1140 if (res == -1) {
1141 goto out_err;
1144 pthread_mutex_lock(&lo->mutex);
1145 inode->nlookup++;
1146 pthread_mutex_unlock(&lo->mutex);
1147 e.ino = inode->fuse_ino;
1149 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1150 name, (unsigned long long)e.ino);
1152 fuse_reply_entry(req, &e);
1153 lo_inode_put(lo, &parent_inode);
1154 lo_inode_put(lo, &inode);
1155 return;
1157 out_err:
1158 saverr = errno;
1159 lo_inode_put(lo, &parent_inode);
1160 lo_inode_put(lo, &inode);
1161 fuse_reply_err(req, saverr);
1164 /* Increments nlookup and caller must release refcount using lo_inode_put() */
1165 static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
1166 const char *name)
1168 int res;
1169 uint64_t mnt_id;
1170 struct stat attr;
1171 struct lo_data *lo = lo_data(req);
1172 struct lo_inode *dir = lo_inode(req, parent);
1174 if (!dir) {
1175 return NULL;
1178 res = do_statx(lo, dir->fd, name, &attr,
1179 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW, &mnt_id);
1180 lo_inode_put(lo, &dir);
1181 if (res == -1) {
1182 return NULL;
1185 return lo_find(lo, &attr, mnt_id);
1188 static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
1190 int res;
1191 struct lo_inode *inode;
1192 struct lo_data *lo = lo_data(req);
1194 if (!is_safe_path_component(name)) {
1195 fuse_reply_err(req, EINVAL);
1196 return;
1199 inode = lookup_name(req, parent, name);
1200 if (!inode) {
1201 fuse_reply_err(req, EIO);
1202 return;
1205 res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR);
1207 fuse_reply_err(req, res == -1 ? errno : 0);
1208 unref_inode_lolocked(lo, inode, 1);
1209 lo_inode_put(lo, &inode);
1212 static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
1213 fuse_ino_t newparent, const char *newname,
1214 unsigned int flags)
1216 int res;
1217 struct lo_inode *parent_inode;
1218 struct lo_inode *newparent_inode;
1219 struct lo_inode *oldinode = NULL;
1220 struct lo_inode *newinode = NULL;
1221 struct lo_data *lo = lo_data(req);
1223 if (!is_safe_path_component(name) || !is_safe_path_component(newname)) {
1224 fuse_reply_err(req, EINVAL);
1225 return;
1228 parent_inode = lo_inode(req, parent);
1229 newparent_inode = lo_inode(req, newparent);
1230 if (!parent_inode || !newparent_inode) {
1231 fuse_reply_err(req, EBADF);
1232 goto out;
1235 oldinode = lookup_name(req, parent, name);
1236 newinode = lookup_name(req, newparent, newname);
1238 if (!oldinode) {
1239 fuse_reply_err(req, EIO);
1240 goto out;
1243 if (flags) {
1244 #ifndef SYS_renameat2
1245 fuse_reply_err(req, EINVAL);
1246 #else
1247 res = syscall(SYS_renameat2, parent_inode->fd, name,
1248 newparent_inode->fd, newname, flags);
1249 if (res == -1 && errno == ENOSYS) {
1250 fuse_reply_err(req, EINVAL);
1251 } else {
1252 fuse_reply_err(req, res == -1 ? errno : 0);
1254 #endif
1255 goto out;
1258 res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);
1260 fuse_reply_err(req, res == -1 ? errno : 0);
1261 out:
1262 unref_inode_lolocked(lo, oldinode, 1);
1263 unref_inode_lolocked(lo, newinode, 1);
1264 lo_inode_put(lo, &oldinode);
1265 lo_inode_put(lo, &newinode);
1266 lo_inode_put(lo, &parent_inode);
1267 lo_inode_put(lo, &newparent_inode);
1270 static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
1272 int res;
1273 struct lo_inode *inode;
1274 struct lo_data *lo = lo_data(req);
1276 if (!is_safe_path_component(name)) {
1277 fuse_reply_err(req, EINVAL);
1278 return;
1281 inode = lookup_name(req, parent, name);
1282 if (!inode) {
1283 fuse_reply_err(req, EIO);
1284 return;
1287 res = unlinkat(lo_fd(req, parent), name, 0);
1289 fuse_reply_err(req, res == -1 ? errno : 0);
1290 unref_inode_lolocked(lo, inode, 1);
1291 lo_inode_put(lo, &inode);
1294 /* To be called with lo->mutex held */
1295 static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
1297 if (!inode) {
1298 return;
1301 assert(inode->nlookup >= n);
1302 inode->nlookup -= n;
1303 if (!inode->nlookup) {
1304 lo_map_remove(&lo->ino_map, inode->fuse_ino);
1305 g_hash_table_remove(lo->inodes, &inode->key);
1306 if (g_hash_table_size(inode->posix_locks)) {
1307 fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
1309 g_hash_table_destroy(inode->posix_locks);
1310 pthread_mutex_destroy(&inode->plock_mutex);
1312 /* Drop our refcount from lo_do_lookup() */
1313 lo_inode_put(lo, &inode);
1317 static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
1318 uint64_t n)
1320 if (!inode) {
1321 return;
1324 pthread_mutex_lock(&lo->mutex);
1325 unref_inode(lo, inode, n);
1326 pthread_mutex_unlock(&lo->mutex);
1329 static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1331 struct lo_data *lo = lo_data(req);
1332 struct lo_inode *inode;
1334 inode = lo_inode(req, ino);
1335 if (!inode) {
1336 return;
1339 fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n",
1340 (unsigned long long)ino, (unsigned long long)inode->nlookup,
1341 (unsigned long long)nlookup);
1343 unref_inode_lolocked(lo, inode, nlookup);
1344 lo_inode_put(lo, &inode);
1347 static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1349 lo_forget_one(req, ino, nlookup);
1350 fuse_reply_none(req);
1353 static void lo_forget_multi(fuse_req_t req, size_t count,
1354 struct fuse_forget_data *forgets)
1356 int i;
1358 for (i = 0; i < count; i++) {
1359 lo_forget_one(req, forgets[i].ino, forgets[i].nlookup);
1361 fuse_reply_none(req);
1364 static void lo_readlink(fuse_req_t req, fuse_ino_t ino)
1366 char buf[PATH_MAX + 1];
1367 int res;
1369 res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf));
1370 if (res == -1) {
1371 return (void)fuse_reply_err(req, errno);
1374 if (res == sizeof(buf)) {
1375 return (void)fuse_reply_err(req, ENAMETOOLONG);
1378 buf[res] = '\0';
1380 fuse_reply_readlink(req, buf);
1383 struct lo_dirp {
1384 gint refcount;
1385 DIR *dp;
1386 struct dirent *entry;
1387 off_t offset;
1390 static void lo_dirp_put(struct lo_dirp **dp)
1392 struct lo_dirp *d = *dp;
1394 if (!d) {
1395 return;
1397 *dp = NULL;
1399 if (g_atomic_int_dec_and_test(&d->refcount)) {
1400 closedir(d->dp);
1401 free(d);
1405 /* Call lo_dirp_put() on the return value when no longer needed */
1406 static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi)
1408 struct lo_data *lo = lo_data(req);
1409 struct lo_map_elem *elem;
1411 pthread_mutex_lock(&lo->mutex);
1412 elem = lo_map_get(&lo->dirp_map, fi->fh);
1413 if (elem) {
1414 g_atomic_int_inc(&elem->dirp->refcount);
1416 pthread_mutex_unlock(&lo->mutex);
1417 if (!elem) {
1418 return NULL;
1421 return elem->dirp;
1424 static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
1425 struct fuse_file_info *fi)
1427 int error = ENOMEM;
1428 struct lo_data *lo = lo_data(req);
1429 struct lo_dirp *d;
1430 int fd;
1431 ssize_t fh;
1433 d = calloc(1, sizeof(struct lo_dirp));
1434 if (d == NULL) {
1435 goto out_err;
1438 fd = openat(lo_fd(req, ino), ".", O_RDONLY);
1439 if (fd == -1) {
1440 goto out_errno;
1443 d->dp = fdopendir(fd);
1444 if (d->dp == NULL) {
1445 goto out_errno;
1448 d->offset = 0;
1449 d->entry = NULL;
1451 g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */
1452 pthread_mutex_lock(&lo->mutex);
1453 fh = lo_add_dirp_mapping(req, d);
1454 pthread_mutex_unlock(&lo->mutex);
1455 if (fh == -1) {
1456 goto out_err;
1459 fi->fh = fh;
1460 if (lo->cache == CACHE_ALWAYS) {
1461 fi->cache_readdir = 1;
1463 fuse_reply_open(req, fi);
1464 return;
1466 out_errno:
1467 error = errno;
1468 out_err:
1469 if (d) {
1470 if (d->dp) {
1471 closedir(d->dp);
1472 } else if (fd != -1) {
1473 close(fd);
1475 free(d);
1477 fuse_reply_err(req, error);
1480 static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
1481 off_t offset, struct fuse_file_info *fi, int plus)
1483 struct lo_data *lo = lo_data(req);
1484 struct lo_dirp *d = NULL;
1485 struct lo_inode *dinode;
1486 char *buf = NULL;
1487 char *p;
1488 size_t rem = size;
1489 int err = EBADF;
1491 dinode = lo_inode(req, ino);
1492 if (!dinode) {
1493 goto error;
1496 d = lo_dirp(req, fi);
1497 if (!d) {
1498 goto error;
1501 err = ENOMEM;
1502 buf = calloc(1, size);
1503 if (!buf) {
1504 goto error;
1506 p = buf;
1508 if (offset != d->offset) {
1509 seekdir(d->dp, offset);
1510 d->entry = NULL;
1511 d->offset = offset;
1513 while (1) {
1514 size_t entsize;
1515 off_t nextoff;
1516 const char *name;
1518 if (!d->entry) {
1519 errno = 0;
1520 d->entry = readdir(d->dp);
1521 if (!d->entry) {
1522 if (errno) { /* Error */
1523 err = errno;
1524 goto error;
1525 } else { /* End of stream */
1526 break;
1530 nextoff = d->entry->d_off;
1531 name = d->entry->d_name;
1533 fuse_ino_t entry_ino = 0;
1534 struct fuse_entry_param e = (struct fuse_entry_param){
1535 .attr.st_ino = d->entry->d_ino,
1536 .attr.st_mode = d->entry->d_type << 12,
1539 /* Hide root's parent directory */
1540 if (dinode == &lo->root && strcmp(name, "..") == 0) {
1541 e.attr.st_ino = lo->root.key.ino;
1542 e.attr.st_mode = DT_DIR << 12;
1545 if (plus) {
1546 if (!is_dot_or_dotdot(name)) {
1547 err = lo_do_lookup(req, ino, name, &e);
1548 if (err) {
1549 goto error;
1551 entry_ino = e.ino;
1554 entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff);
1555 } else {
1556 entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff);
1558 if (entsize > rem) {
1559 if (entry_ino != 0) {
1560 lo_forget_one(req, entry_ino, 1);
1562 break;
1565 p += entsize;
1566 rem -= entsize;
1568 d->entry = NULL;
1569 d->offset = nextoff;
1572 err = 0;
1573 error:
1574 lo_dirp_put(&d);
1575 lo_inode_put(lo, &dinode);
1578 * If there's an error, we can only signal it if we haven't stored
1579 * any entries yet - otherwise we'd end up with wrong lookup
1580 * counts for the entries that are already in the buffer. So we
1581 * return what we've collected until that point.
1583 if (err && rem == size) {
1584 fuse_reply_err(req, err);
1585 } else {
1586 fuse_reply_buf(req, buf, size - rem);
1588 free(buf);
1591 static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
1592 off_t offset, struct fuse_file_info *fi)
1594 lo_do_readdir(req, ino, size, offset, fi, 0);
1597 static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size,
1598 off_t offset, struct fuse_file_info *fi)
1600 lo_do_readdir(req, ino, size, offset, fi, 1);
1603 static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
1604 struct fuse_file_info *fi)
1606 struct lo_data *lo = lo_data(req);
1607 struct lo_map_elem *elem;
1608 struct lo_dirp *d;
1610 (void)ino;
1612 pthread_mutex_lock(&lo->mutex);
1613 elem = lo_map_get(&lo->dirp_map, fi->fh);
1614 if (!elem) {
1615 pthread_mutex_unlock(&lo->mutex);
1616 fuse_reply_err(req, EBADF);
1617 return;
1620 d = elem->dirp;
1621 lo_map_remove(&lo->dirp_map, fi->fh);
1622 pthread_mutex_unlock(&lo->mutex);
1624 lo_dirp_put(&d); /* paired with lo_opendir() */
1626 fuse_reply_err(req, 0);
1629 static void update_open_flags(int writeback, int allow_direct_io,
1630 struct fuse_file_info *fi)
1633 * With writeback cache, kernel may send read requests even
1634 * when userspace opened write-only
1636 if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) {
1637 fi->flags &= ~O_ACCMODE;
1638 fi->flags |= O_RDWR;
1642 * With writeback cache, O_APPEND is handled by the kernel.
1643 * This breaks atomicity (since the file may change in the
1644 * underlying filesystem, so that the kernel's idea of the
1645 * end of the file isn't accurate anymore). In this example,
1646 * we just accept that. A more rigorous filesystem may want
1647 * to return an error here
1649 if (writeback && (fi->flags & O_APPEND)) {
1650 fi->flags &= ~O_APPEND;
1654 * O_DIRECT in guest should not necessarily mean bypassing page
1655 * cache on host as well. Therefore, we discard it by default
1656 * ('-o no_allow_direct_io'). If somebody needs that behavior,
1657 * the '-o allow_direct_io' option should be set.
1659 if (!allow_direct_io) {
1660 fi->flags &= ~O_DIRECT;
1664 static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
1665 mode_t mode, struct fuse_file_info *fi)
1667 int fd;
1668 struct lo_data *lo = lo_data(req);
1669 struct lo_inode *parent_inode;
1670 struct fuse_entry_param e;
1671 int err;
1672 struct lo_cred old = {};
1674 fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", parent,
1675 name);
1677 if (!is_safe_path_component(name)) {
1678 fuse_reply_err(req, EINVAL);
1679 return;
1682 parent_inode = lo_inode(req, parent);
1683 if (!parent_inode) {
1684 fuse_reply_err(req, EBADF);
1685 return;
1688 err = lo_change_cred(req, &old);
1689 if (err) {
1690 goto out;
1693 update_open_flags(lo->writeback, lo->allow_direct_io, fi);
1695 fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW,
1696 mode);
1697 err = fd == -1 ? errno : 0;
1698 lo_restore_cred(&old);
1700 if (!err) {
1701 ssize_t fh;
1703 pthread_mutex_lock(&lo->mutex);
1704 fh = lo_add_fd_mapping(req, fd);
1705 pthread_mutex_unlock(&lo->mutex);
1706 if (fh == -1) {
1707 close(fd);
1708 err = ENOMEM;
1709 goto out;
1712 fi->fh = fh;
1713 err = lo_do_lookup(req, parent, name, &e);
1715 if (lo->cache == CACHE_NONE) {
1716 fi->direct_io = 1;
1717 } else if (lo->cache == CACHE_ALWAYS) {
1718 fi->keep_cache = 1;
1721 out:
1722 lo_inode_put(lo, &parent_inode);
1724 if (err) {
1725 fuse_reply_err(req, err);
1726 } else {
1727 fuse_reply_create(req, &e, fi);
1731 /* Should be called with inode->plock_mutex held */
1732 static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
1733 struct lo_inode *inode,
1734 uint64_t lock_owner,
1735 pid_t pid, int *err)
1737 struct lo_inode_plock *plock;
1738 char procname[64];
1739 int fd;
1741 plock =
1742 g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner));
1744 if (plock) {
1745 return plock;
1748 plock = malloc(sizeof(struct lo_inode_plock));
1749 if (!plock) {
1750 *err = ENOMEM;
1751 return NULL;
1754 /* Open another instance of file which can be used for ofd locks. */
1755 sprintf(procname, "%i", inode->fd);
1757 /* TODO: What if file is not writable? */
1758 fd = openat(lo->proc_self_fd, procname, O_RDWR);
1759 if (fd == -1) {
1760 *err = errno;
1761 free(plock);
1762 return NULL;
1765 plock->lock_owner = lock_owner;
1766 plock->fd = fd;
1767 g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner),
1768 plock);
1769 return plock;
1772 static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
1773 struct flock *lock)
1775 struct lo_data *lo = lo_data(req);
1776 struct lo_inode *inode;
1777 struct lo_inode_plock *plock;
1778 int ret, saverr = 0;
1780 fuse_log(FUSE_LOG_DEBUG,
1781 "lo_getlk(ino=%" PRIu64 ", flags=%d)"
1782 " owner=0x%lx, l_type=%d l_start=0x%lx"
1783 " l_len=0x%lx\n",
1784 ino, fi->flags, fi->lock_owner, lock->l_type, lock->l_start,
1785 lock->l_len);
1787 inode = lo_inode(req, ino);
1788 if (!inode) {
1789 fuse_reply_err(req, EBADF);
1790 return;
1793 pthread_mutex_lock(&inode->plock_mutex);
1794 plock =
1795 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
1796 if (!plock) {
1797 saverr = ret;
1798 goto out;
1801 ret = fcntl(plock->fd, F_OFD_GETLK, lock);
1802 if (ret == -1) {
1803 saverr = errno;
1806 out:
1807 pthread_mutex_unlock(&inode->plock_mutex);
1808 lo_inode_put(lo, &inode);
1810 if (saverr) {
1811 fuse_reply_err(req, saverr);
1812 } else {
1813 fuse_reply_lock(req, lock);
1817 static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
1818 struct flock *lock, int sleep)
1820 struct lo_data *lo = lo_data(req);
1821 struct lo_inode *inode;
1822 struct lo_inode_plock *plock;
1823 int ret, saverr = 0;
1825 fuse_log(FUSE_LOG_DEBUG,
1826 "lo_setlk(ino=%" PRIu64 ", flags=%d)"
1827 " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d"
1828 " l_start=0x%lx l_len=0x%lx\n",
1829 ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
1830 lock->l_whence, lock->l_start, lock->l_len);
1832 if (sleep) {
1833 fuse_reply_err(req, EOPNOTSUPP);
1834 return;
1837 inode = lo_inode(req, ino);
1838 if (!inode) {
1839 fuse_reply_err(req, EBADF);
1840 return;
1843 pthread_mutex_lock(&inode->plock_mutex);
1844 plock =
1845 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
1847 if (!plock) {
1848 saverr = ret;
1849 goto out;
1852 /* TODO: Is it alright to modify flock? */
1853 lock->l_pid = 0;
1854 ret = fcntl(plock->fd, F_OFD_SETLK, lock);
1855 if (ret == -1) {
1856 saverr = errno;
1859 out:
1860 pthread_mutex_unlock(&inode->plock_mutex);
1861 lo_inode_put(lo, &inode);
1863 fuse_reply_err(req, saverr);
1866 static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
1867 struct fuse_file_info *fi)
1869 int res;
1870 struct lo_dirp *d;
1871 int fd;
1873 (void)ino;
1875 d = lo_dirp(req, fi);
1876 if (!d) {
1877 fuse_reply_err(req, EBADF);
1878 return;
1881 fd = dirfd(d->dp);
1882 if (datasync) {
1883 res = fdatasync(fd);
1884 } else {
1885 res = fsync(fd);
1888 lo_dirp_put(&d);
1890 fuse_reply_err(req, res == -1 ? errno : 0);
1893 static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
1895 int fd;
1896 ssize_t fh;
1897 char buf[64];
1898 struct lo_data *lo = lo_data(req);
1900 fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino,
1901 fi->flags);
1903 update_open_flags(lo->writeback, lo->allow_direct_io, fi);
1905 sprintf(buf, "%i", lo_fd(req, ino));
1906 fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW);
1907 if (fd == -1) {
1908 return (void)fuse_reply_err(req, errno);
1911 pthread_mutex_lock(&lo->mutex);
1912 fh = lo_add_fd_mapping(req, fd);
1913 pthread_mutex_unlock(&lo->mutex);
1914 if (fh == -1) {
1915 close(fd);
1916 fuse_reply_err(req, ENOMEM);
1917 return;
1920 fi->fh = fh;
1921 if (lo->cache == CACHE_NONE) {
1922 fi->direct_io = 1;
1923 } else if (lo->cache == CACHE_ALWAYS) {
1924 fi->keep_cache = 1;
1926 fuse_reply_open(req, fi);
1929 static void lo_release(fuse_req_t req, fuse_ino_t ino,
1930 struct fuse_file_info *fi)
1932 struct lo_data *lo = lo_data(req);
1933 struct lo_map_elem *elem;
1934 int fd = -1;
1936 (void)ino;
1938 pthread_mutex_lock(&lo->mutex);
1939 elem = lo_map_get(&lo->fd_map, fi->fh);
1940 if (elem) {
1941 fd = elem->fd;
1942 elem = NULL;
1943 lo_map_remove(&lo->fd_map, fi->fh);
1945 pthread_mutex_unlock(&lo->mutex);
1947 close(fd);
1948 fuse_reply_err(req, 0);
1951 static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
1953 int res;
1954 (void)ino;
1955 struct lo_inode *inode;
1957 inode = lo_inode(req, ino);
1958 if (!inode) {
1959 fuse_reply_err(req, EBADF);
1960 return;
1963 /* An fd is going away. Cleanup associated posix locks */
1964 pthread_mutex_lock(&inode->plock_mutex);
1965 g_hash_table_remove(inode->posix_locks, GUINT_TO_POINTER(fi->lock_owner));
1966 pthread_mutex_unlock(&inode->plock_mutex);
1968 res = close(dup(lo_fi_fd(req, fi)));
1969 lo_inode_put(lo_data(req), &inode);
1970 fuse_reply_err(req, res == -1 ? errno : 0);
1973 static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
1974 struct fuse_file_info *fi)
1976 int res;
1977 int fd;
1978 char *buf;
1980 fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino,
1981 (void *)fi);
1983 if (!fi) {
1984 struct lo_data *lo = lo_data(req);
1986 res = asprintf(&buf, "%i", lo_fd(req, ino));
1987 if (res == -1) {
1988 return (void)fuse_reply_err(req, errno);
1991 fd = openat(lo->proc_self_fd, buf, O_RDWR);
1992 free(buf);
1993 if (fd == -1) {
1994 return (void)fuse_reply_err(req, errno);
1996 } else {
1997 fd = lo_fi_fd(req, fi);
2000 if (datasync) {
2001 res = fdatasync(fd);
2002 } else {
2003 res = fsync(fd);
2005 if (!fi) {
2006 close(fd);
2008 fuse_reply_err(req, res == -1 ? errno : 0);
2011 static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset,
2012 struct fuse_file_info *fi)
2014 struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size);
2016 fuse_log(FUSE_LOG_DEBUG,
2017 "lo_read(ino=%" PRIu64 ", size=%zd, "
2018 "off=%lu)\n",
2019 ino, size, (unsigned long)offset);
2021 buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
2022 buf.buf[0].fd = lo_fi_fd(req, fi);
2023 buf.buf[0].pos = offset;
2025 fuse_reply_data(req, &buf);
2028 static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
2029 struct fuse_bufvec *in_buf, off_t off,
2030 struct fuse_file_info *fi)
2032 (void)ino;
2033 ssize_t res;
2034 struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf));
2035 bool cap_fsetid_dropped = false;
2037 out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
2038 out_buf.buf[0].fd = lo_fi_fd(req, fi);
2039 out_buf.buf[0].pos = off;
2041 fuse_log(FUSE_LOG_DEBUG,
2042 "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino,
2043 out_buf.buf[0].size, (unsigned long)off);
2046 * If kill_priv is set, drop CAP_FSETID which should lead to kernel
2047 * clearing setuid/setgid on file.
2049 if (fi->kill_priv) {
2050 res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
2051 if (res != 0) {
2052 fuse_reply_err(req, res);
2053 return;
2057 res = fuse_buf_copy(&out_buf, in_buf);
2058 if (res < 0) {
2059 fuse_reply_err(req, -res);
2060 } else {
2061 fuse_reply_write(req, (size_t)res);
2064 if (cap_fsetid_dropped) {
2065 res = gain_effective_cap("FSETID");
2066 if (res) {
2067 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
2072 static void lo_statfs(fuse_req_t req, fuse_ino_t ino)
2074 int res;
2075 struct statvfs stbuf;
2077 res = fstatvfs(lo_fd(req, ino), &stbuf);
2078 if (res == -1) {
2079 fuse_reply_err(req, errno);
2080 } else {
2081 fuse_reply_statfs(req, &stbuf);
2085 static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
2086 off_t length, struct fuse_file_info *fi)
2088 int err = EOPNOTSUPP;
2089 (void)ino;
2091 #ifdef CONFIG_FALLOCATE
2092 err = fallocate(lo_fi_fd(req, fi), mode, offset, length);
2093 if (err < 0) {
2094 err = errno;
2097 #elif defined(CONFIG_POSIX_FALLOCATE)
2098 if (mode) {
2099 fuse_reply_err(req, EOPNOTSUPP);
2100 return;
2103 err = posix_fallocate(lo_fi_fd(req, fi), offset, length);
2104 #endif
2106 fuse_reply_err(req, err);
2109 static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
2110 int op)
2112 int res;
2113 (void)ino;
2115 res = flock(lo_fi_fd(req, fi), op);
2117 fuse_reply_err(req, res == -1 ? errno : 0);
/* types */
/*
 * Exit; process attribute unmodified if matched.
 * An empty key applies to all.
 */
#define XATTR_MAP_FLAG_OK      (1 <<  0)
/*
 * The attribute is unwanted;
 * EPERM on write, hidden on read.
 */
#define XATTR_MAP_FLAG_BAD     (1 <<  1)
/*
 * For attr that start with 'key' prepend 'prepend'
 * 'key' may be empty to prepend for all attrs
 * key is defined from set/remove point of view.
 * Automatically reversed on read
 */
#define XATTR_MAP_FLAG_PREFIX  (1 <<  2)

/* scopes */
/* Apply rule to get/set/remove */
#define XATTR_MAP_FLAG_CLIENT  (1 << 16)
/* Apply rule to list */
#define XATTR_MAP_FLAG_SERVER  (1 << 17)
/* Apply rule to all */
#define XATTR_MAP_FLAG_ALL   (XATTR_MAP_FLAG_SERVER | XATTR_MAP_FLAG_CLIENT)
2147 static void add_xattrmap_entry(struct lo_data *lo,
2148 const XattrMapEntry *new_entry)
2150 XattrMapEntry *res = g_realloc_n(lo->xattr_map_list,
2151 lo->xattr_map_nentries + 1,
2152 sizeof(XattrMapEntry));
2153 res[lo->xattr_map_nentries++] = *new_entry;
2155 lo->xattr_map_list = res;
2158 static void free_xattrmap(struct lo_data *lo)
2160 XattrMapEntry *map = lo->xattr_map_list;
2161 size_t i;
2163 if (!map) {
2164 return;
2167 for (i = 0; i < lo->xattr_map_nentries; i++) {
2168 g_free(map[i].key);
2169 g_free(map[i].prepend);
2172 g_free(map);
2173 lo->xattr_map_list = NULL;
2174 lo->xattr_map_nentries = -1;
2178 * Handle the 'map' type, which is sugar for a set of commands
2179 * for the common case of prefixing a subset or everything,
2180 * and allowing anything not prefixed through.
2181 * It must be the last entry in the stream, although there
2182 * can be other entries before it.
2183 * The form is:
2184 * :map:key:prefix:
2186 * key maybe empty in which case all entries are prefixed.
2188 static void parse_xattrmap_map(struct lo_data *lo,
2189 const char *rule, char sep)
2191 const char *tmp;
2192 char *key;
2193 char *prefix;
2194 XattrMapEntry tmp_entry;
2196 if (*rule != sep) {
2197 fuse_log(FUSE_LOG_ERR,
2198 "%s: Expecting '%c' after 'map' keyword, found '%c'\n",
2199 __func__, sep, *rule);
2200 exit(1);
2203 rule++;
2205 /* At start of 'key' field */
2206 tmp = strchr(rule, sep);
2207 if (!tmp) {
2208 fuse_log(FUSE_LOG_ERR,
2209 "%s: Missing '%c' at end of key field in map rule\n",
2210 __func__, sep);
2211 exit(1);
2214 key = g_strndup(rule, tmp - rule);
2215 rule = tmp + 1;
2217 /* At start of prefix field */
2218 tmp = strchr(rule, sep);
2219 if (!tmp) {
2220 fuse_log(FUSE_LOG_ERR,
2221 "%s: Missing '%c' at end of prefix field in map rule\n",
2222 __func__, sep);
2223 exit(1);
2226 prefix = g_strndup(rule, tmp - rule);
2227 rule = tmp + 1;
2230 * This should be the end of the string, we don't allow
2231 * any more commands after 'map'.
2233 if (*rule) {
2234 fuse_log(FUSE_LOG_ERR,
2235 "%s: Expecting end of command after map, found '%c'\n",
2236 __func__, *rule);
2237 exit(1);
2240 /* 1st: Prefix matches/everything */
2241 tmp_entry.flags = XATTR_MAP_FLAG_PREFIX | XATTR_MAP_FLAG_ALL;
2242 tmp_entry.key = g_strdup(key);
2243 tmp_entry.prepend = g_strdup(prefix);
2244 add_xattrmap_entry(lo, &tmp_entry);
2246 if (!*key) {
2247 /* Prefix all case */
2249 /* 2nd: Hide any non-prefixed entries on the host */
2250 tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_ALL;
2251 tmp_entry.key = g_strdup("");
2252 tmp_entry.prepend = g_strdup("");
2253 add_xattrmap_entry(lo, &tmp_entry);
2254 } else {
2255 /* Prefix matching case */
2257 /* 2nd: Hide non-prefixed but matching entries on the host */
2258 tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_SERVER;
2259 tmp_entry.key = g_strdup(""); /* Not used */
2260 tmp_entry.prepend = g_strdup(key);
2261 add_xattrmap_entry(lo, &tmp_entry);
2263 /* 3rd: Stop the client accessing prefixed attributes directly */
2264 tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_CLIENT;
2265 tmp_entry.key = g_strdup(prefix);
2266 tmp_entry.prepend = g_strdup(""); /* Not used */
2267 add_xattrmap_entry(lo, &tmp_entry);
2269 /* 4th: Everything else is OK */
2270 tmp_entry.flags = XATTR_MAP_FLAG_OK | XATTR_MAP_FLAG_ALL;
2271 tmp_entry.key = g_strdup("");
2272 tmp_entry.prepend = g_strdup("");
2273 add_xattrmap_entry(lo, &tmp_entry);
2276 g_free(key);
2277 g_free(prefix);
2280 static void parse_xattrmap(struct lo_data *lo)
2282 const char *map = lo->xattrmap;
2283 const char *tmp;
2285 lo->xattr_map_nentries = 0;
2286 while (*map) {
2287 XattrMapEntry tmp_entry;
2288 char sep;
2290 if (isspace(*map)) {
2291 map++;
2292 continue;
2294 /* The separator is the first non-space of the rule */
2295 sep = *map++;
2296 if (!sep) {
2297 break;
2300 tmp_entry.flags = 0;
2301 /* Start of 'type' */
2302 if (strstart(map, "prefix", &map)) {
2303 tmp_entry.flags |= XATTR_MAP_FLAG_PREFIX;
2304 } else if (strstart(map, "ok", &map)) {
2305 tmp_entry.flags |= XATTR_MAP_FLAG_OK;
2306 } else if (strstart(map, "bad", &map)) {
2307 tmp_entry.flags |= XATTR_MAP_FLAG_BAD;
2308 } else if (strstart(map, "map", &map)) {
2310 * map is sugar that adds a number of rules, and must be
2311 * the last entry.
2313 parse_xattrmap_map(lo, map, sep);
2314 return;
2315 } else {
2316 fuse_log(FUSE_LOG_ERR,
2317 "%s: Unexpected type;"
2318 "Expecting 'prefix', 'ok', 'bad' or 'map' in rule %zu\n",
2319 __func__, lo->xattr_map_nentries);
2320 exit(1);
2323 if (*map++ != sep) {
2324 fuse_log(FUSE_LOG_ERR,
2325 "%s: Missing '%c' at end of type field of rule %zu\n",
2326 __func__, sep, lo->xattr_map_nentries);
2327 exit(1);
2330 /* Start of 'scope' */
2331 if (strstart(map, "client", &map)) {
2332 tmp_entry.flags |= XATTR_MAP_FLAG_CLIENT;
2333 } else if (strstart(map, "server", &map)) {
2334 tmp_entry.flags |= XATTR_MAP_FLAG_SERVER;
2335 } else if (strstart(map, "all", &map)) {
2336 tmp_entry.flags |= XATTR_MAP_FLAG_ALL;
2337 } else {
2338 fuse_log(FUSE_LOG_ERR,
2339 "%s: Unexpected scope;"
2340 " Expecting 'client', 'server', or 'all', in rule %zu\n",
2341 __func__, lo->xattr_map_nentries);
2342 exit(1);
2345 if (*map++ != sep) {
2346 fuse_log(FUSE_LOG_ERR,
2347 "%s: Expecting '%c' found '%c'"
2348 " after scope in rule %zu\n",
2349 __func__, sep, *map, lo->xattr_map_nentries);
2350 exit(1);
2353 /* At start of 'key' field */
2354 tmp = strchr(map, sep);
2355 if (!tmp) {
2356 fuse_log(FUSE_LOG_ERR,
2357 "%s: Missing '%c' at end of key field of rule %zu",
2358 __func__, sep, lo->xattr_map_nentries);
2359 exit(1);
2361 tmp_entry.key = g_strndup(map, tmp - map);
2362 map = tmp + 1;
2364 /* At start of 'prepend' field */
2365 tmp = strchr(map, sep);
2366 if (!tmp) {
2367 fuse_log(FUSE_LOG_ERR,
2368 "%s: Missing '%c' at end of prepend field of rule %zu",
2369 __func__, sep, lo->xattr_map_nentries);
2370 exit(1);
2372 tmp_entry.prepend = g_strndup(map, tmp - map);
2373 map = tmp + 1;
2375 add_xattrmap_entry(lo, &tmp_entry);
2376 /* End of rule - go around again for another rule */
2379 if (!lo->xattr_map_nentries) {
2380 fuse_log(FUSE_LOG_ERR, "Empty xattr map\n");
2381 exit(1);
2386 * For use with getxattr/setxattr/removexattr, where the client
2387 * gives us a name and we may need to choose a different one.
2388 * Allocates a buffer for the result placing it in *out_name.
2389 * If there's no change then *out_name is not set.
2390 * Returns 0 on success
2391 * Can return -EPERM to indicate we block a given attribute
2392 * (in which case out_name is not allocated)
2393 * Can return -ENOMEM to indicate out_name couldn't be allocated.
2395 static int xattr_map_client(const struct lo_data *lo, const char *client_name,
2396 char **out_name)
2398 size_t i;
2399 for (i = 0; i < lo->xattr_map_nentries; i++) {
2400 const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
2402 if ((cur_entry->flags & XATTR_MAP_FLAG_CLIENT) &&
2403 (strstart(client_name, cur_entry->key, NULL))) {
2404 if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
2405 return -EPERM;
2407 if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
2408 /* Unmodified name */
2409 return 0;
2411 if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
2412 *out_name = g_try_malloc(strlen(client_name) +
2413 strlen(cur_entry->prepend) + 1);
2414 if (!*out_name) {
2415 return -ENOMEM;
2417 sprintf(*out_name, "%s%s", cur_entry->prepend, client_name);
2418 return 0;
2423 return -EPERM;
2427 * For use with listxattr where the server fs gives us a name and we may need
2428 * to sanitize this for the client.
2429 * Returns a pointer to the result in *out_name
2430 * This is always the original string or the current string with some prefix
2431 * removed; no reallocation is done.
2432 * Returns 0 on success
2433 * Can return -ENODATA to indicate the name should be dropped from the list.
2435 static int xattr_map_server(const struct lo_data *lo, const char *server_name,
2436 const char **out_name)
2438 size_t i;
2439 const char *end;
2441 for (i = 0; i < lo->xattr_map_nentries; i++) {
2442 const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
2444 if ((cur_entry->flags & XATTR_MAP_FLAG_SERVER) &&
2445 (strstart(server_name, cur_entry->prepend, &end))) {
2446 if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
2447 return -ENODATA;
2449 if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
2450 *out_name = server_name;
2451 return 0;
2453 if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
2454 /* Remove prefix */
2455 *out_name = end;
2456 return 0;
2461 return -ENODATA;
2464 static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
2465 size_t size)
2467 struct lo_data *lo = lo_data(req);
2468 char *value = NULL;
2469 char procname[64];
2470 const char *name;
2471 char *mapped_name;
2472 struct lo_inode *inode;
2473 ssize_t ret;
2474 int saverr;
2475 int fd = -1;
2477 mapped_name = NULL;
2478 name = in_name;
2479 if (lo->xattrmap) {
2480 ret = xattr_map_client(lo, in_name, &mapped_name);
2481 if (ret < 0) {
2482 if (ret == -EPERM) {
2483 ret = -ENODATA;
2485 fuse_reply_err(req, -ret);
2486 return;
2488 if (mapped_name) {
2489 name = mapped_name;
2493 inode = lo_inode(req, ino);
2494 if (!inode) {
2495 fuse_reply_err(req, EBADF);
2496 g_free(mapped_name);
2497 return;
2500 saverr = ENOSYS;
2501 if (!lo_data(req)->xattr) {
2502 goto out;
2505 fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n",
2506 ino, name, size);
2508 if (size) {
2509 value = malloc(size);
2510 if (!value) {
2511 goto out_err;
2515 sprintf(procname, "%i", inode->fd);
2517 * It is not safe to open() non-regular/non-dir files in file server
2518 * unless O_PATH is used, so use that method for regular files/dir
2519 * only (as it seems giving less performance overhead).
2520 * Otherwise, call fchdir() to avoid open().
2522 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2523 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2524 if (fd < 0) {
2525 goto out_err;
2527 ret = fgetxattr(fd, name, value, size);
2528 } else {
2529 /* fchdir should not fail here */
2530 assert(fchdir(lo->proc_self_fd) == 0);
2531 ret = getxattr(procname, name, value, size);
2532 assert(fchdir(lo->root.fd) == 0);
2535 if (ret == -1) {
2536 goto out_err;
2538 if (size) {
2539 saverr = 0;
2540 if (ret == 0) {
2541 goto out;
2543 fuse_reply_buf(req, value, ret);
2544 } else {
2545 fuse_reply_xattr(req, ret);
2547 out_free:
2548 free(value);
2550 if (fd >= 0) {
2551 close(fd);
2554 lo_inode_put(lo, &inode);
2555 return;
2557 out_err:
2558 saverr = errno;
2559 out:
2560 fuse_reply_err(req, saverr);
2561 g_free(mapped_name);
2562 goto out_free;
2565 static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
2567 struct lo_data *lo = lo_data(req);
2568 char *value = NULL;
2569 char procname[64];
2570 struct lo_inode *inode;
2571 ssize_t ret;
2572 int saverr;
2573 int fd = -1;
2575 inode = lo_inode(req, ino);
2576 if (!inode) {
2577 fuse_reply_err(req, EBADF);
2578 return;
2581 saverr = ENOSYS;
2582 if (!lo_data(req)->xattr) {
2583 goto out;
2586 fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino,
2587 size);
2589 if (size) {
2590 value = malloc(size);
2591 if (!value) {
2592 goto out_err;
2596 sprintf(procname, "%i", inode->fd);
2597 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2598 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2599 if (fd < 0) {
2600 goto out_err;
2602 ret = flistxattr(fd, value, size);
2603 } else {
2604 /* fchdir should not fail here */
2605 assert(fchdir(lo->proc_self_fd) == 0);
2606 ret = listxattr(procname, value, size);
2607 assert(fchdir(lo->root.fd) == 0);
2610 if (ret == -1) {
2611 goto out_err;
2613 if (size) {
2614 saverr = 0;
2615 if (ret == 0) {
2616 goto out;
2619 if (lo->xattr_map_list) {
2621 * Map the names back, some attributes might be dropped,
2622 * some shortened, but not increased, so we shouldn't
2623 * run out of room.
2625 size_t out_index, in_index;
2626 out_index = 0;
2627 in_index = 0;
2628 while (in_index < ret) {
2629 const char *map_out;
2630 char *in_ptr = value + in_index;
2631 /* Length of current attribute name */
2632 size_t in_len = strlen(value + in_index) + 1;
2634 int mapret = xattr_map_server(lo, in_ptr, &map_out);
2635 if (mapret != -ENODATA && mapret != 0) {
2636 /* Shouldn't happen */
2637 saverr = -mapret;
2638 goto out;
2640 if (mapret == 0) {
2641 /* Either unchanged, or truncated */
2642 size_t out_len;
2643 if (map_out != in_ptr) {
2644 /* +1 copies the NIL */
2645 out_len = strlen(map_out) + 1;
2646 } else {
2647 /* No change */
2648 out_len = in_len;
2651 * Move result along, may still be needed for an unchanged
2652 * entry if a previous entry was changed.
2654 memmove(value + out_index, map_out, out_len);
2656 out_index += out_len;
2658 in_index += in_len;
2660 ret = out_index;
2661 if (ret == 0) {
2662 goto out;
2665 fuse_reply_buf(req, value, ret);
2666 } else {
2668 * xattrmap only ever shortens the result,
2669 * so we don't need to do anything clever with the
2670 * allocation length here.
2672 fuse_reply_xattr(req, ret);
2674 out_free:
2675 free(value);
2677 if (fd >= 0) {
2678 close(fd);
2681 lo_inode_put(lo, &inode);
2682 return;
2684 out_err:
2685 saverr = errno;
2686 out:
2687 fuse_reply_err(req, saverr);
2688 goto out_free;
2691 static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
2692 const char *value, size_t size, int flags)
2694 char procname[64];
2695 const char *name;
2696 char *mapped_name;
2697 struct lo_data *lo = lo_data(req);
2698 struct lo_inode *inode;
2699 ssize_t ret;
2700 int saverr;
2701 int fd = -1;
2703 mapped_name = NULL;
2704 name = in_name;
2705 if (lo->xattrmap) {
2706 ret = xattr_map_client(lo, in_name, &mapped_name);
2707 if (ret < 0) {
2708 fuse_reply_err(req, -ret);
2709 return;
2711 if (mapped_name) {
2712 name = mapped_name;
2716 inode = lo_inode(req, ino);
2717 if (!inode) {
2718 fuse_reply_err(req, EBADF);
2719 g_free(mapped_name);
2720 return;
2723 saverr = ENOSYS;
2724 if (!lo_data(req)->xattr) {
2725 goto out;
2728 fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64
2729 ", name=%s value=%s size=%zd)\n", ino, name, value, size);
2731 sprintf(procname, "%i", inode->fd);
2732 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2733 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2734 if (fd < 0) {
2735 saverr = errno;
2736 goto out;
2738 ret = fsetxattr(fd, name, value, size, flags);
2739 } else {
2740 /* fchdir should not fail here */
2741 assert(fchdir(lo->proc_self_fd) == 0);
2742 ret = setxattr(procname, name, value, size, flags);
2743 assert(fchdir(lo->root.fd) == 0);
2746 saverr = ret == -1 ? errno : 0;
2748 out:
2749 if (fd >= 0) {
2750 close(fd);
2753 lo_inode_put(lo, &inode);
2754 g_free(mapped_name);
2755 fuse_reply_err(req, saverr);
2758 static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name)
2760 char procname[64];
2761 const char *name;
2762 char *mapped_name;
2763 struct lo_data *lo = lo_data(req);
2764 struct lo_inode *inode;
2765 ssize_t ret;
2766 int saverr;
2767 int fd = -1;
2769 mapped_name = NULL;
2770 name = in_name;
2771 if (lo->xattrmap) {
2772 ret = xattr_map_client(lo, in_name, &mapped_name);
2773 if (ret < 0) {
2774 fuse_reply_err(req, -ret);
2775 return;
2777 if (mapped_name) {
2778 name = mapped_name;
2782 inode = lo_inode(req, ino);
2783 if (!inode) {
2784 fuse_reply_err(req, EBADF);
2785 g_free(mapped_name);
2786 return;
2789 saverr = ENOSYS;
2790 if (!lo_data(req)->xattr) {
2791 goto out;
2794 fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino,
2795 name);
2797 sprintf(procname, "%i", inode->fd);
2798 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2799 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2800 if (fd < 0) {
2801 saverr = errno;
2802 goto out;
2804 ret = fremovexattr(fd, name);
2805 } else {
2806 /* fchdir should not fail here */
2807 assert(fchdir(lo->proc_self_fd) == 0);
2808 ret = removexattr(procname, name);
2809 assert(fchdir(lo->root.fd) == 0);
2812 saverr = ret == -1 ? errno : 0;
2814 out:
2815 if (fd >= 0) {
2816 close(fd);
2819 lo_inode_put(lo, &inode);
2820 g_free(mapped_name);
2821 fuse_reply_err(req, saverr);
#ifdef HAVE_COPY_FILE_RANGE
/*
 * FUSE copy_file_range handler.
 *
 * Resolves both file handles with lo_fi_fd() and forwards the request to
 * copy_file_range(2).  Replies with the number of bytes copied on success
 * or with errno on failure.
 */
static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
                               struct fuse_file_info *fi_in, fuse_ino_t ino_out,
                               off_t off_out, struct fuse_file_info *fi_out,
                               size_t len, int flags)
{
    int in_fd, out_fd;
    ssize_t res;

    in_fd = lo_fi_fd(req, fi_in);
    out_fd = lo_fi_fd(req, fi_out);

    /*
     * off_t is signed and its width depends on the ABI, so print it through
     * intmax_t; size_t needs %zu and %x expects an unsigned argument.
     */
    fuse_log(FUSE_LOG_DEBUG,
             "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, "
             "off=%jd, ino=%" PRIu64 "/fd=%d, "
             "off=%jd, size=%zu, flags=0x%x)\n",
             ino_in, in_fd, (intmax_t)off_in, ino_out, out_fd,
             (intmax_t)off_out, len, (unsigned int)flags);

    res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags);
    if (res < 0) {
        fuse_reply_err(req, errno);
    } else {
        fuse_reply_write(req, res);
    }
}
#endif
2851 static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
2852 struct fuse_file_info *fi)
2854 off_t res;
2856 (void)ino;
2857 res = lseek(lo_fi_fd(req, fi), off, whence);
2858 if (res != -1) {
2859 fuse_reply_lseek(req, res);
2860 } else {
2861 fuse_reply_err(req, errno);
2865 static void lo_destroy(void *userdata)
2867 struct lo_data *lo = (struct lo_data *)userdata;
2869 pthread_mutex_lock(&lo->mutex);
2870 while (true) {
2871 GHashTableIter iter;
2872 gpointer key, value;
2874 g_hash_table_iter_init(&iter, lo->inodes);
2875 if (!g_hash_table_iter_next(&iter, &key, &value)) {
2876 break;
2879 struct lo_inode *inode = value;
2880 unref_inode(lo, inode, inode->nlookup);
2882 pthread_mutex_unlock(&lo->mutex);
2885 static struct fuse_lowlevel_ops lo_oper = {
2886 .init = lo_init,
2887 .lookup = lo_lookup,
2888 .mkdir = lo_mkdir,
2889 .mknod = lo_mknod,
2890 .symlink = lo_symlink,
2891 .link = lo_link,
2892 .unlink = lo_unlink,
2893 .rmdir = lo_rmdir,
2894 .rename = lo_rename,
2895 .forget = lo_forget,
2896 .forget_multi = lo_forget_multi,
2897 .getattr = lo_getattr,
2898 .setattr = lo_setattr,
2899 .readlink = lo_readlink,
2900 .opendir = lo_opendir,
2901 .readdir = lo_readdir,
2902 .readdirplus = lo_readdirplus,
2903 .releasedir = lo_releasedir,
2904 .fsyncdir = lo_fsyncdir,
2905 .create = lo_create,
2906 .getlk = lo_getlk,
2907 .setlk = lo_setlk,
2908 .open = lo_open,
2909 .release = lo_release,
2910 .flush = lo_flush,
2911 .fsync = lo_fsync,
2912 .read = lo_read,
2913 .write_buf = lo_write_buf,
2914 .statfs = lo_statfs,
2915 .fallocate = lo_fallocate,
2916 .flock = lo_flock,
2917 .getxattr = lo_getxattr,
2918 .listxattr = lo_listxattr,
2919 .setxattr = lo_setxattr,
2920 .removexattr = lo_removexattr,
2921 #ifdef HAVE_COPY_FILE_RANGE
2922 .copy_file_range = lo_copy_file_range,
2923 #endif
2924 .lseek = lo_lseek,
2925 .destroy = lo_destroy,
/*
 * Write the vhost-user.json backend program capabilities, a small JSON
 * object announcing the backend type "fs", to standard output.
 */
static void print_capabilities(void)
{
    fputs("{\n"
          " \"type\": \"fs\"\n"
          "}\n",
          stdout);
}
2937 * Drop all Linux capabilities because the wait parent process only needs to
2938 * sit in waitpid(2) and terminate.
2940 static void setup_wait_parent_capabilities(void)
2942 capng_setpid(syscall(SYS_gettid));
2943 capng_clear(CAPNG_SELECT_BOTH);
2944 capng_apply(CAPNG_SELECT_BOTH);
2948 * Move to a new mount, net, and pid namespaces to isolate this process.
2950 static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
2952 pid_t child;
2955 * Create a new pid namespace for *child* processes. We'll have to
2956 * fork in order to enter the new pid namespace. A new mount namespace
2957 * is also needed so that we can remount /proc for the new pid
2958 * namespace.
2960 * Our UNIX domain sockets have been created. Now we can move to
2961 * an empty network namespace to prevent TCP/IP and other network
2962 * activity in case this process is compromised.
2964 if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) {
2965 fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
2966 exit(1);
2969 child = fork();
2970 if (child < 0) {
2971 fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
2972 exit(1);
2974 if (child > 0) {
2975 pid_t waited;
2976 int wstatus;
2978 setup_wait_parent_capabilities();
2980 /* The parent waits for the child */
2981 do {
2982 waited = waitpid(child, &wstatus, 0);
2983 } while (waited < 0 && errno == EINTR && !se->exited);
2985 /* We were terminated by a signal, see fuse_signals.c */
2986 if (se->exited) {
2987 exit(0);
2990 if (WIFEXITED(wstatus)) {
2991 exit(WEXITSTATUS(wstatus));
2994 exit(1);
2997 /* Send us SIGTERM when the parent thread terminates, see prctl(2) */
2998 prctl(PR_SET_PDEATHSIG, SIGTERM);
3001 * If the mounts have shared propagation then we want to opt out so our
3002 * mount changes don't affect the parent mount namespace.
3004 if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) {
3005 fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
3006 exit(1);
3009 /* The child must remount /proc to use the new pid namespace */
3010 if (mount("proc", "/proc", "proc",
3011 MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
3012 fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
3013 exit(1);
3017 * We only need /proc/self/fd. Prevent ".." from accessing parent
3018 * directories of /proc/self/fd by bind-mounting it over /proc. Since / was
3019 * previously remounted with MS_REC | MS_SLAVE this mount change only
3020 * affects our process.
3022 if (mount("/proc/self/fd", "/proc", NULL, MS_BIND, NULL) < 0) {
3023 fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, MS_BIND): %m\n");
3024 exit(1);
3027 /* Get the /proc (actually /proc/self/fd, see above) file descriptor */
3028 lo->proc_self_fd = open("/proc", O_PATH);
3029 if (lo->proc_self_fd == -1) {
3030 fuse_log(FUSE_LOG_ERR, "open(/proc, O_PATH): %m\n");
3031 exit(1);
3036 * Capture the capability state, we'll need to restore this for individual
3037 * threads later; see load_capng.
3039 static void setup_capng(void)
3041 /* Note this accesses /proc so has to happen before the sandbox */
3042 if (capng_get_caps_process()) {
3043 fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n");
3044 exit(1);
3046 pthread_mutex_init(&cap.mutex, NULL);
3047 pthread_mutex_lock(&cap.mutex);
3048 cap.saved = capng_save_state();
3049 if (!cap.saved) {
3050 fuse_log(FUSE_LOG_ERR, "capng_save_state\n");
3051 exit(1);
3053 pthread_mutex_unlock(&cap.mutex);
3056 static void cleanup_capng(void)
3058 free(cap.saved);
3059 cap.saved = NULL;
3060 pthread_mutex_destroy(&cap.mutex);
3065 * Make the source directory our root so symlinks cannot escape and no other
3066 * files are accessible. Assumes unshare(CLONE_NEWNS) was already called.
3068 static void setup_mounts(const char *source)
3070 int oldroot;
3071 int newroot;
3073 if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) {
3074 fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
3075 exit(1);
3078 /* This magic is based on lxc's lxc_pivot_root() */
3079 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
3080 if (oldroot < 0) {
3081 fuse_log(FUSE_LOG_ERR, "open(/): %m\n");
3082 exit(1);
3085 newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
3086 if (newroot < 0) {
3087 fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source);
3088 exit(1);
3091 if (fchdir(newroot) < 0) {
3092 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
3093 exit(1);
3096 if (syscall(__NR_pivot_root, ".", ".") < 0) {
3097 fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n");
3098 exit(1);
3101 if (fchdir(oldroot) < 0) {
3102 fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n");
3103 exit(1);
3106 if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) {
3107 fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n");
3108 exit(1);
3111 if (umount2(".", MNT_DETACH) < 0) {
3112 fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n");
3113 exit(1);
3116 if (fchdir(newroot) < 0) {
3117 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
3118 exit(1);
3121 close(newroot);
3122 close(oldroot);
3126 * Only keep whitelisted capabilities that are needed for file system operation
3127 * The (possibly NULL) modcaps_in string passed in is free'd before exit.
3129 static void setup_capabilities(char *modcaps_in)
3131 char *modcaps = modcaps_in;
3132 pthread_mutex_lock(&cap.mutex);
3133 capng_restore_state(&cap.saved);
3136 * Whitelist file system-related capabilities that are needed for a file
3137 * server to act like root. Drop everything else like networking and
3138 * sysadmin capabilities.
3140 * Exclusions:
3141 * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
3142 * and we don't support that.
3143 * 2. CAP_MAC_OVERRIDE is not included because it only seems to be
3144 * used by the Smack LSM. Omit it until there is demand for it.
3146 capng_setpid(syscall(SYS_gettid));
3147 capng_clear(CAPNG_SELECT_BOTH);
3148 if (capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE,
3149 CAP_CHOWN,
3150 CAP_DAC_OVERRIDE,
3151 CAP_FOWNER,
3152 CAP_FSETID,
3153 CAP_SETGID,
3154 CAP_SETUID,
3155 CAP_MKNOD,
3156 CAP_SETFCAP,
3157 -1)) {
3158 fuse_log(FUSE_LOG_ERR, "%s: capng_updatev failed\n", __func__);
3159 exit(1);
3163 * The modcaps option is a colon separated list of caps,
3164 * each preceded by either + or -.
3166 while (modcaps) {
3167 capng_act_t action;
3168 int cap;
3170 char *next = strchr(modcaps, ':');
3171 if (next) {
3172 *next = '\0';
3173 next++;
3176 switch (modcaps[0]) {
3177 case '+':
3178 action = CAPNG_ADD;
3179 break;
3181 case '-':
3182 action = CAPNG_DROP;
3183 break;
3185 default:
3186 fuse_log(FUSE_LOG_ERR,
3187 "%s: Expecting '+'/'-' in modcaps but found '%c'\n",
3188 __func__, modcaps[0]);
3189 exit(1);
3191 cap = capng_name_to_capability(modcaps + 1);
3192 if (cap < 0) {
3193 fuse_log(FUSE_LOG_ERR, "%s: Unknown capability '%s'\n", __func__,
3194 modcaps);
3195 exit(1);
3197 if (capng_update(action, CAPNG_PERMITTED | CAPNG_EFFECTIVE, cap)) {
3198 fuse_log(FUSE_LOG_ERR, "%s: capng_update failed for '%s'\n",
3199 __func__, modcaps);
3200 exit(1);
3203 modcaps = next;
3205 g_free(modcaps_in);
3207 if (capng_apply(CAPNG_SELECT_BOTH)) {
3208 fuse_log(FUSE_LOG_ERR, "%s: capng_apply failed\n", __func__);
3209 exit(1);
3212 cap.saved = capng_save_state();
3213 if (!cap.saved) {
3214 fuse_log(FUSE_LOG_ERR, "%s: capng_save_state failed\n", __func__);
3215 exit(1);
3217 pthread_mutex_unlock(&cap.mutex);
3221 * Use chroot as a weaker sandbox for environments where the process is
3222 * launched without CAP_SYS_ADMIN.
3224 static void setup_chroot(struct lo_data *lo)
3226 lo->proc_self_fd = open("/proc/self/fd", O_PATH);
3227 if (lo->proc_self_fd == -1) {
3228 fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/fd\", O_PATH): %m\n");
3229 exit(1);
3233 * Make the shared directory the file system root so that FUSE_OPEN
3234 * (lo_open()) cannot escape the shared directory by opening a symlink.
3236 * The chroot(2) syscall is later disabled by seccomp and the
3237 * CAP_SYS_CHROOT capability is dropped so that tampering with the chroot
3238 * is not possible.
3240 * However, it's still possible to escape the chroot via lo->proc_self_fd
3241 * but that requires first gaining control of the process.
3243 if (chroot(lo->source) != 0) {
3244 fuse_log(FUSE_LOG_ERR, "chroot(\"%s\"): %m\n", lo->source);
3245 exit(1);
3248 /* Move into the chroot */
3249 if (chdir("/") != 0) {
3250 fuse_log(FUSE_LOG_ERR, "chdir(\"/\"): %m\n");
3251 exit(1);
3256 * Lock down this process to prevent access to other processes or files outside
3257 * source directory. This reduces the impact of arbitrary code execution bugs.
3259 static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
3260 bool enable_syslog)
3262 if (lo->sandbox == SANDBOX_NAMESPACE) {
3263 setup_namespaces(lo, se);
3264 setup_mounts(lo->source);
3265 } else {
3266 setup_chroot(lo);
3269 setup_seccomp(enable_syslog);
3270 setup_capabilities(g_strdup(lo->modcaps));
3273 /* Set the maximum number of open file descriptors */
3274 static void setup_nofile_rlimit(unsigned long rlimit_nofile)
3276 struct rlimit rlim = {
3277 .rlim_cur = rlimit_nofile,
3278 .rlim_max = rlimit_nofile,
3281 if (rlimit_nofile == 0) {
3282 return; /* nothing to do */
3285 if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
3286 /* Ignore SELinux denials */
3287 if (errno == EPERM) {
3288 return;
3291 fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n");
3292 exit(1);
3296 static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
3298 g_autofree char *localfmt = NULL;
3300 if (current_log_level < level) {
3301 return;
3304 if (current_log_level == FUSE_LOG_DEBUG) {
3305 if (!use_syslog) {
3306 localfmt = g_strdup_printf("[%" PRId64 "] [ID: %08ld] %s",
3307 get_clock(), syscall(__NR_gettid), fmt);
3308 } else {
3309 localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid),
3310 fmt);
3312 fmt = localfmt;
3315 if (use_syslog) {
3316 int priority = LOG_ERR;
3317 switch (level) {
3318 case FUSE_LOG_EMERG:
3319 priority = LOG_EMERG;
3320 break;
3321 case FUSE_LOG_ALERT:
3322 priority = LOG_ALERT;
3323 break;
3324 case FUSE_LOG_CRIT:
3325 priority = LOG_CRIT;
3326 break;
3327 case FUSE_LOG_ERR:
3328 priority = LOG_ERR;
3329 break;
3330 case FUSE_LOG_WARNING:
3331 priority = LOG_WARNING;
3332 break;
3333 case FUSE_LOG_NOTICE:
3334 priority = LOG_NOTICE;
3335 break;
3336 case FUSE_LOG_INFO:
3337 priority = LOG_INFO;
3338 break;
3339 case FUSE_LOG_DEBUG:
3340 priority = LOG_DEBUG;
3341 break;
3343 vsyslog(priority, fmt, ap);
3344 } else {
3345 vfprintf(stderr, fmt, ap);
3349 static void setup_root(struct lo_data *lo, struct lo_inode *root)
3351 int fd, res;
3352 struct stat stat;
3353 uint64_t mnt_id;
3355 fd = open("/", O_PATH);
3356 if (fd == -1) {
3357 fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source);
3358 exit(1);
3361 res = do_statx(lo, fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
3362 &mnt_id);
3363 if (res == -1) {
3364 fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source);
3365 exit(1);
3368 root->filetype = S_IFDIR;
3369 root->fd = fd;
3370 root->key.ino = stat.st_ino;
3371 root->key.dev = stat.st_dev;
3372 root->key.mnt_id = mnt_id;
3373 root->nlookup = 2;
3374 g_atomic_int_set(&root->refcount, 2);
3377 static guint lo_key_hash(gconstpointer key)
3379 const struct lo_key *lkey = key;
3381 return (guint)lkey->ino + (guint)lkey->dev + (guint)lkey->mnt_id;
3384 static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
3386 const struct lo_key *la = a;
3387 const struct lo_key *lb = b;
3389 return la->ino == lb->ino && la->dev == lb->dev && la->mnt_id == lb->mnt_id;
3392 static void fuse_lo_data_cleanup(struct lo_data *lo)
3394 if (lo->inodes) {
3395 g_hash_table_destroy(lo->inodes);
3397 lo_map_destroy(&lo->fd_map);
3398 lo_map_destroy(&lo->dirp_map);
3399 lo_map_destroy(&lo->ino_map);
3401 if (lo->proc_self_fd >= 0) {
3402 close(lo->proc_self_fd);
3405 if (lo->root.fd >= 0) {
3406 close(lo->root.fd);
3409 free(lo->xattrmap);
3410 free_xattrmap(lo);
3411 free(lo->source);
3414 int main(int argc, char *argv[])
3416 struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
3417 struct fuse_session *se;
3418 struct fuse_cmdline_opts opts;
3419 struct lo_data lo = {
3420 .sandbox = SANDBOX_NAMESPACE,
3421 .debug = 0,
3422 .writeback = 0,
3423 .posix_lock = 0,
3424 .allow_direct_io = 0,
3425 .proc_self_fd = -1,
3427 struct lo_map_elem *root_elem;
3428 struct lo_map_elem *reserve_elem;
3429 int ret = -1;
3431 /* Don't mask creation mode, kernel already did that */
3432 umask(0);
3434 qemu_init_exec_dir(argv[0]);
3436 pthread_mutex_init(&lo.mutex, NULL);
3437 lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
3438 lo.root.fd = -1;
3439 lo.root.fuse_ino = FUSE_ROOT_ID;
3440 lo.cache = CACHE_AUTO;
3443 * Set up the ino map like this:
3444 * [0] Reserved (will not be used)
3445 * [1] Root inode
3447 lo_map_init(&lo.ino_map);
3448 reserve_elem = lo_map_reserve(&lo.ino_map, 0);
3449 if (!reserve_elem) {
3450 fuse_log(FUSE_LOG_ERR, "failed to alloc reserve_elem.\n");
3451 goto err_out1;
3453 reserve_elem->in_use = false;
3454 root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino);
3455 if (!root_elem) {
3456 fuse_log(FUSE_LOG_ERR, "failed to alloc root_elem.\n");
3457 goto err_out1;
3459 root_elem->inode = &lo.root;
3461 lo_map_init(&lo.dirp_map);
3462 lo_map_init(&lo.fd_map);
3464 if (fuse_parse_cmdline(&args, &opts) != 0) {
3465 goto err_out1;
3467 fuse_set_log_func(log_func);
3468 use_syslog = opts.syslog;
3469 if (use_syslog) {
3470 openlog("virtiofsd", LOG_PID, LOG_DAEMON);
3473 if (opts.show_help) {
3474 printf("usage: %s [options]\n\n", argv[0]);
3475 fuse_cmdline_help();
3476 printf(" -o source=PATH shared directory tree\n");
3477 fuse_lowlevel_help();
3478 ret = 0;
3479 goto err_out1;
3480 } else if (opts.show_version) {
3481 fuse_lowlevel_version();
3482 ret = 0;
3483 goto err_out1;
3484 } else if (opts.print_capabilities) {
3485 print_capabilities();
3486 ret = 0;
3487 goto err_out1;
3490 if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) {
3491 goto err_out1;
3494 if (opts.log_level != 0) {
3495 current_log_level = opts.log_level;
3496 } else {
3497 /* default log level is INFO */
3498 current_log_level = FUSE_LOG_INFO;
3500 lo.debug = opts.debug;
3501 if (lo.debug) {
3502 current_log_level = FUSE_LOG_DEBUG;
3504 if (lo.source) {
3505 struct stat stat;
3506 int res;
3508 res = lstat(lo.source, &stat);
3509 if (res == -1) {
3510 fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n",
3511 lo.source);
3512 exit(1);
3514 if (!S_ISDIR(stat.st_mode)) {
3515 fuse_log(FUSE_LOG_ERR, "source is not a directory\n");
3516 exit(1);
3518 } else {
3519 lo.source = strdup("/");
3520 if (!lo.source) {
3521 fuse_log(FUSE_LOG_ERR, "failed to strdup source\n");
3522 goto err_out1;
3526 if (lo.xattrmap) {
3527 parse_xattrmap(&lo);
3530 if (!lo.timeout_set) {
3531 switch (lo.cache) {
3532 case CACHE_NONE:
3533 lo.timeout = 0.0;
3534 break;
3536 case CACHE_AUTO:
3537 lo.timeout = 1.0;
3538 break;
3540 case CACHE_ALWAYS:
3541 lo.timeout = 86400.0;
3542 break;
3544 } else if (lo.timeout < 0) {
3545 fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout);
3546 exit(1);
3549 lo.use_statx = true;
3551 se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
3552 if (se == NULL) {
3553 goto err_out1;
3556 if (fuse_set_signal_handlers(se) != 0) {
3557 goto err_out2;
3560 if (fuse_session_mount(se) != 0) {
3561 goto err_out3;
3564 fuse_daemonize(opts.foreground);
3566 setup_nofile_rlimit(opts.rlimit_nofile);
3568 /* Must be before sandbox since it wants /proc */
3569 setup_capng();
3571 setup_sandbox(&lo, se, opts.syslog);
3573 setup_root(&lo, &lo.root);
3574 /* Block until ctrl+c or fusermount -u */
3575 ret = virtio_loop(se);
3577 fuse_session_unmount(se);
3578 cleanup_capng();
3579 err_out3:
3580 fuse_remove_signal_handlers(se);
3581 err_out2:
3582 fuse_session_destroy(se);
3583 err_out1:
3584 fuse_opt_free_args(&args);
3586 fuse_lo_data_cleanup(&lo);
3588 return ret ? 1 : 0;