include/qemu/osdep.h: Move system includes to top
[qemu/ar7.git] / tools / virtiofsd / passthrough_ll.c
blob1553d2ef454f55a3103b452841d51215e9ecfc2b
/*
 * FUSE: Filesystem in Userspace
 * Copyright (C) 2001-2007  Miklos Szeredi <miklos@szeredi.hu>
 *
 * This program can be distributed under the terms of the GNU GPLv2.
 * See the file COPYING.
 */

/*
 *
 * This file system mirrors the existing file system hierarchy of the
 * system, starting at the root file system. This is implemented by
 * just "passing through" all requests to the corresponding user-space
 * libc functions. In contrast to passthrough.c and passthrough_fh.c,
 * this implementation uses the low-level API. Its performance should
 * be the least bad among the three, but many operations are not
 * implemented. In particular, it is not possible to remove files (or
 * directories) because the code necessary to defer actual removal
 * until the file is not opened anymore would make the example much
 * more complicated.
 *
 * When writeback caching is enabled (-o writeback mount option), it
 * is only possible to write to files for which the mounting user has
 * read permissions. This is because the writeback cache requires the
 * kernel to be able to issue read requests for all files (which the
 * passthrough filesystem cannot satisfy if it can't read the file in
 * the underlying filesystem).
 *
 * Compile with:
 *
 *     gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
 * passthrough_ll
 *
 * ## Source code ##
 * \include passthrough_ll.c
 */
38 #include "qemu/osdep.h"
39 #include "qemu/timer.h"
40 #include "qemu-version.h"
41 #include "qemu-common.h"
42 #include "fuse_virtio.h"
43 #include "fuse_log.h"
44 #include "fuse_lowlevel.h"
45 #include "standard-headers/linux/fuse.h"
46 #include <cap-ng.h>
47 #include <dirent.h>
48 #include <pthread.h>
49 #include <sys/file.h>
50 #include <sys/mount.h>
51 #include <sys/prctl.h>
52 #include <sys/resource.h>
53 #include <sys/syscall.h>
54 #include <sys/wait.h>
55 #include <sys/xattr.h>
56 #include <syslog.h>
58 #include "qemu/cutils.h"
59 #include "passthrough_helpers.h"
60 #include "passthrough_seccomp.h"
/* Keep track of inode posix locks for each owner. */
struct lo_inode_plock {
    uint64_t lock_owner;
    int fd; /* fd for OFD locks; closed by posix_locks_value_destroy() */
};
/*
 * One slot of a struct lo_map.
 *
 * While in_use is true the slot carries one of inode/dirp/fd (which one
 * depends on the map the slot belongs to).  While in_use is false the slot
 * is on the map's free list and freelist holds the index of the next free
 * slot, or -1 at the end of the list.
 */
struct lo_map_elem {
    union {
        struct lo_inode *inode;
        struct lo_dirp *dirp;
        int fd;
        ssize_t freelist;
    };
    bool in_use;
};
/* Maps FUSE fh or ino values to internal objects */
struct lo_map {
    struct lo_map_elem *elems; /* array of nelems slots, NULL when empty */
    size_t nelems;             /* number of slots in elems */
    ssize_t freelist;          /* index of first free slot, -1 if none */
};
/*
 * Key used to look up a struct lo_inode in lo_data.inodes: inode number,
 * device number and mount ID (the mount ID is 0 when do_statx() could not
 * obtain it).
 */
struct lo_key {
    ino_t ino;
    dev_t dev;
    uint64_t mnt_id;
};
91 struct lo_inode {
92 int fd;
95 * Atomic reference count for this object. The nlookup field holds a
96 * reference and release it when nlookup reaches 0.
98 gint refcount;
100 struct lo_key key;
103 * This counter keeps the inode alive during the FUSE session.
104 * Incremented when the FUSE inode number is sent in a reply
105 * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is
106 * released by a FUSE_FORGET request.
108 * Note that this value is untrusted because the client can manipulate
109 * it arbitrarily using FUSE_FORGET requests.
111 * Protected by lo->mutex.
113 uint64_t nlookup;
115 fuse_ino_t fuse_ino;
116 pthread_mutex_t plock_mutex;
117 GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
119 mode_t filetype;
/* Effective uid/gid saved by lo_change_cred() for lo_restore_cred(). */
struct lo_cred {
    uid_t euid;
    gid_t egid;
};
/* Values of lo_data.cache, selected with the cache=none|auto|always options */
enum {
    CACHE_NONE,
    CACHE_AUTO,
    CACHE_ALWAYS,
};
/* Values of lo_data.sandbox, selected with the sandbox=namespace|chroot options */
enum {
    SANDBOX_NAMESPACE,
    SANDBOX_CHROOT,
};
/* One entry of the list stored in lo_data.xattr_map_list. */
typedef struct xattr_map_entry {
    char *key;
    char *prepend;
    unsigned int flags;
} XattrMapEntry;
144 struct lo_data {
145 pthread_mutex_t mutex;
146 int sandbox;
147 int debug;
148 int writeback;
149 int flock;
150 int posix_lock;
151 int xattr;
152 char *xattrmap;
153 char *xattr_security_capability;
154 char *source;
155 char *modcaps;
156 double timeout;
157 int cache;
158 int timeout_set;
159 int readdirplus_set;
160 int readdirplus_clear;
161 int allow_direct_io;
162 int announce_submounts;
163 bool use_statx;
164 struct lo_inode root;
165 GHashTable *inodes; /* protected by lo->mutex */
166 struct lo_map ino_map; /* protected by lo->mutex */
167 struct lo_map dirp_map; /* protected by lo->mutex */
168 struct lo_map fd_map; /* protected by lo->mutex */
169 XattrMapEntry *xattr_map_list;
170 size_t xattr_map_nentries;
172 /* An O_PATH file descriptor to /proc/self/fd/ */
173 int proc_self_fd;
174 int user_killpriv_v2, killpriv_v2;
177 static const struct fuse_opt lo_opts[] = {
178 { "sandbox=namespace",
179 offsetof(struct lo_data, sandbox),
180 SANDBOX_NAMESPACE },
181 { "sandbox=chroot",
182 offsetof(struct lo_data, sandbox),
183 SANDBOX_CHROOT },
184 { "writeback", offsetof(struct lo_data, writeback), 1 },
185 { "no_writeback", offsetof(struct lo_data, writeback), 0 },
186 { "source=%s", offsetof(struct lo_data, source), 0 },
187 { "flock", offsetof(struct lo_data, flock), 1 },
188 { "no_flock", offsetof(struct lo_data, flock), 0 },
189 { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
190 { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
191 { "xattr", offsetof(struct lo_data, xattr), 1 },
192 { "no_xattr", offsetof(struct lo_data, xattr), 0 },
193 { "xattrmap=%s", offsetof(struct lo_data, xattrmap), 0 },
194 { "modcaps=%s", offsetof(struct lo_data, modcaps), 0 },
195 { "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
196 { "timeout=", offsetof(struct lo_data, timeout_set), 1 },
197 { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
198 { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
199 { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
200 { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
201 { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
202 { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 },
203 { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 },
204 { "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 },
205 { "killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 1 },
206 { "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 },
207 FUSE_OPT_END
static bool use_syslog = false;
static int current_log_level;
static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
                                 uint64_t n);

/* Saved capng state shared by all threads; see load_capng() */
static struct {
    pthread_mutex_t mutex;
    void *saved;
} cap;
/* Whether cap-ng state was loaded in the current thread from the saved copy */
static __thread bool cap_loaded = 0;

static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
                                uint64_t mnt_id);
static int xattr_map_client(const struct lo_data *lo, const char *client_name,
                            char **out_name);
/* Return true if `name` is exactly "." or "..". */
static bool is_dot_or_dotdot(const char *name)
{
    return name[0] == '.' &&
           (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'));
}
/* Is `path` a single path component that is not "." or ".."? */
static bool is_safe_path_component(const char *path)
{
    if (strchr(path, '/')) {
        return false;
    }

    return !is_dot_or_dotdot(path);
}
/* Return true if `name` is the empty string. */
static bool is_empty(const char *name)
{
    return name[0] == '\0';
}
247 static struct lo_data *lo_data(fuse_req_t req)
249 return (struct lo_data *)fuse_req_userdata(req);
253 * Load capng's state from our saved state if the current thread
254 * hadn't previously been loaded.
255 * returns 0 on success
257 static int load_capng(void)
259 if (!cap_loaded) {
260 pthread_mutex_lock(&cap.mutex);
261 capng_restore_state(&cap.saved);
263 * restore_state free's the saved copy
264 * so make another.
266 cap.saved = capng_save_state();
267 if (!cap.saved) {
268 pthread_mutex_unlock(&cap.mutex);
269 fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
270 return -EINVAL;
272 pthread_mutex_unlock(&cap.mutex);
275 * We want to use the loaded state for our pid,
276 * not the original
278 capng_setpid(syscall(SYS_gettid));
279 cap_loaded = true;
281 return 0;
285 * Helpers for dropping and regaining effective capabilities. Returns 0
286 * on success, error otherwise
288 static int drop_effective_cap(const char *cap_name, bool *cap_dropped)
290 int cap, ret;
292 cap = capng_name_to_capability(cap_name);
293 if (cap < 0) {
294 ret = errno;
295 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
296 cap_name, strerror(errno));
297 goto out;
300 if (load_capng()) {
301 ret = errno;
302 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
303 goto out;
306 /* We dont have this capability in effective set already. */
307 if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) {
308 ret = 0;
309 goto out;
312 if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) {
313 ret = errno;
314 fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n");
315 goto out;
318 if (capng_apply(CAPNG_SELECT_CAPS)) {
319 ret = errno;
320 fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n");
321 goto out;
324 ret = 0;
325 if (cap_dropped) {
326 *cap_dropped = true;
329 out:
330 return ret;
333 static int gain_effective_cap(const char *cap_name)
335 int cap;
336 int ret = 0;
338 cap = capng_name_to_capability(cap_name);
339 if (cap < 0) {
340 ret = errno;
341 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
342 cap_name, strerror(errno));
343 goto out;
346 if (load_capng()) {
347 ret = errno;
348 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
349 goto out;
352 if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) {
353 ret = errno;
354 fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n");
355 goto out;
358 if (capng_apply(CAPNG_SELECT_CAPS)) {
359 ret = errno;
360 fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n");
361 goto out;
363 ret = 0;
365 out:
366 return ret;
370 * The host kernel normally drops security.capability xattr's on
371 * any write, however if we're remapping xattr names we need to drop
372 * whatever the clients security.capability is actually stored as.
374 static int drop_security_capability(const struct lo_data *lo, int fd)
376 if (!lo->xattr_security_capability) {
377 /* We didn't remap the name, let the host kernel do it */
378 return 0;
380 if (!fremovexattr(fd, lo->xattr_security_capability)) {
381 /* All good */
382 return 0;
385 switch (errno) {
386 case ENODATA:
387 /* Attribute didn't exist, that's fine */
388 return 0;
390 case ENOTSUP:
391 /* FS didn't support attribute anyway, also fine */
392 return 0;
394 default:
395 /* Hmm other error */
396 return errno;
400 static void lo_map_init(struct lo_map *map)
402 map->elems = NULL;
403 map->nelems = 0;
404 map->freelist = -1;
407 static void lo_map_destroy(struct lo_map *map)
409 free(map->elems);
412 static int lo_map_grow(struct lo_map *map, size_t new_nelems)
414 struct lo_map_elem *new_elems;
415 size_t i;
417 if (new_nelems <= map->nelems) {
418 return 1;
421 new_elems = realloc(map->elems, sizeof(map->elems[0]) * new_nelems);
422 if (!new_elems) {
423 return 0;
426 for (i = map->nelems; i < new_nelems; i++) {
427 new_elems[i].freelist = i + 1;
428 new_elems[i].in_use = false;
430 new_elems[new_nelems - 1].freelist = -1;
432 map->elems = new_elems;
433 map->freelist = map->nelems;
434 map->nelems = new_nelems;
435 return 1;
438 static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map)
440 struct lo_map_elem *elem;
442 if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) {
443 return NULL;
446 elem = &map->elems[map->freelist];
447 map->freelist = elem->freelist;
449 elem->in_use = true;
451 return elem;
454 static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key)
456 ssize_t *prev;
458 if (!lo_map_grow(map, key + 1)) {
459 return NULL;
462 for (prev = &map->freelist; *prev != -1;
463 prev = &map->elems[*prev].freelist) {
464 if (*prev == key) {
465 struct lo_map_elem *elem = &map->elems[key];
467 *prev = elem->freelist;
468 elem->in_use = true;
469 return elem;
472 return NULL;
475 static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key)
477 if (key >= map->nelems) {
478 return NULL;
480 if (!map->elems[key].in_use) {
481 return NULL;
483 return &map->elems[key];
486 static void lo_map_remove(struct lo_map *map, size_t key)
488 struct lo_map_elem *elem;
490 if (key >= map->nelems) {
491 return;
494 elem = &map->elems[key];
495 if (!elem->in_use) {
496 return;
499 elem->in_use = false;
501 elem->freelist = map->freelist;
502 map->freelist = key;
505 /* Assumes lo->mutex is held */
506 static ssize_t lo_add_fd_mapping(struct lo_data *lo, int fd)
508 struct lo_map_elem *elem;
510 elem = lo_map_alloc_elem(&lo->fd_map);
511 if (!elem) {
512 return -1;
515 elem->fd = fd;
516 return elem - lo->fd_map.elems;
519 /* Assumes lo->mutex is held */
520 static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp)
522 struct lo_map_elem *elem;
524 elem = lo_map_alloc_elem(&lo_data(req)->dirp_map);
525 if (!elem) {
526 return -1;
529 elem->dirp = dirp;
530 return elem - lo_data(req)->dirp_map.elems;
533 /* Assumes lo->mutex is held */
534 static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode)
536 struct lo_map_elem *elem;
538 elem = lo_map_alloc_elem(&lo_data(req)->ino_map);
539 if (!elem) {
540 return -1;
543 elem->inode = inode;
544 return elem - lo_data(req)->ino_map.elems;
547 static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
549 struct lo_inode *inode = *inodep;
551 if (!inode) {
552 return;
555 *inodep = NULL;
557 if (g_atomic_int_dec_and_test(&inode->refcount)) {
558 close(inode->fd);
559 free(inode);
563 /* Caller must release refcount using lo_inode_put() */
564 static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino)
566 struct lo_data *lo = lo_data(req);
567 struct lo_map_elem *elem;
569 pthread_mutex_lock(&lo->mutex);
570 elem = lo_map_get(&lo->ino_map, ino);
571 if (elem) {
572 g_atomic_int_inc(&elem->inode->refcount);
574 pthread_mutex_unlock(&lo->mutex);
576 if (!elem) {
577 return NULL;
580 return elem->inode;
584 * TODO Remove this helper and force callers to hold an inode refcount until
585 * they are done with the fd. This will be done in a later patch to make
586 * review easier.
588 static int lo_fd(fuse_req_t req, fuse_ino_t ino)
590 struct lo_inode *inode = lo_inode(req, ino);
591 int fd;
593 if (!inode) {
594 return -1;
597 fd = inode->fd;
598 lo_inode_put(lo_data(req), &inode);
599 return fd;
603 * Open a file descriptor for an inode. Returns -EBADF if the inode is not a
604 * regular file or a directory.
606 * Use this helper function instead of raw openat(2) to prevent security issues
607 * when a malicious client opens special files such as block device nodes.
608 * Symlink inodes are also rejected since symlinks must already have been
609 * traversed on the client side.
611 static int lo_inode_open(struct lo_data *lo, struct lo_inode *inode,
612 int open_flags)
614 g_autofree char *fd_str = g_strdup_printf("%d", inode->fd);
615 int fd;
617 if (!S_ISREG(inode->filetype) && !S_ISDIR(inode->filetype)) {
618 return -EBADF;
622 * The file is a symlink so O_NOFOLLOW must be ignored. We checked earlier
623 * that the inode is not a special file but if an external process races
624 * with us then symlinks are traversed here. It is not possible to escape
625 * the shared directory since it is mounted as "/" though.
627 fd = openat(lo->proc_self_fd, fd_str, open_flags & ~O_NOFOLLOW);
628 if (fd < 0) {
629 return -errno;
631 return fd;
634 static void lo_init(void *userdata, struct fuse_conn_info *conn)
636 struct lo_data *lo = (struct lo_data *)userdata;
638 if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) {
639 conn->want |= FUSE_CAP_EXPORT_SUPPORT;
642 if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
643 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n");
644 conn->want |= FUSE_CAP_WRITEBACK_CACHE;
646 if (conn->capable & FUSE_CAP_FLOCK_LOCKS) {
647 if (lo->flock) {
648 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
649 conn->want |= FUSE_CAP_FLOCK_LOCKS;
650 } else {
651 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n");
652 conn->want &= ~FUSE_CAP_FLOCK_LOCKS;
656 if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
657 if (lo->posix_lock) {
658 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
659 conn->want |= FUSE_CAP_POSIX_LOCKS;
660 } else {
661 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
662 conn->want &= ~FUSE_CAP_POSIX_LOCKS;
666 if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
667 lo->readdirplus_clear) {
668 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
669 conn->want &= ~FUSE_CAP_READDIRPLUS;
672 if (!(conn->capable & FUSE_CAP_SUBMOUNTS) && lo->announce_submounts) {
673 fuse_log(FUSE_LOG_WARNING, "lo_init: Cannot announce submounts, client "
674 "does not support it\n");
675 lo->announce_submounts = false;
678 if (lo->user_killpriv_v2 == 1) {
680 * User explicitly asked for this option. Enable it unconditionally.
681 * If connection does not have this capability, it should fail
682 * in fuse_lowlevel.c
684 fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n");
685 conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2;
686 lo->killpriv_v2 = 1;
687 } else if (lo->user_killpriv_v2 == -1 &&
688 conn->capable & FUSE_CAP_HANDLE_KILLPRIV_V2) {
690 * User did not specify a value for killpriv_v2. By default enable it
691 * if connection offers this capability
693 fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n");
694 conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2;
695 lo->killpriv_v2 = 1;
696 } else {
698 * Either user specified to disable killpriv_v2, or connection does
699 * not offer this capability. Disable killpriv_v2 in both the cases
701 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling killpriv_v2\n");
702 conn->want &= ~FUSE_CAP_HANDLE_KILLPRIV_V2;
703 lo->killpriv_v2 = 0;
707 static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
708 struct fuse_file_info *fi)
710 int res;
711 struct stat buf;
712 struct lo_data *lo = lo_data(req);
714 (void)fi;
716 res =
717 fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
718 if (res == -1) {
719 return (void)fuse_reply_err(req, errno);
722 fuse_reply_attr(req, &buf, lo->timeout);
725 static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi)
727 struct lo_data *lo = lo_data(req);
728 struct lo_map_elem *elem;
730 pthread_mutex_lock(&lo->mutex);
731 elem = lo_map_get(&lo->fd_map, fi->fh);
732 pthread_mutex_unlock(&lo->mutex);
734 if (!elem) {
735 return -1;
738 return elem->fd;
741 static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
742 int valid, struct fuse_file_info *fi)
744 int saverr;
745 char procname[64];
746 struct lo_data *lo = lo_data(req);
747 struct lo_inode *inode;
748 int ifd;
749 int res;
750 int fd = -1;
752 inode = lo_inode(req, ino);
753 if (!inode) {
754 fuse_reply_err(req, EBADF);
755 return;
758 ifd = inode->fd;
760 /* If fi->fh is invalid we'll report EBADF later */
761 if (fi) {
762 fd = lo_fi_fd(req, fi);
765 if (valid & FUSE_SET_ATTR_MODE) {
766 if (fi) {
767 res = fchmod(fd, attr->st_mode);
768 } else {
769 sprintf(procname, "%i", ifd);
770 res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0);
772 if (res == -1) {
773 saverr = errno;
774 goto out_err;
777 if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
778 uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1;
779 gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1;
781 saverr = drop_security_capability(lo, ifd);
782 if (saverr) {
783 goto out_err;
786 res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
787 if (res == -1) {
788 saverr = errno;
789 goto out_err;
792 if (valid & FUSE_SET_ATTR_SIZE) {
793 int truncfd;
794 bool kill_suidgid;
795 bool cap_fsetid_dropped = false;
797 kill_suidgid = lo->killpriv_v2 && (valid & FUSE_SET_ATTR_KILL_SUIDGID);
798 if (fi) {
799 truncfd = fd;
800 } else {
801 truncfd = lo_inode_open(lo, inode, O_RDWR);
802 if (truncfd < 0) {
803 saverr = -truncfd;
804 goto out_err;
808 saverr = drop_security_capability(lo, truncfd);
809 if (saverr) {
810 if (!fi) {
811 close(truncfd);
813 goto out_err;
816 if (kill_suidgid) {
817 res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
818 if (res != 0) {
819 saverr = res;
820 if (!fi) {
821 close(truncfd);
823 goto out_err;
827 res = ftruncate(truncfd, attr->st_size);
828 saverr = res == -1 ? errno : 0;
830 if (cap_fsetid_dropped) {
831 if (gain_effective_cap("FSETID")) {
832 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
835 if (!fi) {
836 close(truncfd);
838 if (res == -1) {
839 goto out_err;
842 if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) {
843 struct timespec tv[2];
845 tv[0].tv_sec = 0;
846 tv[1].tv_sec = 0;
847 tv[0].tv_nsec = UTIME_OMIT;
848 tv[1].tv_nsec = UTIME_OMIT;
850 if (valid & FUSE_SET_ATTR_ATIME_NOW) {
851 tv[0].tv_nsec = UTIME_NOW;
852 } else if (valid & FUSE_SET_ATTR_ATIME) {
853 tv[0] = attr->st_atim;
856 if (valid & FUSE_SET_ATTR_MTIME_NOW) {
857 tv[1].tv_nsec = UTIME_NOW;
858 } else if (valid & FUSE_SET_ATTR_MTIME) {
859 tv[1] = attr->st_mtim;
862 if (fi) {
863 res = futimens(fd, tv);
864 } else {
865 sprintf(procname, "%i", inode->fd);
866 res = utimensat(lo->proc_self_fd, procname, tv, 0);
868 if (res == -1) {
869 saverr = errno;
870 goto out_err;
873 lo_inode_put(lo, &inode);
875 return lo_getattr(req, ino, fi);
877 out_err:
878 lo_inode_put(lo, &inode);
879 fuse_reply_err(req, saverr);
882 static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
883 uint64_t mnt_id)
885 struct lo_inode *p;
886 struct lo_key key = {
887 .ino = st->st_ino,
888 .dev = st->st_dev,
889 .mnt_id = mnt_id,
892 pthread_mutex_lock(&lo->mutex);
893 p = g_hash_table_lookup(lo->inodes, &key);
894 if (p) {
895 assert(p->nlookup > 0);
896 p->nlookup++;
897 g_atomic_int_inc(&p->refcount);
899 pthread_mutex_unlock(&lo->mutex);
901 return p;
904 /* value_destroy_func for posix_locks GHashTable */
905 static void posix_locks_value_destroy(gpointer data)
907 struct lo_inode_plock *plock = data;
910 * We had used open() for locks and had only one fd. So
911 * closing this fd should release all OFD locks.
913 close(plock->fd);
914 free(plock);
/*
 * stat `pathname` relative to `dirfd` and additionally report the mount ID.
 *
 * When statx() with STATX_MNT_ID is available at build time and has not
 * failed with ENOSYS before, it is used and its result is converted into
 * *statbuf; *mnt_id is the reported mount ID, or 0 if the kernel did not
 * fill it in.  Otherwise fstatat() is used and *mnt_id is set to 0.
 *
 * Returns 0 on success and -1 on failure, with errno left as set by the
 * failing system call.
 */
static int do_statx(struct lo_data *lo, int dirfd, const char *pathname,
                    struct stat *statbuf, int flags, uint64_t *mnt_id)
{
    int res;

#if defined(CONFIG_STATX) && defined(STATX_MNT_ID)
    if (lo->use_statx) {
        struct statx statxbuf;

        res = statx(dirfd, pathname, flags, STATX_BASIC_STATS | STATX_MNT_ID,
                    &statxbuf);
        if (!res) {
            memset(statbuf, 0, sizeof(*statbuf));
            statbuf->st_dev = makedev(statxbuf.stx_dev_major,
                                      statxbuf.stx_dev_minor);
            statbuf->st_ino = statxbuf.stx_ino;
            statbuf->st_mode = statxbuf.stx_mode;
            statbuf->st_nlink = statxbuf.stx_nlink;
            statbuf->st_uid = statxbuf.stx_uid;
            statbuf->st_gid = statxbuf.stx_gid;
            statbuf->st_rdev = makedev(statxbuf.stx_rdev_major,
                                       statxbuf.stx_rdev_minor);
            statbuf->st_size = statxbuf.stx_size;
            statbuf->st_blksize = statxbuf.stx_blksize;
            statbuf->st_blocks = statxbuf.stx_blocks;
            statbuf->st_atim.tv_sec = statxbuf.stx_atime.tv_sec;
            statbuf->st_atim.tv_nsec = statxbuf.stx_atime.tv_nsec;
            statbuf->st_mtim.tv_sec = statxbuf.stx_mtime.tv_sec;
            statbuf->st_mtim.tv_nsec = statxbuf.stx_mtime.tv_nsec;
            statbuf->st_ctim.tv_sec = statxbuf.stx_ctime.tv_sec;
            statbuf->st_ctim.tv_nsec = statxbuf.stx_ctime.tv_nsec;

            if (statxbuf.stx_mask & STATX_MNT_ID) {
                *mnt_id = statxbuf.stx_mnt_id;
            } else {
                *mnt_id = 0;
            }
            return 0;
        } else if (errno != ENOSYS) {
            return -1;
        }
        /* statx() is not implemented: do not try it again */
        lo->use_statx = false;
        /* fallback */
    }
#endif
    res = fstatat(dirfd, pathname, statbuf, flags);
    if (res == -1) {
        return -1;
    }
    *mnt_id = 0;

    return 0;
}
972 * Increments nlookup on the inode on success. unref_inode_lolocked() must be
973 * called eventually to decrement nlookup again. If inodep is non-NULL, the
974 * inode pointer is stored and the caller must call lo_inode_put().
976 static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
977 struct fuse_entry_param *e,
978 struct lo_inode **inodep)
980 int newfd;
981 int res;
982 int saverr;
983 uint64_t mnt_id;
984 struct lo_data *lo = lo_data(req);
985 struct lo_inode *inode = NULL;
986 struct lo_inode *dir = lo_inode(req, parent);
988 if (inodep) {
989 *inodep = NULL; /* in case there is an error */
993 * name_to_handle_at() and open_by_handle_at() can reach here with fuse
994 * mount point in guest, but we don't have its inode info in the
995 * ino_map.
997 if (!dir) {
998 return ENOENT;
1001 memset(e, 0, sizeof(*e));
1002 e->attr_timeout = lo->timeout;
1003 e->entry_timeout = lo->timeout;
1005 /* Do not allow escaping root directory */
1006 if (dir == &lo->root && strcmp(name, "..") == 0) {
1007 name = ".";
1010 newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
1011 if (newfd == -1) {
1012 goto out_err;
1015 res = do_statx(lo, newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
1016 &mnt_id);
1017 if (res == -1) {
1018 goto out_err;
1021 if (S_ISDIR(e->attr.st_mode) && lo->announce_submounts &&
1022 (e->attr.st_dev != dir->key.dev || mnt_id != dir->key.mnt_id)) {
1023 e->attr_flags |= FUSE_ATTR_SUBMOUNT;
1026 inode = lo_find(lo, &e->attr, mnt_id);
1027 if (inode) {
1028 close(newfd);
1029 } else {
1030 inode = calloc(1, sizeof(struct lo_inode));
1031 if (!inode) {
1032 goto out_err;
1035 /* cache only filetype */
1036 inode->filetype = (e->attr.st_mode & S_IFMT);
1039 * One for the caller and one for nlookup (released in
1040 * unref_inode_lolocked())
1042 g_atomic_int_set(&inode->refcount, 2);
1044 inode->nlookup = 1;
1045 inode->fd = newfd;
1046 inode->key.ino = e->attr.st_ino;
1047 inode->key.dev = e->attr.st_dev;
1048 inode->key.mnt_id = mnt_id;
1049 if (lo->posix_lock) {
1050 pthread_mutex_init(&inode->plock_mutex, NULL);
1051 inode->posix_locks = g_hash_table_new_full(
1052 g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
1054 pthread_mutex_lock(&lo->mutex);
1055 inode->fuse_ino = lo_add_inode_mapping(req, inode);
1056 g_hash_table_insert(lo->inodes, &inode->key, inode);
1057 pthread_mutex_unlock(&lo->mutex);
1059 e->ino = inode->fuse_ino;
1061 /* Transfer ownership of inode pointer to caller or drop it */
1062 if (inodep) {
1063 *inodep = inode;
1064 } else {
1065 lo_inode_put(lo, &inode);
1068 lo_inode_put(lo, &dir);
1070 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1071 name, (unsigned long long)e->ino);
1073 return 0;
1075 out_err:
1076 saverr = errno;
1077 if (newfd != -1) {
1078 close(newfd);
1080 lo_inode_put(lo, &inode);
1081 lo_inode_put(lo, &dir);
1082 return saverr;
1085 static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
1087 struct fuse_entry_param e;
1088 int err;
1090 fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent,
1091 name);
1093 if (is_empty(name)) {
1094 fuse_reply_err(req, ENOENT);
1095 return;
1099 * Don't use is_safe_path_component(), allow "." and ".." for NFS export
1100 * support.
1102 if (strchr(name, '/')) {
1103 fuse_reply_err(req, EINVAL);
1104 return;
1107 err = lo_do_lookup(req, parent, name, &e, NULL);
1108 if (err) {
1109 fuse_reply_err(req, err);
1110 } else {
1111 fuse_reply_entry(req, &e);
/*
 * On some archs, setres*id is limited to 2^16 but they
 * provide setres*id32 variants that allow 2^32.
 * Others just let setres*id do 2^32 anyway.
 */
#ifdef SYS_setresgid32
#define OURSYS_setresgid SYS_setresgid32
#else
#define OURSYS_setresgid SYS_setresgid
#endif

#ifdef SYS_setresuid32
#define OURSYS_setresuid SYS_setresuid32
#else
#define OURSYS_setresuid SYS_setresuid
#endif
1133 * Change to uid/gid of caller so that file is created with
1134 * ownership of caller.
1135 * TODO: What about selinux context?
1137 static int lo_change_cred(fuse_req_t req, struct lo_cred *old)
1139 int res;
1141 old->euid = geteuid();
1142 old->egid = getegid();
1144 res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1);
1145 if (res == -1) {
1146 return errno;
1149 res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1);
1150 if (res == -1) {
1151 int errno_save = errno;
1153 syscall(OURSYS_setresgid, -1, old->egid, -1);
1154 return errno_save;
1157 return 0;
1160 /* Regain Privileges */
1161 static void lo_restore_cred(struct lo_cred *old)
1163 int res;
1165 res = syscall(OURSYS_setresuid, -1, old->euid, -1);
1166 if (res == -1) {
1167 fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid);
1168 exit(1);
1171 res = syscall(OURSYS_setresgid, -1, old->egid, -1);
1172 if (res == -1) {
1173 fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid);
1174 exit(1);
1178 static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
1179 const char *name, mode_t mode, dev_t rdev,
1180 const char *link)
1182 int res;
1183 int saverr;
1184 struct lo_data *lo = lo_data(req);
1185 struct lo_inode *dir;
1186 struct fuse_entry_param e;
1187 struct lo_cred old = {};
1189 if (is_empty(name)) {
1190 fuse_reply_err(req, ENOENT);
1191 return;
1194 if (!is_safe_path_component(name)) {
1195 fuse_reply_err(req, EINVAL);
1196 return;
1199 dir = lo_inode(req, parent);
1200 if (!dir) {
1201 fuse_reply_err(req, EBADF);
1202 return;
1205 saverr = lo_change_cred(req, &old);
1206 if (saverr) {
1207 goto out;
1210 res = mknod_wrapper(dir->fd, name, link, mode, rdev);
1212 saverr = errno;
1214 lo_restore_cred(&old);
1216 if (res == -1) {
1217 goto out;
1220 saverr = lo_do_lookup(req, parent, name, &e, NULL);
1221 if (saverr) {
1222 goto out;
1225 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1226 name, (unsigned long long)e.ino);
1228 fuse_reply_entry(req, &e);
1229 lo_inode_put(lo, &dir);
1230 return;
1232 out:
1233 lo_inode_put(lo, &dir);
1234 fuse_reply_err(req, saverr);
1237 static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
1238 mode_t mode, dev_t rdev)
1240 lo_mknod_symlink(req, parent, name, mode, rdev, NULL);
1243 static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
1244 mode_t mode)
1246 lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL);
1249 static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent,
1250 const char *name)
1252 lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link);
1255 static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
1256 const char *name)
1258 int res;
1259 struct lo_data *lo = lo_data(req);
1260 struct lo_inode *parent_inode;
1261 struct lo_inode *inode;
1262 struct fuse_entry_param e;
1263 char procname[64];
1264 int saverr;
1266 if (is_empty(name)) {
1267 fuse_reply_err(req, ENOENT);
1268 return;
1271 if (!is_safe_path_component(name)) {
1272 fuse_reply_err(req, EINVAL);
1273 return;
1276 parent_inode = lo_inode(req, parent);
1277 inode = lo_inode(req, ino);
1278 if (!parent_inode || !inode) {
1279 errno = EBADF;
1280 goto out_err;
1283 memset(&e, 0, sizeof(struct fuse_entry_param));
1284 e.attr_timeout = lo->timeout;
1285 e.entry_timeout = lo->timeout;
1287 sprintf(procname, "%i", inode->fd);
1288 res = linkat(lo->proc_self_fd, procname, parent_inode->fd, name,
1289 AT_SYMLINK_FOLLOW);
1290 if (res == -1) {
1291 goto out_err;
1294 res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1295 if (res == -1) {
1296 goto out_err;
1299 pthread_mutex_lock(&lo->mutex);
1300 inode->nlookup++;
1301 pthread_mutex_unlock(&lo->mutex);
1302 e.ino = inode->fuse_ino;
1304 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1305 name, (unsigned long long)e.ino);
1307 fuse_reply_entry(req, &e);
1308 lo_inode_put(lo, &parent_inode);
1309 lo_inode_put(lo, &inode);
1310 return;
1312 out_err:
1313 saverr = errno;
1314 lo_inode_put(lo, &parent_inode);
1315 lo_inode_put(lo, &inode);
1316 fuse_reply_err(req, saverr);
1319 /* Increments nlookup and caller must release refcount using lo_inode_put() */
1320 static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
1321 const char *name)
1323 int res;
1324 uint64_t mnt_id;
1325 struct stat attr;
1326 struct lo_data *lo = lo_data(req);
1327 struct lo_inode *dir = lo_inode(req, parent);
1329 if (!dir) {
1330 return NULL;
1333 res = do_statx(lo, dir->fd, name, &attr, AT_SYMLINK_NOFOLLOW, &mnt_id);
1334 lo_inode_put(lo, &dir);
1335 if (res == -1) {
1336 return NULL;
1339 return lo_find(lo, &attr, mnt_id);
1342 static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
1344 int res;
1345 struct lo_inode *inode;
1346 struct lo_data *lo = lo_data(req);
1348 if (is_empty(name)) {
1349 fuse_reply_err(req, ENOENT);
1350 return;
1353 if (!is_safe_path_component(name)) {
1354 fuse_reply_err(req, EINVAL);
1355 return;
1358 inode = lookup_name(req, parent, name);
1359 if (!inode) {
1360 fuse_reply_err(req, EIO);
1361 return;
1364 res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR);
1366 fuse_reply_err(req, res == -1 ? errno : 0);
1367 unref_inode_lolocked(lo, inode, 1);
1368 lo_inode_put(lo, &inode);
1371 static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
1372 fuse_ino_t newparent, const char *newname,
1373 unsigned int flags)
1375 int res;
1376 struct lo_inode *parent_inode;
1377 struct lo_inode *newparent_inode;
1378 struct lo_inode *oldinode = NULL;
1379 struct lo_inode *newinode = NULL;
1380 struct lo_data *lo = lo_data(req);
1382 if (is_empty(name) || is_empty(newname)) {
1383 fuse_reply_err(req, ENOENT);
1384 return;
1387 if (!is_safe_path_component(name) || !is_safe_path_component(newname)) {
1388 fuse_reply_err(req, EINVAL);
1389 return;
1392 parent_inode = lo_inode(req, parent);
1393 newparent_inode = lo_inode(req, newparent);
1394 if (!parent_inode || !newparent_inode) {
1395 fuse_reply_err(req, EBADF);
1396 goto out;
1399 oldinode = lookup_name(req, parent, name);
1400 newinode = lookup_name(req, newparent, newname);
1402 if (!oldinode) {
1403 fuse_reply_err(req, EIO);
1404 goto out;
1407 if (flags) {
1408 #ifndef SYS_renameat2
1409 fuse_reply_err(req, EINVAL);
1410 #else
1411 res = syscall(SYS_renameat2, parent_inode->fd, name,
1412 newparent_inode->fd, newname, flags);
1413 if (res == -1 && errno == ENOSYS) {
1414 fuse_reply_err(req, EINVAL);
1415 } else {
1416 fuse_reply_err(req, res == -1 ? errno : 0);
1418 #endif
1419 goto out;
1422 res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);
1424 fuse_reply_err(req, res == -1 ? errno : 0);
1425 out:
1426 unref_inode_lolocked(lo, oldinode, 1);
1427 unref_inode_lolocked(lo, newinode, 1);
1428 lo_inode_put(lo, &oldinode);
1429 lo_inode_put(lo, &newinode);
1430 lo_inode_put(lo, &parent_inode);
1431 lo_inode_put(lo, &newparent_inode);
1434 static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
1436 int res;
1437 struct lo_inode *inode;
1438 struct lo_data *lo = lo_data(req);
1440 if (is_empty(name)) {
1441 fuse_reply_err(req, ENOENT);
1442 return;
1445 if (!is_safe_path_component(name)) {
1446 fuse_reply_err(req, EINVAL);
1447 return;
1450 inode = lookup_name(req, parent, name);
1451 if (!inode) {
1452 fuse_reply_err(req, EIO);
1453 return;
1456 res = unlinkat(lo_fd(req, parent), name, 0);
1458 fuse_reply_err(req, res == -1 ? errno : 0);
1459 unref_inode_lolocked(lo, inode, 1);
1460 lo_inode_put(lo, &inode);
1463 /* To be called with lo->mutex held */
1464 static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
1466 if (!inode) {
1467 return;
1470 assert(inode->nlookup >= n);
1471 inode->nlookup -= n;
1472 if (!inode->nlookup) {
1473 lo_map_remove(&lo->ino_map, inode->fuse_ino);
1474 g_hash_table_remove(lo->inodes, &inode->key);
1475 if (lo->posix_lock) {
1476 if (g_hash_table_size(inode->posix_locks)) {
1477 fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
1479 g_hash_table_destroy(inode->posix_locks);
1480 pthread_mutex_destroy(&inode->plock_mutex);
1482 /* Drop our refcount from lo_do_lookup() */
1483 lo_inode_put(lo, &inode);
1487 static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
1488 uint64_t n)
1490 if (!inode) {
1491 return;
1494 pthread_mutex_lock(&lo->mutex);
1495 unref_inode(lo, inode, n);
1496 pthread_mutex_unlock(&lo->mutex);
1499 static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1501 struct lo_data *lo = lo_data(req);
1502 struct lo_inode *inode;
1504 inode = lo_inode(req, ino);
1505 if (!inode) {
1506 return;
1509 fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n",
1510 (unsigned long long)ino, (unsigned long long)inode->nlookup,
1511 (unsigned long long)nlookup);
1513 unref_inode_lolocked(lo, inode, nlookup);
1514 lo_inode_put(lo, &inode);
1517 static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1519 lo_forget_one(req, ino, nlookup);
1520 fuse_reply_none(req);
1523 static void lo_forget_multi(fuse_req_t req, size_t count,
1524 struct fuse_forget_data *forgets)
1526 int i;
1528 for (i = 0; i < count; i++) {
1529 lo_forget_one(req, forgets[i].ino, forgets[i].nlookup);
1531 fuse_reply_none(req);
1534 static void lo_readlink(fuse_req_t req, fuse_ino_t ino)
1536 char buf[PATH_MAX + 1];
1537 int res;
1539 res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf));
1540 if (res == -1) {
1541 return (void)fuse_reply_err(req, errno);
1544 if (res == sizeof(buf)) {
1545 return (void)fuse_reply_err(req, ENAMETOOLONG);
1548 buf[res] = '\0';
1550 fuse_reply_readlink(req, buf);
1553 struct lo_dirp {
1554 gint refcount;
1555 DIR *dp;
1556 struct dirent *entry;
1557 off_t offset;
1560 static void lo_dirp_put(struct lo_dirp **dp)
1562 struct lo_dirp *d = *dp;
1564 if (!d) {
1565 return;
1567 *dp = NULL;
1569 if (g_atomic_int_dec_and_test(&d->refcount)) {
1570 closedir(d->dp);
1571 free(d);
1575 /* Call lo_dirp_put() on the return value when no longer needed */
1576 static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi)
1578 struct lo_data *lo = lo_data(req);
1579 struct lo_map_elem *elem;
1581 pthread_mutex_lock(&lo->mutex);
1582 elem = lo_map_get(&lo->dirp_map, fi->fh);
1583 if (elem) {
1584 g_atomic_int_inc(&elem->dirp->refcount);
1586 pthread_mutex_unlock(&lo->mutex);
1587 if (!elem) {
1588 return NULL;
1591 return elem->dirp;
1594 static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
1595 struct fuse_file_info *fi)
1597 int error = ENOMEM;
1598 struct lo_data *lo = lo_data(req);
1599 struct lo_dirp *d;
1600 int fd;
1601 ssize_t fh;
1603 d = calloc(1, sizeof(struct lo_dirp));
1604 if (d == NULL) {
1605 goto out_err;
1608 fd = openat(lo_fd(req, ino), ".", O_RDONLY);
1609 if (fd == -1) {
1610 goto out_errno;
1613 d->dp = fdopendir(fd);
1614 if (d->dp == NULL) {
1615 goto out_errno;
1618 d->offset = 0;
1619 d->entry = NULL;
1621 g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */
1622 pthread_mutex_lock(&lo->mutex);
1623 fh = lo_add_dirp_mapping(req, d);
1624 pthread_mutex_unlock(&lo->mutex);
1625 if (fh == -1) {
1626 goto out_err;
1629 fi->fh = fh;
1630 if (lo->cache == CACHE_ALWAYS) {
1631 fi->cache_readdir = 1;
1633 fuse_reply_open(req, fi);
1634 return;
1636 out_errno:
1637 error = errno;
1638 out_err:
1639 if (d) {
1640 if (d->dp) {
1641 closedir(d->dp);
1642 } else if (fd != -1) {
1643 close(fd);
1645 free(d);
1647 fuse_reply_err(req, error);
1650 static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
1651 off_t offset, struct fuse_file_info *fi, int plus)
1653 struct lo_data *lo = lo_data(req);
1654 struct lo_dirp *d = NULL;
1655 struct lo_inode *dinode;
1656 char *buf = NULL;
1657 char *p;
1658 size_t rem = size;
1659 int err = EBADF;
1661 dinode = lo_inode(req, ino);
1662 if (!dinode) {
1663 goto error;
1666 d = lo_dirp(req, fi);
1667 if (!d) {
1668 goto error;
1671 err = ENOMEM;
1672 buf = calloc(1, size);
1673 if (!buf) {
1674 goto error;
1676 p = buf;
1678 if (offset != d->offset) {
1679 seekdir(d->dp, offset);
1680 d->entry = NULL;
1681 d->offset = offset;
1683 while (1) {
1684 size_t entsize;
1685 off_t nextoff;
1686 const char *name;
1688 if (!d->entry) {
1689 errno = 0;
1690 d->entry = readdir(d->dp);
1691 if (!d->entry) {
1692 if (errno) { /* Error */
1693 err = errno;
1694 goto error;
1695 } else { /* End of stream */
1696 break;
1700 nextoff = d->entry->d_off;
1701 name = d->entry->d_name;
1703 fuse_ino_t entry_ino = 0;
1704 struct fuse_entry_param e = (struct fuse_entry_param){
1705 .attr.st_ino = d->entry->d_ino,
1706 .attr.st_mode = d->entry->d_type << 12,
1709 /* Hide root's parent directory */
1710 if (dinode == &lo->root && strcmp(name, "..") == 0) {
1711 e.attr.st_ino = lo->root.key.ino;
1712 e.attr.st_mode = DT_DIR << 12;
1715 if (plus) {
1716 if (!is_dot_or_dotdot(name)) {
1717 err = lo_do_lookup(req, ino, name, &e, NULL);
1718 if (err) {
1719 goto error;
1721 entry_ino = e.ino;
1724 entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff);
1725 } else {
1726 entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff);
1728 if (entsize > rem) {
1729 if (entry_ino != 0) {
1730 lo_forget_one(req, entry_ino, 1);
1732 break;
1735 p += entsize;
1736 rem -= entsize;
1738 d->entry = NULL;
1739 d->offset = nextoff;
1742 err = 0;
1743 error:
1744 lo_dirp_put(&d);
1745 lo_inode_put(lo, &dinode);
1748 * If there's an error, we can only signal it if we haven't stored
1749 * any entries yet - otherwise we'd end up with wrong lookup
1750 * counts for the entries that are already in the buffer. So we
1751 * return what we've collected until that point.
1753 if (err && rem == size) {
1754 fuse_reply_err(req, err);
1755 } else {
1756 fuse_reply_buf(req, buf, size - rem);
1758 free(buf);
1761 static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
1762 off_t offset, struct fuse_file_info *fi)
1764 lo_do_readdir(req, ino, size, offset, fi, 0);
1767 static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size,
1768 off_t offset, struct fuse_file_info *fi)
1770 lo_do_readdir(req, ino, size, offset, fi, 1);
1773 static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
1774 struct fuse_file_info *fi)
1776 struct lo_data *lo = lo_data(req);
1777 struct lo_map_elem *elem;
1778 struct lo_dirp *d;
1780 (void)ino;
1782 pthread_mutex_lock(&lo->mutex);
1783 elem = lo_map_get(&lo->dirp_map, fi->fh);
1784 if (!elem) {
1785 pthread_mutex_unlock(&lo->mutex);
1786 fuse_reply_err(req, EBADF);
1787 return;
1790 d = elem->dirp;
1791 lo_map_remove(&lo->dirp_map, fi->fh);
1792 pthread_mutex_unlock(&lo->mutex);
1794 lo_dirp_put(&d); /* paired with lo_opendir() */
1796 fuse_reply_err(req, 0);
1799 static void update_open_flags(int writeback, int allow_direct_io,
1800 struct fuse_file_info *fi)
1803 * With writeback cache, kernel may send read requests even
1804 * when userspace opened write-only
1806 if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) {
1807 fi->flags &= ~O_ACCMODE;
1808 fi->flags |= O_RDWR;
1812 * With writeback cache, O_APPEND is handled by the kernel.
1813 * This breaks atomicity (since the file may change in the
1814 * underlying filesystem, so that the kernel's idea of the
1815 * end of the file isn't accurate anymore). In this example,
1816 * we just accept that. A more rigorous filesystem may want
1817 * to return an error here
1819 if (writeback && (fi->flags & O_APPEND)) {
1820 fi->flags &= ~O_APPEND;
1824 * O_DIRECT in guest should not necessarily mean bypassing page
1825 * cache on host as well. Therefore, we discard it by default
1826 * ('-o no_allow_direct_io'). If somebody needs that behavior,
1827 * the '-o allow_direct_io' option should be set.
1829 if (!allow_direct_io) {
1830 fi->flags &= ~O_DIRECT;
1835 * Open a regular file, set up an fd mapping, and fill out the struct
1836 * fuse_file_info for it. If existing_fd is not negative, use that fd instead
1837 * opening a new one. Takes ownership of existing_fd.
1839 * Returns 0 on success or a positive errno.
1841 static int lo_do_open(struct lo_data *lo, struct lo_inode *inode,
1842 int existing_fd, struct fuse_file_info *fi)
1844 ssize_t fh;
1845 int fd = existing_fd;
1846 int err;
1847 bool cap_fsetid_dropped = false;
1848 bool kill_suidgid = lo->killpriv_v2 && fi->kill_priv;
1850 update_open_flags(lo->writeback, lo->allow_direct_io, fi);
1852 if (fd < 0) {
1853 if (kill_suidgid) {
1854 err = drop_effective_cap("FSETID", &cap_fsetid_dropped);
1855 if (err) {
1856 return err;
1860 fd = lo_inode_open(lo, inode, fi->flags);
1862 if (cap_fsetid_dropped) {
1863 if (gain_effective_cap("FSETID")) {
1864 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
1867 if (fd < 0) {
1868 return -fd;
1870 if (fi->flags & (O_TRUNC)) {
1871 int err = drop_security_capability(lo, fd);
1872 if (err) {
1873 close(fd);
1874 return err;
1879 pthread_mutex_lock(&lo->mutex);
1880 fh = lo_add_fd_mapping(lo, fd);
1881 pthread_mutex_unlock(&lo->mutex);
1882 if (fh == -1) {
1883 close(fd);
1884 return ENOMEM;
1887 fi->fh = fh;
1888 if (lo->cache == CACHE_NONE) {
1889 fi->direct_io = 1;
1890 } else if (lo->cache == CACHE_ALWAYS) {
1891 fi->keep_cache = 1;
1893 return 0;
1896 static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
1897 mode_t mode, struct fuse_file_info *fi)
1899 int fd = -1;
1900 struct lo_data *lo = lo_data(req);
1901 struct lo_inode *parent_inode;
1902 struct lo_inode *inode = NULL;
1903 struct fuse_entry_param e;
1904 int err;
1905 struct lo_cred old = {};
1907 fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)"
1908 " kill_priv=%d\n", parent, name, fi->kill_priv);
1910 if (!is_safe_path_component(name)) {
1911 fuse_reply_err(req, EINVAL);
1912 return;
1915 parent_inode = lo_inode(req, parent);
1916 if (!parent_inode) {
1917 fuse_reply_err(req, EBADF);
1918 return;
1921 err = lo_change_cred(req, &old);
1922 if (err) {
1923 goto out;
1926 update_open_flags(lo->writeback, lo->allow_direct_io, fi);
1928 /* Try to create a new file but don't open existing files */
1929 fd = openat(parent_inode->fd, name, fi->flags | O_CREAT | O_EXCL, mode);
1930 err = fd == -1 ? errno : 0;
1932 lo_restore_cred(&old);
1934 /* Ignore the error if file exists and O_EXCL was not given */
1935 if (err && (err != EEXIST || (fi->flags & O_EXCL))) {
1936 goto out;
1939 err = lo_do_lookup(req, parent, name, &e, &inode);
1940 if (err) {
1941 goto out;
1944 err = lo_do_open(lo, inode, fd, fi);
1945 fd = -1; /* lo_do_open() takes ownership of fd */
1946 if (err) {
1947 /* Undo lo_do_lookup() nlookup ref */
1948 unref_inode_lolocked(lo, inode, 1);
1951 out:
1952 lo_inode_put(lo, &inode);
1953 lo_inode_put(lo, &parent_inode);
1955 if (err) {
1956 if (fd >= 0) {
1957 close(fd);
1960 fuse_reply_err(req, err);
1961 } else {
1962 fuse_reply_create(req, &e, fi);
1966 /* Should be called with inode->plock_mutex held */
1967 static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
1968 struct lo_inode *inode,
1969 uint64_t lock_owner,
1970 pid_t pid, int *err)
1972 struct lo_inode_plock *plock;
1973 int fd;
1975 plock =
1976 g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner));
1978 if (plock) {
1979 return plock;
1982 plock = malloc(sizeof(struct lo_inode_plock));
1983 if (!plock) {
1984 *err = ENOMEM;
1985 return NULL;
1988 /* Open another instance of file which can be used for ofd locks. */
1989 /* TODO: What if file is not writable? */
1990 fd = lo_inode_open(lo, inode, O_RDWR);
1991 if (fd < 0) {
1992 *err = -fd;
1993 free(plock);
1994 return NULL;
1997 plock->lock_owner = lock_owner;
1998 plock->fd = fd;
1999 g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner),
2000 plock);
2001 return plock;
2004 static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
2005 struct flock *lock)
2007 struct lo_data *lo = lo_data(req);
2008 struct lo_inode *inode;
2009 struct lo_inode_plock *plock;
2010 int ret, saverr = 0;
2012 fuse_log(FUSE_LOG_DEBUG,
2013 "lo_getlk(ino=%" PRIu64 ", flags=%d)"
2014 " owner=0x%lx, l_type=%d l_start=0x%lx"
2015 " l_len=0x%lx\n",
2016 ino, fi->flags, fi->lock_owner, lock->l_type, lock->l_start,
2017 lock->l_len);
2019 if (!lo->posix_lock) {
2020 fuse_reply_err(req, ENOSYS);
2021 return;
2024 inode = lo_inode(req, ino);
2025 if (!inode) {
2026 fuse_reply_err(req, EBADF);
2027 return;
2030 pthread_mutex_lock(&inode->plock_mutex);
2031 plock =
2032 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
2033 if (!plock) {
2034 saverr = ret;
2035 goto out;
2038 ret = fcntl(plock->fd, F_OFD_GETLK, lock);
2039 if (ret == -1) {
2040 saverr = errno;
2043 out:
2044 pthread_mutex_unlock(&inode->plock_mutex);
2045 lo_inode_put(lo, &inode);
2047 if (saverr) {
2048 fuse_reply_err(req, saverr);
2049 } else {
2050 fuse_reply_lock(req, lock);
2054 static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
2055 struct flock *lock, int sleep)
2057 struct lo_data *lo = lo_data(req);
2058 struct lo_inode *inode;
2059 struct lo_inode_plock *plock;
2060 int ret, saverr = 0;
2062 fuse_log(FUSE_LOG_DEBUG,
2063 "lo_setlk(ino=%" PRIu64 ", flags=%d)"
2064 " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d"
2065 " l_start=0x%lx l_len=0x%lx\n",
2066 ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
2067 lock->l_whence, lock->l_start, lock->l_len);
2069 if (!lo->posix_lock) {
2070 fuse_reply_err(req, ENOSYS);
2071 return;
2074 if (sleep) {
2075 fuse_reply_err(req, EOPNOTSUPP);
2076 return;
2079 inode = lo_inode(req, ino);
2080 if (!inode) {
2081 fuse_reply_err(req, EBADF);
2082 return;
2085 pthread_mutex_lock(&inode->plock_mutex);
2086 plock =
2087 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
2089 if (!plock) {
2090 saverr = ret;
2091 goto out;
2094 /* TODO: Is it alright to modify flock? */
2095 lock->l_pid = 0;
2096 ret = fcntl(plock->fd, F_OFD_SETLK, lock);
2097 if (ret == -1) {
2098 saverr = errno;
2101 out:
2102 pthread_mutex_unlock(&inode->plock_mutex);
2103 lo_inode_put(lo, &inode);
2105 fuse_reply_err(req, saverr);
2108 static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
2109 struct fuse_file_info *fi)
2111 int res;
2112 struct lo_dirp *d;
2113 int fd;
2115 (void)ino;
2117 d = lo_dirp(req, fi);
2118 if (!d) {
2119 fuse_reply_err(req, EBADF);
2120 return;
2123 fd = dirfd(d->dp);
2124 if (datasync) {
2125 res = fdatasync(fd);
2126 } else {
2127 res = fsync(fd);
2130 lo_dirp_put(&d);
2132 fuse_reply_err(req, res == -1 ? errno : 0);
2135 static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
2137 struct lo_data *lo = lo_data(req);
2138 struct lo_inode *inode = lo_inode(req, ino);
2139 int err;
2141 fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d, kill_priv=%d)"
2142 "\n", ino, fi->flags, fi->kill_priv);
2144 if (!inode) {
2145 fuse_reply_err(req, EBADF);
2146 return;
2149 err = lo_do_open(lo, inode, -1, fi);
2150 lo_inode_put(lo, &inode);
2151 if (err) {
2152 fuse_reply_err(req, err);
2153 } else {
2154 fuse_reply_open(req, fi);
2158 static void lo_release(fuse_req_t req, fuse_ino_t ino,
2159 struct fuse_file_info *fi)
2161 struct lo_data *lo = lo_data(req);
2162 struct lo_map_elem *elem;
2163 int fd = -1;
2165 (void)ino;
2167 pthread_mutex_lock(&lo->mutex);
2168 elem = lo_map_get(&lo->fd_map, fi->fh);
2169 if (elem) {
2170 fd = elem->fd;
2171 elem = NULL;
2172 lo_map_remove(&lo->fd_map, fi->fh);
2174 pthread_mutex_unlock(&lo->mutex);
2176 close(fd);
2177 fuse_reply_err(req, 0);
2180 static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
2182 int res;
2183 (void)ino;
2184 struct lo_inode *inode;
2185 struct lo_data *lo = lo_data(req);
2187 inode = lo_inode(req, ino);
2188 if (!inode) {
2189 fuse_reply_err(req, EBADF);
2190 return;
2193 if (!S_ISREG(inode->filetype)) {
2194 lo_inode_put(lo, &inode);
2195 fuse_reply_err(req, EBADF);
2196 return;
2199 /* An fd is going away. Cleanup associated posix locks */
2200 if (lo->posix_lock) {
2201 pthread_mutex_lock(&inode->plock_mutex);
2202 g_hash_table_remove(inode->posix_locks,
2203 GUINT_TO_POINTER(fi->lock_owner));
2204 pthread_mutex_unlock(&inode->plock_mutex);
2206 res = close(dup(lo_fi_fd(req, fi)));
2207 lo_inode_put(lo, &inode);
2208 fuse_reply_err(req, res == -1 ? errno : 0);
2211 static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
2212 struct fuse_file_info *fi)
2214 struct lo_inode *inode = lo_inode(req, ino);
2215 struct lo_data *lo = lo_data(req);
2216 int res;
2217 int fd;
2219 fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino,
2220 (void *)fi);
2222 if (!inode) {
2223 fuse_reply_err(req, EBADF);
2224 return;
2227 if (!fi) {
2228 fd = lo_inode_open(lo, inode, O_RDWR);
2229 if (fd < 0) {
2230 res = -fd;
2231 goto out;
2233 } else {
2234 fd = lo_fi_fd(req, fi);
2237 if (datasync) {
2238 res = fdatasync(fd) == -1 ? errno : 0;
2239 } else {
2240 res = fsync(fd) == -1 ? errno : 0;
2242 if (!fi) {
2243 close(fd);
2245 out:
2246 lo_inode_put(lo, &inode);
2247 fuse_reply_err(req, res);
2250 static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset,
2251 struct fuse_file_info *fi)
2253 struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size);
2255 fuse_log(FUSE_LOG_DEBUG,
2256 "lo_read(ino=%" PRIu64 ", size=%zd, "
2257 "off=%lu)\n",
2258 ino, size, (unsigned long)offset);
2260 buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
2261 buf.buf[0].fd = lo_fi_fd(req, fi);
2262 buf.buf[0].pos = offset;
2264 fuse_reply_data(req, &buf);
2267 static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
2268 struct fuse_bufvec *in_buf, off_t off,
2269 struct fuse_file_info *fi)
2271 (void)ino;
2272 ssize_t res;
2273 struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf));
2274 bool cap_fsetid_dropped = false;
2276 out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
2277 out_buf.buf[0].fd = lo_fi_fd(req, fi);
2278 out_buf.buf[0].pos = off;
2280 fuse_log(FUSE_LOG_DEBUG,
2281 "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu kill_priv=%d)\n",
2282 ino, out_buf.buf[0].size, (unsigned long)off, fi->kill_priv);
2284 res = drop_security_capability(lo_data(req), out_buf.buf[0].fd);
2285 if (res) {
2286 fuse_reply_err(req, res);
2287 return;
2291 * If kill_priv is set, drop CAP_FSETID which should lead to kernel
2292 * clearing setuid/setgid on file. Note, for WRITE, we need to do
2293 * this even if killpriv_v2 is not enabled. fuse direct write path
2294 * relies on this.
2296 if (fi->kill_priv) {
2297 res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
2298 if (res != 0) {
2299 fuse_reply_err(req, res);
2300 return;
2304 res = fuse_buf_copy(&out_buf, in_buf);
2305 if (res < 0) {
2306 fuse_reply_err(req, -res);
2307 } else {
2308 fuse_reply_write(req, (size_t)res);
2311 if (cap_fsetid_dropped) {
2312 res = gain_effective_cap("FSETID");
2313 if (res) {
2314 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
2319 static void lo_statfs(fuse_req_t req, fuse_ino_t ino)
2321 int res;
2322 struct statvfs stbuf;
2324 res = fstatvfs(lo_fd(req, ino), &stbuf);
2325 if (res == -1) {
2326 fuse_reply_err(req, errno);
2327 } else {
2328 fuse_reply_statfs(req, &stbuf);
2332 static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
2333 off_t length, struct fuse_file_info *fi)
2335 int err = EOPNOTSUPP;
2336 (void)ino;
2338 #ifdef CONFIG_FALLOCATE
2339 err = fallocate(lo_fi_fd(req, fi), mode, offset, length);
2340 if (err < 0) {
2341 err = errno;
2344 #elif defined(CONFIG_POSIX_FALLOCATE)
2345 if (mode) {
2346 fuse_reply_err(req, EOPNOTSUPP);
2347 return;
2350 err = posix_fallocate(lo_fi_fd(req, fi), offset, length);
2351 #endif
2353 fuse_reply_err(req, err);
2356 static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
2357 int op)
2359 int res;
2360 (void)ino;
2362 res = flock(lo_fi_fd(req, fi), op);
2364 fuse_reply_err(req, res == -1 ? errno : 0);
/*
 * Flag bits stored in XattrMapEntry.flags.  The low bits select what a
 * rule does (its type), bits 16 and up select which operations it applies
 * to (its scope).
 */

/* types */
/*
 * Exit; process attribute unmodified if matched.
 * An empty key applies to all.
 */
#define XATTR_MAP_FLAG_OK (1 << 0)
/*
 * The attribute is unwanted;
 * EPERM on write, hidden on read.
 */
#define XATTR_MAP_FLAG_BAD (1 << 1)
/*
 * For attr that start with 'key' prepend 'prepend'
 * 'key' may be empty to prepend for all attrs
 * key is defined from set/remove point of view.
 * Automatically reversed on read
 */
#define XATTR_MAP_FLAG_PREFIX (1 << 2)

/* scopes */
/* Apply rule to get/set/remove */
#define XATTR_MAP_FLAG_CLIENT (1 << 16)
/* Apply rule to list */
#define XATTR_MAP_FLAG_SERVER (1 << 17)
/* Apply rule to all */
#define XATTR_MAP_FLAG_ALL (XATTR_MAP_FLAG_SERVER | XATTR_MAP_FLAG_CLIENT)
2394 static void add_xattrmap_entry(struct lo_data *lo,
2395 const XattrMapEntry *new_entry)
2397 XattrMapEntry *res = g_realloc_n(lo->xattr_map_list,
2398 lo->xattr_map_nentries + 1,
2399 sizeof(XattrMapEntry));
2400 res[lo->xattr_map_nentries++] = *new_entry;
2402 lo->xattr_map_list = res;
2405 static void free_xattrmap(struct lo_data *lo)
2407 XattrMapEntry *map = lo->xattr_map_list;
2408 size_t i;
2410 if (!map) {
2411 return;
2414 for (i = 0; i < lo->xattr_map_nentries; i++) {
2415 g_free(map[i].key);
2416 g_free(map[i].prepend);
2419 g_free(map);
2420 lo->xattr_map_list = NULL;
2421 lo->xattr_map_nentries = -1;
2425 * Handle the 'map' type, which is sugar for a set of commands
2426 * for the common case of prefixing a subset or everything,
2427 * and allowing anything not prefixed through.
2428 * It must be the last entry in the stream, although there
2429 * can be other entries before it.
2430 * The form is:
2431 * :map:key:prefix:
2433 * key maybe empty in which case all entries are prefixed.
2435 static void parse_xattrmap_map(struct lo_data *lo,
2436 const char *rule, char sep)
2438 const char *tmp;
2439 char *key;
2440 char *prefix;
2441 XattrMapEntry tmp_entry;
2443 if (*rule != sep) {
2444 fuse_log(FUSE_LOG_ERR,
2445 "%s: Expecting '%c' after 'map' keyword, found '%c'\n",
2446 __func__, sep, *rule);
2447 exit(1);
2450 rule++;
2452 /* At start of 'key' field */
2453 tmp = strchr(rule, sep);
2454 if (!tmp) {
2455 fuse_log(FUSE_LOG_ERR,
2456 "%s: Missing '%c' at end of key field in map rule\n",
2457 __func__, sep);
2458 exit(1);
2461 key = g_strndup(rule, tmp - rule);
2462 rule = tmp + 1;
2464 /* At start of prefix field */
2465 tmp = strchr(rule, sep);
2466 if (!tmp) {
2467 fuse_log(FUSE_LOG_ERR,
2468 "%s: Missing '%c' at end of prefix field in map rule\n",
2469 __func__, sep);
2470 exit(1);
2473 prefix = g_strndup(rule, tmp - rule);
2474 rule = tmp + 1;
2477 * This should be the end of the string, we don't allow
2478 * any more commands after 'map'.
2480 if (*rule) {
2481 fuse_log(FUSE_LOG_ERR,
2482 "%s: Expecting end of command after map, found '%c'\n",
2483 __func__, *rule);
2484 exit(1);
2487 /* 1st: Prefix matches/everything */
2488 tmp_entry.flags = XATTR_MAP_FLAG_PREFIX | XATTR_MAP_FLAG_ALL;
2489 tmp_entry.key = g_strdup(key);
2490 tmp_entry.prepend = g_strdup(prefix);
2491 add_xattrmap_entry(lo, &tmp_entry);
2493 if (!*key) {
2494 /* Prefix all case */
2496 /* 2nd: Hide any non-prefixed entries on the host */
2497 tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_ALL;
2498 tmp_entry.key = g_strdup("");
2499 tmp_entry.prepend = g_strdup("");
2500 add_xattrmap_entry(lo, &tmp_entry);
2501 } else {
2502 /* Prefix matching case */
2504 /* 2nd: Hide non-prefixed but matching entries on the host */
2505 tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_SERVER;
2506 tmp_entry.key = g_strdup(""); /* Not used */
2507 tmp_entry.prepend = g_strdup(key);
2508 add_xattrmap_entry(lo, &tmp_entry);
2510 /* 3rd: Stop the client accessing prefixed attributes directly */
2511 tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_CLIENT;
2512 tmp_entry.key = g_strdup(prefix);
2513 tmp_entry.prepend = g_strdup(""); /* Not used */
2514 add_xattrmap_entry(lo, &tmp_entry);
2516 /* 4th: Everything else is OK */
2517 tmp_entry.flags = XATTR_MAP_FLAG_OK | XATTR_MAP_FLAG_ALL;
2518 tmp_entry.key = g_strdup("");
2519 tmp_entry.prepend = g_strdup("");
2520 add_xattrmap_entry(lo, &tmp_entry);
2523 g_free(key);
2524 g_free(prefix);
2527 static void parse_xattrmap(struct lo_data *lo)
2529 const char *map = lo->xattrmap;
2530 const char *tmp;
2531 int ret;
2533 lo->xattr_map_nentries = 0;
2534 while (*map) {
2535 XattrMapEntry tmp_entry;
2536 char sep;
2538 if (isspace(*map)) {
2539 map++;
2540 continue;
2542 /* The separator is the first non-space of the rule */
2543 sep = *map++;
2544 if (!sep) {
2545 break;
2548 tmp_entry.flags = 0;
2549 /* Start of 'type' */
2550 if (strstart(map, "prefix", &map)) {
2551 tmp_entry.flags |= XATTR_MAP_FLAG_PREFIX;
2552 } else if (strstart(map, "ok", &map)) {
2553 tmp_entry.flags |= XATTR_MAP_FLAG_OK;
2554 } else if (strstart(map, "bad", &map)) {
2555 tmp_entry.flags |= XATTR_MAP_FLAG_BAD;
2556 } else if (strstart(map, "map", &map)) {
2558 * map is sugar that adds a number of rules, and must be
2559 * the last entry.
2561 parse_xattrmap_map(lo, map, sep);
2562 break;
2563 } else {
2564 fuse_log(FUSE_LOG_ERR,
2565 "%s: Unexpected type;"
2566 "Expecting 'prefix', 'ok', 'bad' or 'map' in rule %zu\n",
2567 __func__, lo->xattr_map_nentries);
2568 exit(1);
2571 if (*map++ != sep) {
2572 fuse_log(FUSE_LOG_ERR,
2573 "%s: Missing '%c' at end of type field of rule %zu\n",
2574 __func__, sep, lo->xattr_map_nentries);
2575 exit(1);
2578 /* Start of 'scope' */
2579 if (strstart(map, "client", &map)) {
2580 tmp_entry.flags |= XATTR_MAP_FLAG_CLIENT;
2581 } else if (strstart(map, "server", &map)) {
2582 tmp_entry.flags |= XATTR_MAP_FLAG_SERVER;
2583 } else if (strstart(map, "all", &map)) {
2584 tmp_entry.flags |= XATTR_MAP_FLAG_ALL;
2585 } else {
2586 fuse_log(FUSE_LOG_ERR,
2587 "%s: Unexpected scope;"
2588 " Expecting 'client', 'server', or 'all', in rule %zu\n",
2589 __func__, lo->xattr_map_nentries);
2590 exit(1);
2593 if (*map++ != sep) {
2594 fuse_log(FUSE_LOG_ERR,
2595 "%s: Expecting '%c' found '%c'"
2596 " after scope in rule %zu\n",
2597 __func__, sep, *map, lo->xattr_map_nentries);
2598 exit(1);
2601 /* At start of 'key' field */
2602 tmp = strchr(map, sep);
2603 if (!tmp) {
2604 fuse_log(FUSE_LOG_ERR,
2605 "%s: Missing '%c' at end of key field of rule %zu",
2606 __func__, sep, lo->xattr_map_nentries);
2607 exit(1);
2609 tmp_entry.key = g_strndup(map, tmp - map);
2610 map = tmp + 1;
2612 /* At start of 'prepend' field */
2613 tmp = strchr(map, sep);
2614 if (!tmp) {
2615 fuse_log(FUSE_LOG_ERR,
2616 "%s: Missing '%c' at end of prepend field of rule %zu",
2617 __func__, sep, lo->xattr_map_nentries);
2618 exit(1);
2620 tmp_entry.prepend = g_strndup(map, tmp - map);
2621 map = tmp + 1;
2623 add_xattrmap_entry(lo, &tmp_entry);
2624 /* End of rule - go around again for another rule */
2627 if (!lo->xattr_map_nentries) {
2628 fuse_log(FUSE_LOG_ERR, "Empty xattr map\n");
2629 exit(1);
2632 ret = xattr_map_client(lo, "security.capability",
2633 &lo->xattr_security_capability);
2634 if (ret) {
2635 fuse_log(FUSE_LOG_ERR, "Failed to map security.capability: %s\n",
2636 strerror(ret));
2637 exit(1);
2639 if (!lo->xattr_security_capability ||
2640 !strcmp(lo->xattr_security_capability, "security.capability")) {
2641 /* 1-1 mapping, don't need to do anything */
2642 free(lo->xattr_security_capability);
2643 lo->xattr_security_capability = NULL;
2648 * For use with getxattr/setxattr/removexattr, where the client
2649 * gives us a name and we may need to choose a different one.
2650 * Allocates a buffer for the result placing it in *out_name.
2651 * If there's no change then *out_name is not set.
2652 * Returns 0 on success
2653 * Can return -EPERM to indicate we block a given attribute
2654 * (in which case out_name is not allocated)
2655 * Can return -ENOMEM to indicate out_name couldn't be allocated.
2657 static int xattr_map_client(const struct lo_data *lo, const char *client_name,
2658 char **out_name)
2660 size_t i;
2661 for (i = 0; i < lo->xattr_map_nentries; i++) {
2662 const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
2664 if ((cur_entry->flags & XATTR_MAP_FLAG_CLIENT) &&
2665 (strstart(client_name, cur_entry->key, NULL))) {
2666 if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
2667 return -EPERM;
2669 if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
2670 /* Unmodified name */
2671 return 0;
2673 if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
2674 *out_name = g_try_malloc(strlen(client_name) +
2675 strlen(cur_entry->prepend) + 1);
2676 if (!*out_name) {
2677 return -ENOMEM;
2679 sprintf(*out_name, "%s%s", cur_entry->prepend, client_name);
2680 return 0;
2685 return -EPERM;
2689 * For use with listxattr where the server fs gives us a name and we may need
2690 * to sanitize this for the client.
2691 * Returns a pointer to the result in *out_name
2692 * This is always the original string or the current string with some prefix
2693 * removed; no reallocation is done.
2694 * Returns 0 on success
2695 * Can return -ENODATA to indicate the name should be dropped from the list.
2697 static int xattr_map_server(const struct lo_data *lo, const char *server_name,
2698 const char **out_name)
2700 size_t i;
2701 const char *end;
2703 for (i = 0; i < lo->xattr_map_nentries; i++) {
2704 const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
2706 if ((cur_entry->flags & XATTR_MAP_FLAG_SERVER) &&
2707 (strstart(server_name, cur_entry->prepend, &end))) {
2708 if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
2709 return -ENODATA;
2711 if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
2712 *out_name = server_name;
2713 return 0;
2715 if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
2716 /* Remove prefix */
2717 *out_name = end;
2718 return 0;
2723 return -ENODATA;
2726 static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
2727 size_t size)
2729 struct lo_data *lo = lo_data(req);
2730 char *value = NULL;
2731 char procname[64];
2732 const char *name;
2733 char *mapped_name;
2734 struct lo_inode *inode;
2735 ssize_t ret;
2736 int saverr;
2737 int fd = -1;
2739 mapped_name = NULL;
2740 name = in_name;
2741 if (lo->xattrmap) {
2742 ret = xattr_map_client(lo, in_name, &mapped_name);
2743 if (ret < 0) {
2744 if (ret == -EPERM) {
2745 ret = -ENODATA;
2747 fuse_reply_err(req, -ret);
2748 return;
2750 if (mapped_name) {
2751 name = mapped_name;
2755 inode = lo_inode(req, ino);
2756 if (!inode) {
2757 fuse_reply_err(req, EBADF);
2758 g_free(mapped_name);
2759 return;
2762 saverr = ENOSYS;
2763 if (!lo_data(req)->xattr) {
2764 goto out;
2767 fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n",
2768 ino, name, size);
2770 if (size) {
2771 value = malloc(size);
2772 if (!value) {
2773 goto out_err;
2777 sprintf(procname, "%i", inode->fd);
2779 * It is not safe to open() non-regular/non-dir files in file server
2780 * unless O_PATH is used, so use that method for regular files/dir
2781 * only (as it seems giving less performance overhead).
2782 * Otherwise, call fchdir() to avoid open().
2784 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2785 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2786 if (fd < 0) {
2787 goto out_err;
2789 ret = fgetxattr(fd, name, value, size);
2790 } else {
2791 /* fchdir should not fail here */
2792 assert(fchdir(lo->proc_self_fd) == 0);
2793 ret = getxattr(procname, name, value, size);
2794 assert(fchdir(lo->root.fd) == 0);
2797 if (ret == -1) {
2798 goto out_err;
2800 if (size) {
2801 saverr = 0;
2802 if (ret == 0) {
2803 goto out;
2805 fuse_reply_buf(req, value, ret);
2806 } else {
2807 fuse_reply_xattr(req, ret);
2809 out_free:
2810 free(value);
2812 if (fd >= 0) {
2813 close(fd);
2816 lo_inode_put(lo, &inode);
2817 return;
2819 out_err:
2820 saverr = errno;
2821 out:
2822 fuse_reply_err(req, saverr);
2823 g_free(mapped_name);
2824 goto out_free;
2827 static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
2829 struct lo_data *lo = lo_data(req);
2830 char *value = NULL;
2831 char procname[64];
2832 struct lo_inode *inode;
2833 ssize_t ret;
2834 int saverr;
2835 int fd = -1;
2837 inode = lo_inode(req, ino);
2838 if (!inode) {
2839 fuse_reply_err(req, EBADF);
2840 return;
2843 saverr = ENOSYS;
2844 if (!lo_data(req)->xattr) {
2845 goto out;
2848 fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino,
2849 size);
2851 if (size) {
2852 value = malloc(size);
2853 if (!value) {
2854 goto out_err;
2858 sprintf(procname, "%i", inode->fd);
2859 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2860 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2861 if (fd < 0) {
2862 goto out_err;
2864 ret = flistxattr(fd, value, size);
2865 } else {
2866 /* fchdir should not fail here */
2867 assert(fchdir(lo->proc_self_fd) == 0);
2868 ret = listxattr(procname, value, size);
2869 assert(fchdir(lo->root.fd) == 0);
2872 if (ret == -1) {
2873 goto out_err;
2875 if (size) {
2876 saverr = 0;
2877 if (ret == 0) {
2878 goto out;
2881 if (lo->xattr_map_list) {
2883 * Map the names back, some attributes might be dropped,
2884 * some shortened, but not increased, so we shouldn't
2885 * run out of room.
2887 size_t out_index, in_index;
2888 out_index = 0;
2889 in_index = 0;
2890 while (in_index < ret) {
2891 const char *map_out;
2892 char *in_ptr = value + in_index;
2893 /* Length of current attribute name */
2894 size_t in_len = strlen(value + in_index) + 1;
2896 int mapret = xattr_map_server(lo, in_ptr, &map_out);
2897 if (mapret != -ENODATA && mapret != 0) {
2898 /* Shouldn't happen */
2899 saverr = -mapret;
2900 goto out;
2902 if (mapret == 0) {
2903 /* Either unchanged, or truncated */
2904 size_t out_len;
2905 if (map_out != in_ptr) {
2906 /* +1 copies the NIL */
2907 out_len = strlen(map_out) + 1;
2908 } else {
2909 /* No change */
2910 out_len = in_len;
2913 * Move result along, may still be needed for an unchanged
2914 * entry if a previous entry was changed.
2916 memmove(value + out_index, map_out, out_len);
2918 out_index += out_len;
2920 in_index += in_len;
2922 ret = out_index;
2923 if (ret == 0) {
2924 goto out;
2927 fuse_reply_buf(req, value, ret);
2928 } else {
2930 * xattrmap only ever shortens the result,
2931 * so we don't need to do anything clever with the
2932 * allocation length here.
2934 fuse_reply_xattr(req, ret);
2936 out_free:
2937 free(value);
2939 if (fd >= 0) {
2940 close(fd);
2943 lo_inode_put(lo, &inode);
2944 return;
2946 out_err:
2947 saverr = errno;
2948 out:
2949 fuse_reply_err(req, saverr);
2950 goto out_free;
2953 static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
2954 const char *value, size_t size, int flags)
2956 char procname[64];
2957 const char *name;
2958 char *mapped_name;
2959 struct lo_data *lo = lo_data(req);
2960 struct lo_inode *inode;
2961 ssize_t ret;
2962 int saverr;
2963 int fd = -1;
2965 mapped_name = NULL;
2966 name = in_name;
2967 if (lo->xattrmap) {
2968 ret = xattr_map_client(lo, in_name, &mapped_name);
2969 if (ret < 0) {
2970 fuse_reply_err(req, -ret);
2971 return;
2973 if (mapped_name) {
2974 name = mapped_name;
2978 inode = lo_inode(req, ino);
2979 if (!inode) {
2980 fuse_reply_err(req, EBADF);
2981 g_free(mapped_name);
2982 return;
2985 saverr = ENOSYS;
2986 if (!lo_data(req)->xattr) {
2987 goto out;
2990 fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64
2991 ", name=%s value=%s size=%zd)\n", ino, name, value, size);
2993 sprintf(procname, "%i", inode->fd);
2994 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2995 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2996 if (fd < 0) {
2997 saverr = errno;
2998 goto out;
3000 ret = fsetxattr(fd, name, value, size, flags);
3001 } else {
3002 /* fchdir should not fail here */
3003 assert(fchdir(lo->proc_self_fd) == 0);
3004 ret = setxattr(procname, name, value, size, flags);
3005 assert(fchdir(lo->root.fd) == 0);
3008 saverr = ret == -1 ? errno : 0;
3010 out:
3011 if (fd >= 0) {
3012 close(fd);
3015 lo_inode_put(lo, &inode);
3016 g_free(mapped_name);
3017 fuse_reply_err(req, saverr);
3020 static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name)
3022 char procname[64];
3023 const char *name;
3024 char *mapped_name;
3025 struct lo_data *lo = lo_data(req);
3026 struct lo_inode *inode;
3027 ssize_t ret;
3028 int saverr;
3029 int fd = -1;
3031 mapped_name = NULL;
3032 name = in_name;
3033 if (lo->xattrmap) {
3034 ret = xattr_map_client(lo, in_name, &mapped_name);
3035 if (ret < 0) {
3036 fuse_reply_err(req, -ret);
3037 return;
3039 if (mapped_name) {
3040 name = mapped_name;
3044 inode = lo_inode(req, ino);
3045 if (!inode) {
3046 fuse_reply_err(req, EBADF);
3047 g_free(mapped_name);
3048 return;
3051 saverr = ENOSYS;
3052 if (!lo_data(req)->xattr) {
3053 goto out;
3056 fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino,
3057 name);
3059 sprintf(procname, "%i", inode->fd);
3060 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
3061 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
3062 if (fd < 0) {
3063 saverr = errno;
3064 goto out;
3066 ret = fremovexattr(fd, name);
3067 } else {
3068 /* fchdir should not fail here */
3069 assert(fchdir(lo->proc_self_fd) == 0);
3070 ret = removexattr(procname, name);
3071 assert(fchdir(lo->root.fd) == 0);
3074 saverr = ret == -1 ? errno : 0;
3076 out:
3077 if (fd >= 0) {
3078 close(fd);
3081 lo_inode_put(lo, &inode);
3082 g_free(mapped_name);
3083 fuse_reply_err(req, saverr);
#ifdef HAVE_COPY_FILE_RANGE
/*
 * FUSE copy_file_range handler.
 *
 * Copies up to 'len' bytes between the two open files using
 * copy_file_range(2) and replies with the number of bytes copied, or with
 * errno on failure.
 */
static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
                               struct fuse_file_info *fi_in, fuse_ino_t ino_out,
                               off_t off_out, struct fuse_file_info *fi_out,
                               size_t len, int flags)
{
    int in_fd, out_fd;
    ssize_t res;

    in_fd = lo_fi_fd(req, fi_in);
    out_fd = lo_fi_fd(req, fi_out);

    /*
     * off_t is signed and its width depends on the build, so print it via
     * intmax_t rather than assuming it matches unsigned long.
     */
    fuse_log(FUSE_LOG_DEBUG,
             "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, "
             "off=%jd, ino=%" PRIu64 "/fd=%d, "
             "off=%jd, size=%zu, flags=0x%x)\n",
             ino_in, in_fd, (intmax_t)off_in, ino_out, out_fd,
             (intmax_t)off_out, len, flags);

    res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags);
    if (res < 0) {
        fuse_reply_err(req, errno);
    } else {
        fuse_reply_write(req, res);
    }
}
#endif
3113 static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
3114 struct fuse_file_info *fi)
3116 off_t res;
3118 (void)ino;
3119 res = lseek(lo_fi_fd(req, fi), off, whence);
3120 if (res != -1) {
3121 fuse_reply_lseek(req, res);
3122 } else {
3123 fuse_reply_err(req, errno);
3127 static void lo_destroy(void *userdata)
3129 struct lo_data *lo = (struct lo_data *)userdata;
3131 pthread_mutex_lock(&lo->mutex);
3132 while (true) {
3133 GHashTableIter iter;
3134 gpointer key, value;
3136 g_hash_table_iter_init(&iter, lo->inodes);
3137 if (!g_hash_table_iter_next(&iter, &key, &value)) {
3138 break;
3141 struct lo_inode *inode = value;
3142 unref_inode(lo, inode, inode->nlookup);
3144 pthread_mutex_unlock(&lo->mutex);
3147 static struct fuse_lowlevel_ops lo_oper = {
3148 .init = lo_init,
3149 .lookup = lo_lookup,
3150 .mkdir = lo_mkdir,
3151 .mknod = lo_mknod,
3152 .symlink = lo_symlink,
3153 .link = lo_link,
3154 .unlink = lo_unlink,
3155 .rmdir = lo_rmdir,
3156 .rename = lo_rename,
3157 .forget = lo_forget,
3158 .forget_multi = lo_forget_multi,
3159 .getattr = lo_getattr,
3160 .setattr = lo_setattr,
3161 .readlink = lo_readlink,
3162 .opendir = lo_opendir,
3163 .readdir = lo_readdir,
3164 .readdirplus = lo_readdirplus,
3165 .releasedir = lo_releasedir,
3166 .fsyncdir = lo_fsyncdir,
3167 .create = lo_create,
3168 .getlk = lo_getlk,
3169 .setlk = lo_setlk,
3170 .open = lo_open,
3171 .release = lo_release,
3172 .flush = lo_flush,
3173 .fsync = lo_fsync,
3174 .read = lo_read,
3175 .write_buf = lo_write_buf,
3176 .statfs = lo_statfs,
3177 .fallocate = lo_fallocate,
3178 .flock = lo_flock,
3179 .getxattr = lo_getxattr,
3180 .listxattr = lo_listxattr,
3181 .setxattr = lo_setxattr,
3182 .removexattr = lo_removexattr,
3183 #ifdef HAVE_COPY_FILE_RANGE
3184 .copy_file_range = lo_copy_file_range,
3185 #endif
3186 .lseek = lo_lseek,
3187 .destroy = lo_destroy,
/* Print vhost-user.json backend program capabilities to stdout */
static void print_capabilities(void)
{
    printf("{\n"
           " \"type\": \"fs\"\n"
           "}\n");
}
3199 * Drop all Linux capabilities because the wait parent process only needs to
3200 * sit in waitpid(2) and terminate.
3202 static void setup_wait_parent_capabilities(void)
3204 capng_setpid(syscall(SYS_gettid));
3205 capng_clear(CAPNG_SELECT_BOTH);
3206 capng_apply(CAPNG_SELECT_BOTH);
3210 * Move to a new mount, net, and pid namespaces to isolate this process.
3212 static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
3214 pid_t child;
3217 * Create a new pid namespace for *child* processes. We'll have to
3218 * fork in order to enter the new pid namespace. A new mount namespace
3219 * is also needed so that we can remount /proc for the new pid
3220 * namespace.
3222 * Our UNIX domain sockets have been created. Now we can move to
3223 * an empty network namespace to prevent TCP/IP and other network
3224 * activity in case this process is compromised.
3226 if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) {
3227 fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
3228 exit(1);
3231 child = fork();
3232 if (child < 0) {
3233 fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
3234 exit(1);
3236 if (child > 0) {
3237 pid_t waited;
3238 int wstatus;
3240 setup_wait_parent_capabilities();
3242 /* The parent waits for the child */
3243 do {
3244 waited = waitpid(child, &wstatus, 0);
3245 } while (waited < 0 && errno == EINTR && !se->exited);
3247 /* We were terminated by a signal, see fuse_signals.c */
3248 if (se->exited) {
3249 exit(0);
3252 if (WIFEXITED(wstatus)) {
3253 exit(WEXITSTATUS(wstatus));
3256 exit(1);
3259 /* Send us SIGTERM when the parent thread terminates, see prctl(2) */
3260 prctl(PR_SET_PDEATHSIG, SIGTERM);
3263 * If the mounts have shared propagation then we want to opt out so our
3264 * mount changes don't affect the parent mount namespace.
3266 if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) {
3267 fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
3268 exit(1);
3271 /* The child must remount /proc to use the new pid namespace */
3272 if (mount("proc", "/proc", "proc",
3273 MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
3274 fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
3275 exit(1);
3279 * We only need /proc/self/fd. Prevent ".." from accessing parent
3280 * directories of /proc/self/fd by bind-mounting it over /proc. Since / was
3281 * previously remounted with MS_REC | MS_SLAVE this mount change only
3282 * affects our process.
3284 if (mount("/proc/self/fd", "/proc", NULL, MS_BIND, NULL) < 0) {
3285 fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, MS_BIND): %m\n");
3286 exit(1);
3289 /* Get the /proc (actually /proc/self/fd, see above) file descriptor */
3290 lo->proc_self_fd = open("/proc", O_PATH);
3291 if (lo->proc_self_fd == -1) {
3292 fuse_log(FUSE_LOG_ERR, "open(/proc, O_PATH): %m\n");
3293 exit(1);
3298 * Capture the capability state, we'll need to restore this for individual
3299 * threads later; see load_capng.
3301 static void setup_capng(void)
3303 /* Note this accesses /proc so has to happen before the sandbox */
3304 if (capng_get_caps_process()) {
3305 fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n");
3306 exit(1);
3308 pthread_mutex_init(&cap.mutex, NULL);
3309 pthread_mutex_lock(&cap.mutex);
3310 cap.saved = capng_save_state();
3311 if (!cap.saved) {
3312 fuse_log(FUSE_LOG_ERR, "capng_save_state\n");
3313 exit(1);
3315 pthread_mutex_unlock(&cap.mutex);
3318 static void cleanup_capng(void)
3320 free(cap.saved);
3321 cap.saved = NULL;
3322 pthread_mutex_destroy(&cap.mutex);
3327 * Make the source directory our root so symlinks cannot escape and no other
3328 * files are accessible. Assumes unshare(CLONE_NEWNS) was already called.
3330 static void setup_mounts(const char *source)
3332 int oldroot;
3333 int newroot;
3335 if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) {
3336 fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
3337 exit(1);
3340 /* This magic is based on lxc's lxc_pivot_root() */
3341 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
3342 if (oldroot < 0) {
3343 fuse_log(FUSE_LOG_ERR, "open(/): %m\n");
3344 exit(1);
3347 newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
3348 if (newroot < 0) {
3349 fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source);
3350 exit(1);
3353 if (fchdir(newroot) < 0) {
3354 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
3355 exit(1);
3358 if (syscall(__NR_pivot_root, ".", ".") < 0) {
3359 fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n");
3360 exit(1);
3363 if (fchdir(oldroot) < 0) {
3364 fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n");
3365 exit(1);
3368 if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) {
3369 fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n");
3370 exit(1);
3373 if (umount2(".", MNT_DETACH) < 0) {
3374 fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n");
3375 exit(1);
3378 if (fchdir(newroot) < 0) {
3379 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
3380 exit(1);
3383 close(newroot);
3384 close(oldroot);
3388 * Only keep capabilities in allowlist that are needed for file system operation
3389 * The (possibly NULL) modcaps_in string passed in is free'd before exit.
3391 static void setup_capabilities(char *modcaps_in)
3393 char *modcaps = modcaps_in;
3394 pthread_mutex_lock(&cap.mutex);
3395 capng_restore_state(&cap.saved);
3398 * Add to allowlist file system-related capabilities that are needed for a
3399 * file server to act like root. Drop everything else like networking and
3400 * sysadmin capabilities.
3402 * Exclusions:
3403 * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
3404 * and we don't support that.
3405 * 2. CAP_MAC_OVERRIDE is not included because it only seems to be
3406 * used by the Smack LSM. Omit it until there is demand for it.
3408 capng_setpid(syscall(SYS_gettid));
3409 capng_clear(CAPNG_SELECT_BOTH);
3410 if (capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE,
3411 CAP_CHOWN,
3412 CAP_DAC_OVERRIDE,
3413 CAP_FOWNER,
3414 CAP_FSETID,
3415 CAP_SETGID,
3416 CAP_SETUID,
3417 CAP_MKNOD,
3418 CAP_SETFCAP,
3419 -1)) {
3420 fuse_log(FUSE_LOG_ERR, "%s: capng_updatev failed\n", __func__);
3421 exit(1);
3425 * The modcaps option is a colon separated list of caps,
3426 * each preceded by either + or -.
3428 while (modcaps) {
3429 capng_act_t action;
3430 int cap;
3432 char *next = strchr(modcaps, ':');
3433 if (next) {
3434 *next = '\0';
3435 next++;
3438 switch (modcaps[0]) {
3439 case '+':
3440 action = CAPNG_ADD;
3441 break;
3443 case '-':
3444 action = CAPNG_DROP;
3445 break;
3447 default:
3448 fuse_log(FUSE_LOG_ERR,
3449 "%s: Expecting '+'/'-' in modcaps but found '%c'\n",
3450 __func__, modcaps[0]);
3451 exit(1);
3453 cap = capng_name_to_capability(modcaps + 1);
3454 if (cap < 0) {
3455 fuse_log(FUSE_LOG_ERR, "%s: Unknown capability '%s'\n", __func__,
3456 modcaps);
3457 exit(1);
3459 if (capng_update(action, CAPNG_PERMITTED | CAPNG_EFFECTIVE, cap)) {
3460 fuse_log(FUSE_LOG_ERR, "%s: capng_update failed for '%s'\n",
3461 __func__, modcaps);
3462 exit(1);
3465 modcaps = next;
3467 g_free(modcaps_in);
3469 if (capng_apply(CAPNG_SELECT_BOTH)) {
3470 fuse_log(FUSE_LOG_ERR, "%s: capng_apply failed\n", __func__);
3471 exit(1);
3474 cap.saved = capng_save_state();
3475 if (!cap.saved) {
3476 fuse_log(FUSE_LOG_ERR, "%s: capng_save_state failed\n", __func__);
3477 exit(1);
3479 pthread_mutex_unlock(&cap.mutex);
3483 * Use chroot as a weaker sandbox for environments where the process is
3484 * launched without CAP_SYS_ADMIN.
3486 static void setup_chroot(struct lo_data *lo)
3488 lo->proc_self_fd = open("/proc/self/fd", O_PATH);
3489 if (lo->proc_self_fd == -1) {
3490 fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/fd\", O_PATH): %m\n");
3491 exit(1);
3495 * Make the shared directory the file system root so that FUSE_OPEN
3496 * (lo_open()) cannot escape the shared directory by opening a symlink.
3498 * The chroot(2) syscall is later disabled by seccomp and the
3499 * CAP_SYS_CHROOT capability is dropped so that tampering with the chroot
3500 * is not possible.
3502 * However, it's still possible to escape the chroot via lo->proc_self_fd
3503 * but that requires first gaining control of the process.
3505 if (chroot(lo->source) != 0) {
3506 fuse_log(FUSE_LOG_ERR, "chroot(\"%s\"): %m\n", lo->source);
3507 exit(1);
3510 /* Move into the chroot */
3511 if (chdir("/") != 0) {
3512 fuse_log(FUSE_LOG_ERR, "chdir(\"/\"): %m\n");
3513 exit(1);
3518 * Lock down this process to prevent access to other processes or files outside
3519 * source directory. This reduces the impact of arbitrary code execution bugs.
3521 static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
3522 bool enable_syslog)
3524 if (lo->sandbox == SANDBOX_NAMESPACE) {
3525 setup_namespaces(lo, se);
3526 setup_mounts(lo->source);
3527 } else {
3528 setup_chroot(lo);
3531 setup_seccomp(enable_syslog);
3532 setup_capabilities(g_strdup(lo->modcaps));
3535 /* Set the maximum number of open file descriptors */
3536 static void setup_nofile_rlimit(unsigned long rlimit_nofile)
3538 struct rlimit rlim = {
3539 .rlim_cur = rlimit_nofile,
3540 .rlim_max = rlimit_nofile,
3543 if (rlimit_nofile == 0) {
3544 return; /* nothing to do */
3547 if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
3548 /* Ignore SELinux denials */
3549 if (errno == EPERM) {
3550 return;
3553 fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n");
3554 exit(1);
3558 static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
3560 g_autofree char *localfmt = NULL;
3561 struct timespec ts;
3562 struct tm tm;
3563 char sec_fmt[sizeof "2020-12-07 18:17:54"];
3564 char zone_fmt[sizeof "+0100"];
3566 if (current_log_level < level) {
3567 return;
3570 if (current_log_level == FUSE_LOG_DEBUG) {
3571 if (use_syslog) {
3572 /* no timestamp needed */
3573 localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid),
3574 fmt);
3575 } else {
3576 /* try formatting a broken-down timestamp */
3577 if (clock_gettime(CLOCK_REALTIME, &ts) != -1 &&
3578 localtime_r(&ts.tv_sec, &tm) != NULL &&
3579 strftime(sec_fmt, sizeof sec_fmt, "%Y-%m-%d %H:%M:%S",
3580 &tm) != 0 &&
3581 strftime(zone_fmt, sizeof zone_fmt, "%z", &tm) != 0) {
3582 localfmt = g_strdup_printf("[%s.%02ld%s] [ID: %08ld] %s",
3583 sec_fmt,
3584 ts.tv_nsec / (10L * 1000 * 1000),
3585 zone_fmt, syscall(__NR_gettid),
3586 fmt);
3587 } else {
3588 /* fall back to a flat timestamp */
3589 localfmt = g_strdup_printf("[%" PRId64 "] [ID: %08ld] %s",
3590 get_clock(), syscall(__NR_gettid),
3591 fmt);
3594 fmt = localfmt;
3597 if (use_syslog) {
3598 int priority = LOG_ERR;
3599 switch (level) {
3600 case FUSE_LOG_EMERG:
3601 priority = LOG_EMERG;
3602 break;
3603 case FUSE_LOG_ALERT:
3604 priority = LOG_ALERT;
3605 break;
3606 case FUSE_LOG_CRIT:
3607 priority = LOG_CRIT;
3608 break;
3609 case FUSE_LOG_ERR:
3610 priority = LOG_ERR;
3611 break;
3612 case FUSE_LOG_WARNING:
3613 priority = LOG_WARNING;
3614 break;
3615 case FUSE_LOG_NOTICE:
3616 priority = LOG_NOTICE;
3617 break;
3618 case FUSE_LOG_INFO:
3619 priority = LOG_INFO;
3620 break;
3621 case FUSE_LOG_DEBUG:
3622 priority = LOG_DEBUG;
3623 break;
3625 vsyslog(priority, fmt, ap);
3626 } else {
3627 vfprintf(stderr, fmt, ap);
3631 static void setup_root(struct lo_data *lo, struct lo_inode *root)
3633 int fd, res;
3634 struct stat stat;
3635 uint64_t mnt_id;
3637 fd = open("/", O_PATH);
3638 if (fd == -1) {
3639 fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source);
3640 exit(1);
3643 res = do_statx(lo, fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
3644 &mnt_id);
3645 if (res == -1) {
3646 fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source);
3647 exit(1);
3650 root->filetype = S_IFDIR;
3651 root->fd = fd;
3652 root->key.ino = stat.st_ino;
3653 root->key.dev = stat.st_dev;
3654 root->key.mnt_id = mnt_id;
3655 root->nlookup = 2;
3656 g_atomic_int_set(&root->refcount, 2);
3657 if (lo->posix_lock) {
3658 pthread_mutex_init(&root->plock_mutex, NULL);
3659 root->posix_locks = g_hash_table_new_full(
3660 g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
3664 static guint lo_key_hash(gconstpointer key)
3666 const struct lo_key *lkey = key;
3668 return (guint)lkey->ino + (guint)lkey->dev + (guint)lkey->mnt_id;
3671 static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
3673 const struct lo_key *la = a;
3674 const struct lo_key *lb = b;
3676 return la->ino == lb->ino && la->dev == lb->dev && la->mnt_id == lb->mnt_id;
3679 static void fuse_lo_data_cleanup(struct lo_data *lo)
3681 if (lo->inodes) {
3682 g_hash_table_destroy(lo->inodes);
3685 if (lo->root.posix_locks) {
3686 g_hash_table_destroy(lo->root.posix_locks);
3688 lo_map_destroy(&lo->fd_map);
3689 lo_map_destroy(&lo->dirp_map);
3690 lo_map_destroy(&lo->ino_map);
3692 if (lo->proc_self_fd >= 0) {
3693 close(lo->proc_self_fd);
3696 if (lo->root.fd >= 0) {
3697 close(lo->root.fd);
3700 free(lo->xattrmap);
3701 free_xattrmap(lo);
3702 free(lo->xattr_security_capability);
3703 free(lo->source);
3706 static void qemu_version(void)
3708 printf("virtiofsd version " QEMU_FULL_VERSION "\n" QEMU_COPYRIGHT "\n");
3711 int main(int argc, char *argv[])
3713 struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
3714 struct fuse_session *se;
3715 struct fuse_cmdline_opts opts;
3716 struct lo_data lo = {
3717 .sandbox = SANDBOX_NAMESPACE,
3718 .debug = 0,
3719 .writeback = 0,
3720 .posix_lock = 0,
3721 .allow_direct_io = 0,
3722 .proc_self_fd = -1,
3723 .user_killpriv_v2 = -1,
3725 struct lo_map_elem *root_elem;
3726 struct lo_map_elem *reserve_elem;
3727 int ret = -1;
3729 /* Initialize time conversion information for localtime_r(). */
3730 tzset();
3732 /* Don't mask creation mode, kernel already did that */
3733 umask(0);
3735 qemu_init_exec_dir(argv[0]);
3737 pthread_mutex_init(&lo.mutex, NULL);
3738 lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
3739 lo.root.fd = -1;
3740 lo.root.fuse_ino = FUSE_ROOT_ID;
3741 lo.cache = CACHE_AUTO;
3744 * Set up the ino map like this:
3745 * [0] Reserved (will not be used)
3746 * [1] Root inode
3748 lo_map_init(&lo.ino_map);
3749 reserve_elem = lo_map_reserve(&lo.ino_map, 0);
3750 if (!reserve_elem) {
3751 fuse_log(FUSE_LOG_ERR, "failed to alloc reserve_elem.\n");
3752 goto err_out1;
3754 reserve_elem->in_use = false;
3755 root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino);
3756 if (!root_elem) {
3757 fuse_log(FUSE_LOG_ERR, "failed to alloc root_elem.\n");
3758 goto err_out1;
3760 root_elem->inode = &lo.root;
3762 lo_map_init(&lo.dirp_map);
3763 lo_map_init(&lo.fd_map);
3765 if (fuse_parse_cmdline(&args, &opts) != 0) {
3766 goto err_out1;
3768 fuse_set_log_func(log_func);
3769 use_syslog = opts.syslog;
3770 if (use_syslog) {
3771 openlog("virtiofsd", LOG_PID, LOG_DAEMON);
3774 if (opts.show_help) {
3775 printf("usage: %s [options]\n\n", argv[0]);
3776 fuse_cmdline_help();
3777 printf(" -o source=PATH shared directory tree\n");
3778 fuse_lowlevel_help();
3779 ret = 0;
3780 goto err_out1;
3781 } else if (opts.show_version) {
3782 qemu_version();
3783 fuse_lowlevel_version();
3784 ret = 0;
3785 goto err_out1;
3786 } else if (opts.print_capabilities) {
3787 print_capabilities();
3788 ret = 0;
3789 goto err_out1;
3792 if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) {
3793 goto err_out1;
3796 if (opts.log_level != 0) {
3797 current_log_level = opts.log_level;
3798 } else {
3799 /* default log level is INFO */
3800 current_log_level = FUSE_LOG_INFO;
3802 lo.debug = opts.debug;
3803 if (lo.debug) {
3804 current_log_level = FUSE_LOG_DEBUG;
3806 if (lo.source) {
3807 struct stat stat;
3808 int res;
3810 res = lstat(lo.source, &stat);
3811 if (res == -1) {
3812 fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n",
3813 lo.source);
3814 exit(1);
3816 if (!S_ISDIR(stat.st_mode)) {
3817 fuse_log(FUSE_LOG_ERR, "source is not a directory\n");
3818 exit(1);
3820 } else {
3821 lo.source = strdup("/");
3822 if (!lo.source) {
3823 fuse_log(FUSE_LOG_ERR, "failed to strdup source\n");
3824 goto err_out1;
3828 if (lo.xattrmap) {
3829 parse_xattrmap(&lo);
3832 if (!lo.timeout_set) {
3833 switch (lo.cache) {
3834 case CACHE_NONE:
3835 lo.timeout = 0.0;
3836 break;
3838 case CACHE_AUTO:
3839 lo.timeout = 1.0;
3840 break;
3842 case CACHE_ALWAYS:
3843 lo.timeout = 86400.0;
3844 break;
3846 } else if (lo.timeout < 0) {
3847 fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout);
3848 exit(1);
3851 lo.use_statx = true;
3853 se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
3854 if (se == NULL) {
3855 goto err_out1;
3858 if (fuse_set_signal_handlers(se) != 0) {
3859 goto err_out2;
3862 if (fuse_session_mount(se) != 0) {
3863 goto err_out3;
3866 fuse_daemonize(opts.foreground);
3868 setup_nofile_rlimit(opts.rlimit_nofile);
3870 /* Must be before sandbox since it wants /proc */
3871 setup_capng();
3873 setup_sandbox(&lo, se, opts.syslog);
3875 setup_root(&lo, &lo.root);
3876 /* Block until ctrl+c or fusermount -u */
3877 ret = virtio_loop(se);
3879 fuse_session_unmount(se);
3880 cleanup_capng();
3881 err_out3:
3882 fuse_remove_signal_handlers(se);
3883 err_out2:
3884 fuse_session_destroy(se);
3885 err_out1:
3886 fuse_opt_free_args(&args);
3888 fuse_lo_data_cleanup(&lo);
3890 return ret ? 1 : 0;