storage-daemon: include current command line option in the errors
[qemu/ar7.git] / tools / virtiofsd / passthrough_ll.c
blobfc7e1b1e8e2b8dcf41f2b762fb6a5058fd61dc26
/*
 * FUSE: Filesystem in Userspace
 * Copyright (C) 2001-2007  Miklos Szeredi <miklos@szeredi.hu>
 *
 * This program can be distributed under the terms of the GNU GPLv2.
 * See the file COPYING.
 */
/*
 *
 * This file system mirrors the existing file system hierarchy of the
 * system, starting at the root file system. This is implemented by
 * just "passing through" all requests to the corresponding user-space
 * libc functions. In contrast to passthrough.c and passthrough_fh.c,
 * this implementation uses the low-level API. Its performance should
 * be the least bad among the three, but many operations are not
 * implemented. In particular, it is not possible to remove files (or
 * directories) because the code necessary to defer actual removal
 * until the file is not opened anymore would make the example much
 * more complicated.
 *
 * When writeback caching is enabled (-o writeback mount option), it
 * is only possible to write to files for which the mounting user has
 * read permissions. This is because the writeback cache requires the
 * kernel to be able to issue read requests for all files (which the
 * passthrough filesystem cannot satisfy if it can't read the file in
 * the underlying filesystem).
 *
 * Compile with:
 *
 *     gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
 *     passthrough_ll
 *
 * ## Source code ##
 * \include passthrough_ll.c
 */
38 #include "qemu/osdep.h"
39 #include "qemu/timer.h"
40 #include "fuse_virtio.h"
41 #include "fuse_log.h"
42 #include "fuse_lowlevel.h"
43 #include "standard-headers/linux/fuse.h"
44 #include <cap-ng.h>
45 #include <dirent.h>
46 #include <pthread.h>
47 #include <sys/file.h>
48 #include <sys/mount.h>
49 #include <sys/prctl.h>
50 #include <sys/resource.h>
51 #include <sys/syscall.h>
52 #include <sys/wait.h>
53 #include <sys/xattr.h>
54 #include <syslog.h>
56 #include "qemu/cutils.h"
57 #include "passthrough_helpers.h"
58 #include "passthrough_seccomp.h"
/* Keep track of inode posix locks for each owner. */
struct lo_inode_plock {
    uint64_t lock_owner; /* owner these locks belong to */
    int fd; /* fd for OFD locks */
};
/*
 * One slot of a struct lo_map.  While in_use is true the union holds the
 * mapped object; otherwise `freelist` is the index of the next free slot
 * (-1 terminates the list).
 */
struct lo_map_elem {
    union {
        struct lo_inode *inode;
        struct lo_dirp *dirp;
        int fd;
        ssize_t freelist;
    };
    bool in_use;
};
/* Maps FUSE fh or ino values to internal objects */
struct lo_map {
    struct lo_map_elem *elems; /* array of nelems slots, NULL when empty */
    size_t nelems;             /* number of slots in elems */
    ssize_t freelist;          /* index of first free slot, or -1 if none */
};
/* Identity of a host inode; used as the key of the lo_data.inodes table. */
struct lo_key {
    ino_t ino;
    dev_t dev;
    uint64_t mnt_id;
};
89 struct lo_inode {
90 int fd;
93 * Atomic reference count for this object. The nlookup field holds a
94 * reference and release it when nlookup reaches 0.
96 gint refcount;
98 struct lo_key key;
101 * This counter keeps the inode alive during the FUSE session.
102 * Incremented when the FUSE inode number is sent in a reply
103 * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is
104 * released by a FUSE_FORGET request.
106 * Note that this value is untrusted because the client can manipulate
107 * it arbitrarily using FUSE_FORGET requests.
109 * Protected by lo->mutex.
111 uint64_t nlookup;
113 fuse_ino_t fuse_ino;
114 pthread_mutex_t plock_mutex;
115 GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
117 mode_t filetype;
/* Effective credentials saved by lo_change_cred() for lo_restore_cred(). */
struct lo_cred {
    uid_t euid;
    gid_t egid;
};
/* Values of lo_data.cache, selected by the cache=none|auto|always options. */
enum {
    CACHE_NONE,
    CACHE_AUTO,
    CACHE_ALWAYS,
};
/* Values of lo_data.sandbox, selected by the sandbox=namespace|chroot options. */
enum {
    SANDBOX_NAMESPACE,
    SANDBOX_CHROOT,
};
/*
 * One entry of the xattr mapping list (lo_data.xattr_map_list).
 * NOTE(review): exact meaning of key/prepend/flags is defined by the xattrmap
 * parser, which is outside this view.
 */
typedef struct xattr_map_entry {
    char *key;
    char *prepend;
    unsigned int flags;
} XattrMapEntry;
142 struct lo_data {
143 pthread_mutex_t mutex;
144 int sandbox;
145 int debug;
146 int writeback;
147 int flock;
148 int posix_lock;
149 int xattr;
150 char *xattrmap;
151 char *xattr_security_capability;
152 char *source;
153 char *modcaps;
154 double timeout;
155 int cache;
156 int timeout_set;
157 int readdirplus_set;
158 int readdirplus_clear;
159 int allow_direct_io;
160 int announce_submounts;
161 bool use_statx;
162 struct lo_inode root;
163 GHashTable *inodes; /* protected by lo->mutex */
164 struct lo_map ino_map; /* protected by lo->mutex */
165 struct lo_map dirp_map; /* protected by lo->mutex */
166 struct lo_map fd_map; /* protected by lo->mutex */
167 XattrMapEntry *xattr_map_list;
168 size_t xattr_map_nentries;
170 /* An O_PATH file descriptor to /proc/self/fd/ */
171 int proc_self_fd;
172 int user_killpriv_v2, killpriv_v2;
175 static const struct fuse_opt lo_opts[] = {
176 { "sandbox=namespace",
177 offsetof(struct lo_data, sandbox),
178 SANDBOX_NAMESPACE },
179 { "sandbox=chroot",
180 offsetof(struct lo_data, sandbox),
181 SANDBOX_CHROOT },
182 { "writeback", offsetof(struct lo_data, writeback), 1 },
183 { "no_writeback", offsetof(struct lo_data, writeback), 0 },
184 { "source=%s", offsetof(struct lo_data, source), 0 },
185 { "flock", offsetof(struct lo_data, flock), 1 },
186 { "no_flock", offsetof(struct lo_data, flock), 0 },
187 { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
188 { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
189 { "xattr", offsetof(struct lo_data, xattr), 1 },
190 { "no_xattr", offsetof(struct lo_data, xattr), 0 },
191 { "xattrmap=%s", offsetof(struct lo_data, xattrmap), 0 },
192 { "modcaps=%s", offsetof(struct lo_data, modcaps), 0 },
193 { "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
194 { "timeout=", offsetof(struct lo_data, timeout_set), 1 },
195 { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
196 { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
197 { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
198 { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
199 { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
200 { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 },
201 { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 },
202 { "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 },
203 { "killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 1 },
204 { "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 },
205 FUSE_OPT_END
static bool use_syslog = false;
static int current_log_level;
static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
                                 uint64_t n);

/* Saved libcap-ng state shared by all threads; see load_capng(). */
static struct {
    pthread_mutex_t mutex;
    void *saved;
} cap;
/* That we loaded cap-ng in the current thread from the saved */
static __thread bool cap_loaded = 0;

static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
                                uint64_t mnt_id);
static int xattr_map_client(const struct lo_data *lo, const char *client_name,
                            char **out_name);
/* Return non-zero iff `name` is exactly "." or "..". */
static int is_dot_or_dotdot(const char *name)
{
    return name[0] == '.' &&
           (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'));
}
/* Is `path` a single path component that is not "." or ".."? */
static int is_safe_path_component(const char *path)
{
    /* Any '/' means more than one component (or an absolute path). */
    if (strchr(path, '/')) {
        return 0;
    }

    return !is_dot_or_dotdot(path);
}
240 static struct lo_data *lo_data(fuse_req_t req)
242 return (struct lo_data *)fuse_req_userdata(req);
246 * Load capng's state from our saved state if the current thread
247 * hadn't previously been loaded.
248 * returns 0 on success
250 static int load_capng(void)
252 if (!cap_loaded) {
253 pthread_mutex_lock(&cap.mutex);
254 capng_restore_state(&cap.saved);
256 * restore_state free's the saved copy
257 * so make another.
259 cap.saved = capng_save_state();
260 if (!cap.saved) {
261 pthread_mutex_unlock(&cap.mutex);
262 fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
263 return -EINVAL;
265 pthread_mutex_unlock(&cap.mutex);
268 * We want to use the loaded state for our pid,
269 * not the original
271 capng_setpid(syscall(SYS_gettid));
272 cap_loaded = true;
274 return 0;
278 * Helpers for dropping and regaining effective capabilities. Returns 0
279 * on success, error otherwise
281 static int drop_effective_cap(const char *cap_name, bool *cap_dropped)
283 int cap, ret;
285 cap = capng_name_to_capability(cap_name);
286 if (cap < 0) {
287 ret = errno;
288 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
289 cap_name, strerror(errno));
290 goto out;
293 if (load_capng()) {
294 ret = errno;
295 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
296 goto out;
299 /* We dont have this capability in effective set already. */
300 if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) {
301 ret = 0;
302 goto out;
305 if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) {
306 ret = errno;
307 fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n");
308 goto out;
311 if (capng_apply(CAPNG_SELECT_CAPS)) {
312 ret = errno;
313 fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n");
314 goto out;
317 ret = 0;
318 if (cap_dropped) {
319 *cap_dropped = true;
322 out:
323 return ret;
326 static int gain_effective_cap(const char *cap_name)
328 int cap;
329 int ret = 0;
331 cap = capng_name_to_capability(cap_name);
332 if (cap < 0) {
333 ret = errno;
334 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
335 cap_name, strerror(errno));
336 goto out;
339 if (load_capng()) {
340 ret = errno;
341 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
342 goto out;
345 if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) {
346 ret = errno;
347 fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n");
348 goto out;
351 if (capng_apply(CAPNG_SELECT_CAPS)) {
352 ret = errno;
353 fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n");
354 goto out;
356 ret = 0;
358 out:
359 return ret;
363 * The host kernel normally drops security.capability xattr's on
364 * any write, however if we're remapping xattr names we need to drop
365 * whatever the clients security.capability is actually stored as.
367 static int drop_security_capability(const struct lo_data *lo, int fd)
369 if (!lo->xattr_security_capability) {
370 /* We didn't remap the name, let the host kernel do it */
371 return 0;
373 if (!fremovexattr(fd, lo->xattr_security_capability)) {
374 /* All good */
375 return 0;
378 switch (errno) {
379 case ENODATA:
380 /* Attribute didn't exist, that's fine */
381 return 0;
383 case ENOTSUP:
384 /* FS didn't support attribute anyway, also fine */
385 return 0;
387 default:
388 /* Hmm other error */
389 return errno;
393 static void lo_map_init(struct lo_map *map)
395 map->elems = NULL;
396 map->nelems = 0;
397 map->freelist = -1;
400 static void lo_map_destroy(struct lo_map *map)
402 free(map->elems);
405 static int lo_map_grow(struct lo_map *map, size_t new_nelems)
407 struct lo_map_elem *new_elems;
408 size_t i;
410 if (new_nelems <= map->nelems) {
411 return 1;
414 new_elems = realloc(map->elems, sizeof(map->elems[0]) * new_nelems);
415 if (!new_elems) {
416 return 0;
419 for (i = map->nelems; i < new_nelems; i++) {
420 new_elems[i].freelist = i + 1;
421 new_elems[i].in_use = false;
423 new_elems[new_nelems - 1].freelist = -1;
425 map->elems = new_elems;
426 map->freelist = map->nelems;
427 map->nelems = new_nelems;
428 return 1;
431 static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map)
433 struct lo_map_elem *elem;
435 if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) {
436 return NULL;
439 elem = &map->elems[map->freelist];
440 map->freelist = elem->freelist;
442 elem->in_use = true;
444 return elem;
447 static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key)
449 ssize_t *prev;
451 if (!lo_map_grow(map, key + 1)) {
452 return NULL;
455 for (prev = &map->freelist; *prev != -1;
456 prev = &map->elems[*prev].freelist) {
457 if (*prev == key) {
458 struct lo_map_elem *elem = &map->elems[key];
460 *prev = elem->freelist;
461 elem->in_use = true;
462 return elem;
465 return NULL;
468 static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key)
470 if (key >= map->nelems) {
471 return NULL;
473 if (!map->elems[key].in_use) {
474 return NULL;
476 return &map->elems[key];
479 static void lo_map_remove(struct lo_map *map, size_t key)
481 struct lo_map_elem *elem;
483 if (key >= map->nelems) {
484 return;
487 elem = &map->elems[key];
488 if (!elem->in_use) {
489 return;
492 elem->in_use = false;
494 elem->freelist = map->freelist;
495 map->freelist = key;
498 /* Assumes lo->mutex is held */
499 static ssize_t lo_add_fd_mapping(struct lo_data *lo, int fd)
501 struct lo_map_elem *elem;
503 elem = lo_map_alloc_elem(&lo->fd_map);
504 if (!elem) {
505 return -1;
508 elem->fd = fd;
509 return elem - lo->fd_map.elems;
512 /* Assumes lo->mutex is held */
513 static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp)
515 struct lo_map_elem *elem;
517 elem = lo_map_alloc_elem(&lo_data(req)->dirp_map);
518 if (!elem) {
519 return -1;
522 elem->dirp = dirp;
523 return elem - lo_data(req)->dirp_map.elems;
526 /* Assumes lo->mutex is held */
527 static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode)
529 struct lo_map_elem *elem;
531 elem = lo_map_alloc_elem(&lo_data(req)->ino_map);
532 if (!elem) {
533 return -1;
536 elem->inode = inode;
537 return elem - lo_data(req)->ino_map.elems;
540 static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
542 struct lo_inode *inode = *inodep;
544 if (!inode) {
545 return;
548 *inodep = NULL;
550 if (g_atomic_int_dec_and_test(&inode->refcount)) {
551 close(inode->fd);
552 free(inode);
556 /* Caller must release refcount using lo_inode_put() */
557 static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino)
559 struct lo_data *lo = lo_data(req);
560 struct lo_map_elem *elem;
562 pthread_mutex_lock(&lo->mutex);
563 elem = lo_map_get(&lo->ino_map, ino);
564 if (elem) {
565 g_atomic_int_inc(&elem->inode->refcount);
567 pthread_mutex_unlock(&lo->mutex);
569 if (!elem) {
570 return NULL;
573 return elem->inode;
577 * TODO Remove this helper and force callers to hold an inode refcount until
578 * they are done with the fd. This will be done in a later patch to make
579 * review easier.
581 static int lo_fd(fuse_req_t req, fuse_ino_t ino)
583 struct lo_inode *inode = lo_inode(req, ino);
584 int fd;
586 if (!inode) {
587 return -1;
590 fd = inode->fd;
591 lo_inode_put(lo_data(req), &inode);
592 return fd;
596 * Open a file descriptor for an inode. Returns -EBADF if the inode is not a
597 * regular file or a directory.
599 * Use this helper function instead of raw openat(2) to prevent security issues
600 * when a malicious client opens special files such as block device nodes.
601 * Symlink inodes are also rejected since symlinks must already have been
602 * traversed on the client side.
604 static int lo_inode_open(struct lo_data *lo, struct lo_inode *inode,
605 int open_flags)
607 g_autofree char *fd_str = g_strdup_printf("%d", inode->fd);
608 int fd;
610 if (!S_ISREG(inode->filetype) && !S_ISDIR(inode->filetype)) {
611 return -EBADF;
615 * The file is a symlink so O_NOFOLLOW must be ignored. We checked earlier
616 * that the inode is not a special file but if an external process races
617 * with us then symlinks are traversed here. It is not possible to escape
618 * the shared directory since it is mounted as "/" though.
620 fd = openat(lo->proc_self_fd, fd_str, open_flags & ~O_NOFOLLOW);
621 if (fd < 0) {
622 return -errno;
624 return fd;
627 static void lo_init(void *userdata, struct fuse_conn_info *conn)
629 struct lo_data *lo = (struct lo_data *)userdata;
631 if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) {
632 conn->want |= FUSE_CAP_EXPORT_SUPPORT;
635 if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
636 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n");
637 conn->want |= FUSE_CAP_WRITEBACK_CACHE;
639 if (conn->capable & FUSE_CAP_FLOCK_LOCKS) {
640 if (lo->flock) {
641 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
642 conn->want |= FUSE_CAP_FLOCK_LOCKS;
643 } else {
644 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n");
645 conn->want &= ~FUSE_CAP_FLOCK_LOCKS;
649 if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
650 if (lo->posix_lock) {
651 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
652 conn->want |= FUSE_CAP_POSIX_LOCKS;
653 } else {
654 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
655 conn->want &= ~FUSE_CAP_POSIX_LOCKS;
659 if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
660 lo->readdirplus_clear) {
661 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
662 conn->want &= ~FUSE_CAP_READDIRPLUS;
665 if (!(conn->capable & FUSE_CAP_SUBMOUNTS) && lo->announce_submounts) {
666 fuse_log(FUSE_LOG_WARNING, "lo_init: Cannot announce submounts, client "
667 "does not support it\n");
668 lo->announce_submounts = false;
671 if (lo->user_killpriv_v2 == 1) {
673 * User explicitly asked for this option. Enable it unconditionally.
674 * If connection does not have this capability, it should fail
675 * in fuse_lowlevel.c
677 fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n");
678 conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2;
679 lo->killpriv_v2 = 1;
680 } else if (lo->user_killpriv_v2 == -1 &&
681 conn->capable & FUSE_CAP_HANDLE_KILLPRIV_V2) {
683 * User did not specify a value for killpriv_v2. By default enable it
684 * if connection offers this capability
686 fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n");
687 conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2;
688 lo->killpriv_v2 = 1;
689 } else {
691 * Either user specified to disable killpriv_v2, or connection does
692 * not offer this capability. Disable killpriv_v2 in both the cases
694 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling killpriv_v2\n");
695 conn->want &= ~FUSE_CAP_HANDLE_KILLPRIV_V2;
696 lo->killpriv_v2 = 0;
700 static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
701 struct fuse_file_info *fi)
703 int res;
704 struct stat buf;
705 struct lo_data *lo = lo_data(req);
707 (void)fi;
709 res =
710 fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
711 if (res == -1) {
712 return (void)fuse_reply_err(req, errno);
715 fuse_reply_attr(req, &buf, lo->timeout);
718 static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi)
720 struct lo_data *lo = lo_data(req);
721 struct lo_map_elem *elem;
723 pthread_mutex_lock(&lo->mutex);
724 elem = lo_map_get(&lo->fd_map, fi->fh);
725 pthread_mutex_unlock(&lo->mutex);
727 if (!elem) {
728 return -1;
731 return elem->fd;
734 static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
735 int valid, struct fuse_file_info *fi)
737 int saverr;
738 char procname[64];
739 struct lo_data *lo = lo_data(req);
740 struct lo_inode *inode;
741 int ifd;
742 int res;
743 int fd = -1;
745 inode = lo_inode(req, ino);
746 if (!inode) {
747 fuse_reply_err(req, EBADF);
748 return;
751 ifd = inode->fd;
753 /* If fi->fh is invalid we'll report EBADF later */
754 if (fi) {
755 fd = lo_fi_fd(req, fi);
758 if (valid & FUSE_SET_ATTR_MODE) {
759 if (fi) {
760 res = fchmod(fd, attr->st_mode);
761 } else {
762 sprintf(procname, "%i", ifd);
763 res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0);
765 if (res == -1) {
766 saverr = errno;
767 goto out_err;
770 if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
771 uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1;
772 gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1;
774 saverr = drop_security_capability(lo, ifd);
775 if (saverr) {
776 goto out_err;
779 res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
780 if (res == -1) {
781 saverr = errno;
782 goto out_err;
785 if (valid & FUSE_SET_ATTR_SIZE) {
786 int truncfd;
787 bool kill_suidgid;
788 bool cap_fsetid_dropped = false;
790 kill_suidgid = lo->killpriv_v2 && (valid & FUSE_SET_ATTR_KILL_SUIDGID);
791 if (fi) {
792 truncfd = fd;
793 } else {
794 truncfd = lo_inode_open(lo, inode, O_RDWR);
795 if (truncfd < 0) {
796 saverr = -truncfd;
797 goto out_err;
801 saverr = drop_security_capability(lo, truncfd);
802 if (saverr) {
803 if (!fi) {
804 close(truncfd);
806 goto out_err;
809 if (kill_suidgid) {
810 res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
811 if (res != 0) {
812 saverr = res;
813 if (!fi) {
814 close(truncfd);
816 goto out_err;
820 res = ftruncate(truncfd, attr->st_size);
821 saverr = res == -1 ? errno : 0;
823 if (cap_fsetid_dropped) {
824 if (gain_effective_cap("FSETID")) {
825 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
828 if (!fi) {
829 close(truncfd);
831 if (res == -1) {
832 goto out_err;
835 if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) {
836 struct timespec tv[2];
838 tv[0].tv_sec = 0;
839 tv[1].tv_sec = 0;
840 tv[0].tv_nsec = UTIME_OMIT;
841 tv[1].tv_nsec = UTIME_OMIT;
843 if (valid & FUSE_SET_ATTR_ATIME_NOW) {
844 tv[0].tv_nsec = UTIME_NOW;
845 } else if (valid & FUSE_SET_ATTR_ATIME) {
846 tv[0] = attr->st_atim;
849 if (valid & FUSE_SET_ATTR_MTIME_NOW) {
850 tv[1].tv_nsec = UTIME_NOW;
851 } else if (valid & FUSE_SET_ATTR_MTIME) {
852 tv[1] = attr->st_mtim;
855 if (fi) {
856 res = futimens(fd, tv);
857 } else {
858 sprintf(procname, "%i", inode->fd);
859 res = utimensat(lo->proc_self_fd, procname, tv, 0);
861 if (res == -1) {
862 saverr = errno;
863 goto out_err;
866 lo_inode_put(lo, &inode);
868 return lo_getattr(req, ino, fi);
870 out_err:
871 lo_inode_put(lo, &inode);
872 fuse_reply_err(req, saverr);
875 static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
876 uint64_t mnt_id)
878 struct lo_inode *p;
879 struct lo_key key = {
880 .ino = st->st_ino,
881 .dev = st->st_dev,
882 .mnt_id = mnt_id,
885 pthread_mutex_lock(&lo->mutex);
886 p = g_hash_table_lookup(lo->inodes, &key);
887 if (p) {
888 assert(p->nlookup > 0);
889 p->nlookup++;
890 g_atomic_int_inc(&p->refcount);
892 pthread_mutex_unlock(&lo->mutex);
894 return p;
897 /* value_destroy_func for posix_locks GHashTable */
898 static void posix_locks_value_destroy(gpointer data)
900 struct lo_inode_plock *plock = data;
903 * We had used open() for locks and had only one fd. So
904 * closing this fd should release all OFD locks.
906 close(plock->fd);
907 free(plock);
/*
 * stat `pathname` relative to `dirfd` into *statbuf and report the mount ID
 * in *mnt_id.
 *
 * When built with statx() support and lo->use_statx is set, statx() is used
 * and its result converted to struct stat; *mnt_id is 0 if the kernel did
 * not return STATX_MNT_ID.  If statx() fails with ENOSYS, lo->use_statx is
 * cleared and fstatat() is used instead, in which case *mnt_id is always 0.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
static int do_statx(struct lo_data *lo, int dirfd, const char *pathname,
                    struct stat *statbuf, int flags, uint64_t *mnt_id)
{
    int res;

#if defined(CONFIG_STATX) && defined(STATX_MNT_ID)
    if (lo->use_statx) {
        struct statx statxbuf;

        res = statx(dirfd, pathname, flags, STATX_BASIC_STATS | STATX_MNT_ID,
                    &statxbuf);
        if (!res) {
            memset(statbuf, 0, sizeof(*statbuf));
            statbuf->st_dev = makedev(statxbuf.stx_dev_major,
                                      statxbuf.stx_dev_minor);
            statbuf->st_ino = statxbuf.stx_ino;
            statbuf->st_mode = statxbuf.stx_mode;
            statbuf->st_nlink = statxbuf.stx_nlink;
            statbuf->st_uid = statxbuf.stx_uid;
            statbuf->st_gid = statxbuf.stx_gid;
            statbuf->st_rdev = makedev(statxbuf.stx_rdev_major,
                                       statxbuf.stx_rdev_minor);
            statbuf->st_size = statxbuf.stx_size;
            statbuf->st_blksize = statxbuf.stx_blksize;
            statbuf->st_blocks = statxbuf.stx_blocks;
            statbuf->st_atim.tv_sec = statxbuf.stx_atime.tv_sec;
            statbuf->st_atim.tv_nsec = statxbuf.stx_atime.tv_nsec;
            statbuf->st_mtim.tv_sec = statxbuf.stx_mtime.tv_sec;
            statbuf->st_mtim.tv_nsec = statxbuf.stx_mtime.tv_nsec;
            statbuf->st_ctim.tv_sec = statxbuf.stx_ctime.tv_sec;
            statbuf->st_ctim.tv_nsec = statxbuf.stx_ctime.tv_nsec;

            if (statxbuf.stx_mask & STATX_MNT_ID) {
                *mnt_id = statxbuf.stx_mnt_id;
            } else {
                *mnt_id = 0;
            }
            return 0;
        } else if (errno != ENOSYS) {
            return -1;
        }
        lo->use_statx = false;
        /* fallback */
    }
#endif
    res = fstatat(dirfd, pathname, statbuf, flags);
    if (res == -1) {
        return -1;
    }
    *mnt_id = 0;

    return 0;
}
965 * Increments nlookup on the inode on success. unref_inode_lolocked() must be
966 * called eventually to decrement nlookup again. If inodep is non-NULL, the
967 * inode pointer is stored and the caller must call lo_inode_put().
969 static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
970 struct fuse_entry_param *e,
971 struct lo_inode **inodep)
973 int newfd;
974 int res;
975 int saverr;
976 uint64_t mnt_id;
977 struct lo_data *lo = lo_data(req);
978 struct lo_inode *inode = NULL;
979 struct lo_inode *dir = lo_inode(req, parent);
981 if (inodep) {
982 *inodep = NULL; /* in case there is an error */
986 * name_to_handle_at() and open_by_handle_at() can reach here with fuse
987 * mount point in guest, but we don't have its inode info in the
988 * ino_map.
990 if (!dir) {
991 return ENOENT;
994 memset(e, 0, sizeof(*e));
995 e->attr_timeout = lo->timeout;
996 e->entry_timeout = lo->timeout;
998 /* Do not allow escaping root directory */
999 if (dir == &lo->root && strcmp(name, "..") == 0) {
1000 name = ".";
1003 newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
1004 if (newfd == -1) {
1005 goto out_err;
1008 res = do_statx(lo, newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
1009 &mnt_id);
1010 if (res == -1) {
1011 goto out_err;
1014 if (S_ISDIR(e->attr.st_mode) && lo->announce_submounts &&
1015 (e->attr.st_dev != dir->key.dev || mnt_id != dir->key.mnt_id)) {
1016 e->attr_flags |= FUSE_ATTR_SUBMOUNT;
1019 inode = lo_find(lo, &e->attr, mnt_id);
1020 if (inode) {
1021 close(newfd);
1022 } else {
1023 inode = calloc(1, sizeof(struct lo_inode));
1024 if (!inode) {
1025 goto out_err;
1028 /* cache only filetype */
1029 inode->filetype = (e->attr.st_mode & S_IFMT);
1032 * One for the caller and one for nlookup (released in
1033 * unref_inode_lolocked())
1035 g_atomic_int_set(&inode->refcount, 2);
1037 inode->nlookup = 1;
1038 inode->fd = newfd;
1039 inode->key.ino = e->attr.st_ino;
1040 inode->key.dev = e->attr.st_dev;
1041 inode->key.mnt_id = mnt_id;
1042 if (lo->posix_lock) {
1043 pthread_mutex_init(&inode->plock_mutex, NULL);
1044 inode->posix_locks = g_hash_table_new_full(
1045 g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
1047 pthread_mutex_lock(&lo->mutex);
1048 inode->fuse_ino = lo_add_inode_mapping(req, inode);
1049 g_hash_table_insert(lo->inodes, &inode->key, inode);
1050 pthread_mutex_unlock(&lo->mutex);
1052 e->ino = inode->fuse_ino;
1054 /* Transfer ownership of inode pointer to caller or drop it */
1055 if (inodep) {
1056 *inodep = inode;
1057 } else {
1058 lo_inode_put(lo, &inode);
1061 lo_inode_put(lo, &dir);
1063 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1064 name, (unsigned long long)e->ino);
1066 return 0;
1068 out_err:
1069 saverr = errno;
1070 if (newfd != -1) {
1071 close(newfd);
1073 lo_inode_put(lo, &inode);
1074 lo_inode_put(lo, &dir);
1075 return saverr;
1078 static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
1080 struct fuse_entry_param e;
1081 int err;
1083 fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent,
1084 name);
1087 * Don't use is_safe_path_component(), allow "." and ".." for NFS export
1088 * support.
1090 if (strchr(name, '/')) {
1091 fuse_reply_err(req, EINVAL);
1092 return;
1095 err = lo_do_lookup(req, parent, name, &e, NULL);
1096 if (err) {
1097 fuse_reply_err(req, err);
1098 } else {
1099 fuse_reply_entry(req, &e);
/*
 * On some archs, setres*id is limited to 2^16 but they
 * provide setres*id32 variants that allow 2^32.
 * Others just let setres*id do 2^32 anyway.
 */
#ifdef SYS_setresgid32
#define OURSYS_setresgid SYS_setresgid32
#else
#define OURSYS_setresgid SYS_setresgid
#endif

#ifdef SYS_setresuid32
#define OURSYS_setresuid SYS_setresuid32
#else
#define OURSYS_setresuid SYS_setresuid
#endif
1121 * Change to uid/gid of caller so that file is created with
1122 * ownership of caller.
1123 * TODO: What about selinux context?
1125 static int lo_change_cred(fuse_req_t req, struct lo_cred *old)
1127 int res;
1129 old->euid = geteuid();
1130 old->egid = getegid();
1132 res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1);
1133 if (res == -1) {
1134 return errno;
1137 res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1);
1138 if (res == -1) {
1139 int errno_save = errno;
1141 syscall(OURSYS_setresgid, -1, old->egid, -1);
1142 return errno_save;
1145 return 0;
1148 /* Regain Privileges */
1149 static void lo_restore_cred(struct lo_cred *old)
1151 int res;
1153 res = syscall(OURSYS_setresuid, -1, old->euid, -1);
1154 if (res == -1) {
1155 fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid);
1156 exit(1);
1159 res = syscall(OURSYS_setresgid, -1, old->egid, -1);
1160 if (res == -1) {
1161 fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid);
1162 exit(1);
1166 static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
1167 const char *name, mode_t mode, dev_t rdev,
1168 const char *link)
1170 int res;
1171 int saverr;
1172 struct lo_data *lo = lo_data(req);
1173 struct lo_inode *dir;
1174 struct fuse_entry_param e;
1175 struct lo_cred old = {};
1177 if (!is_safe_path_component(name)) {
1178 fuse_reply_err(req, EINVAL);
1179 return;
1182 dir = lo_inode(req, parent);
1183 if (!dir) {
1184 fuse_reply_err(req, EBADF);
1185 return;
1188 saverr = lo_change_cred(req, &old);
1189 if (saverr) {
1190 goto out;
1193 res = mknod_wrapper(dir->fd, name, link, mode, rdev);
1195 saverr = errno;
1197 lo_restore_cred(&old);
1199 if (res == -1) {
1200 goto out;
1203 saverr = lo_do_lookup(req, parent, name, &e, NULL);
1204 if (saverr) {
1205 goto out;
1208 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1209 name, (unsigned long long)e.ino);
1211 fuse_reply_entry(req, &e);
1212 lo_inode_put(lo, &dir);
1213 return;
1215 out:
1216 lo_inode_put(lo, &dir);
1217 fuse_reply_err(req, saverr);
1220 static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
1221 mode_t mode, dev_t rdev)
1223 lo_mknod_symlink(req, parent, name, mode, rdev, NULL);
1226 static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
1227 mode_t mode)
1229 lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL);
1232 static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent,
1233 const char *name)
1235 lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link);
1238 static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
1239 const char *name)
1241 int res;
1242 struct lo_data *lo = lo_data(req);
1243 struct lo_inode *parent_inode;
1244 struct lo_inode *inode;
1245 struct fuse_entry_param e;
1246 char procname[64];
1247 int saverr;
1249 if (!is_safe_path_component(name)) {
1250 fuse_reply_err(req, EINVAL);
1251 return;
1254 parent_inode = lo_inode(req, parent);
1255 inode = lo_inode(req, ino);
1256 if (!parent_inode || !inode) {
1257 errno = EBADF;
1258 goto out_err;
1261 memset(&e, 0, sizeof(struct fuse_entry_param));
1262 e.attr_timeout = lo->timeout;
1263 e.entry_timeout = lo->timeout;
1265 sprintf(procname, "%i", inode->fd);
1266 res = linkat(lo->proc_self_fd, procname, parent_inode->fd, name,
1267 AT_SYMLINK_FOLLOW);
1268 if (res == -1) {
1269 goto out_err;
1272 res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1273 if (res == -1) {
1274 goto out_err;
1277 pthread_mutex_lock(&lo->mutex);
1278 inode->nlookup++;
1279 pthread_mutex_unlock(&lo->mutex);
1280 e.ino = inode->fuse_ino;
1282 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1283 name, (unsigned long long)e.ino);
1285 fuse_reply_entry(req, &e);
1286 lo_inode_put(lo, &parent_inode);
1287 lo_inode_put(lo, &inode);
1288 return;
1290 out_err:
1291 saverr = errno;
1292 lo_inode_put(lo, &parent_inode);
1293 lo_inode_put(lo, &inode);
1294 fuse_reply_err(req, saverr);
1297 /* Increments nlookup and caller must release refcount using lo_inode_put() */
1298 static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
1299 const char *name)
1301 int res;
1302 uint64_t mnt_id;
1303 struct stat attr;
1304 struct lo_data *lo = lo_data(req);
1305 struct lo_inode *dir = lo_inode(req, parent);
1307 if (!dir) {
1308 return NULL;
1311 res = do_statx(lo, dir->fd, name, &attr,
1312 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW, &mnt_id);
1313 lo_inode_put(lo, &dir);
1314 if (res == -1) {
1315 return NULL;
1318 return lo_find(lo, &attr, mnt_id);
1321 static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
1323 int res;
1324 struct lo_inode *inode;
1325 struct lo_data *lo = lo_data(req);
1327 if (!is_safe_path_component(name)) {
1328 fuse_reply_err(req, EINVAL);
1329 return;
1332 inode = lookup_name(req, parent, name);
1333 if (!inode) {
1334 fuse_reply_err(req, EIO);
1335 return;
1338 res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR);
1340 fuse_reply_err(req, res == -1 ? errno : 0);
1341 unref_inode_lolocked(lo, inode, 1);
1342 lo_inode_put(lo, &inode);
1345 static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
1346 fuse_ino_t newparent, const char *newname,
1347 unsigned int flags)
1349 int res;
1350 struct lo_inode *parent_inode;
1351 struct lo_inode *newparent_inode;
1352 struct lo_inode *oldinode = NULL;
1353 struct lo_inode *newinode = NULL;
1354 struct lo_data *lo = lo_data(req);
1356 if (!is_safe_path_component(name) || !is_safe_path_component(newname)) {
1357 fuse_reply_err(req, EINVAL);
1358 return;
1361 parent_inode = lo_inode(req, parent);
1362 newparent_inode = lo_inode(req, newparent);
1363 if (!parent_inode || !newparent_inode) {
1364 fuse_reply_err(req, EBADF);
1365 goto out;
1368 oldinode = lookup_name(req, parent, name);
1369 newinode = lookup_name(req, newparent, newname);
1371 if (!oldinode) {
1372 fuse_reply_err(req, EIO);
1373 goto out;
1376 if (flags) {
1377 #ifndef SYS_renameat2
1378 fuse_reply_err(req, EINVAL);
1379 #else
1380 res = syscall(SYS_renameat2, parent_inode->fd, name,
1381 newparent_inode->fd, newname, flags);
1382 if (res == -1 && errno == ENOSYS) {
1383 fuse_reply_err(req, EINVAL);
1384 } else {
1385 fuse_reply_err(req, res == -1 ? errno : 0);
1387 #endif
1388 goto out;
1391 res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);
1393 fuse_reply_err(req, res == -1 ? errno : 0);
1394 out:
1395 unref_inode_lolocked(lo, oldinode, 1);
1396 unref_inode_lolocked(lo, newinode, 1);
1397 lo_inode_put(lo, &oldinode);
1398 lo_inode_put(lo, &newinode);
1399 lo_inode_put(lo, &parent_inode);
1400 lo_inode_put(lo, &newparent_inode);
1403 static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
1405 int res;
1406 struct lo_inode *inode;
1407 struct lo_data *lo = lo_data(req);
1409 if (!is_safe_path_component(name)) {
1410 fuse_reply_err(req, EINVAL);
1411 return;
1414 inode = lookup_name(req, parent, name);
1415 if (!inode) {
1416 fuse_reply_err(req, EIO);
1417 return;
1420 res = unlinkat(lo_fd(req, parent), name, 0);
1422 fuse_reply_err(req, res == -1 ? errno : 0);
1423 unref_inode_lolocked(lo, inode, 1);
1424 lo_inode_put(lo, &inode);
1427 /* To be called with lo->mutex held */
1428 static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
1430 if (!inode) {
1431 return;
1434 assert(inode->nlookup >= n);
1435 inode->nlookup -= n;
1436 if (!inode->nlookup) {
1437 lo_map_remove(&lo->ino_map, inode->fuse_ino);
1438 g_hash_table_remove(lo->inodes, &inode->key);
1439 if (lo->posix_lock) {
1440 if (g_hash_table_size(inode->posix_locks)) {
1441 fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
1443 g_hash_table_destroy(inode->posix_locks);
1444 pthread_mutex_destroy(&inode->plock_mutex);
1446 /* Drop our refcount from lo_do_lookup() */
1447 lo_inode_put(lo, &inode);
1451 static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
1452 uint64_t n)
1454 if (!inode) {
1455 return;
1458 pthread_mutex_lock(&lo->mutex);
1459 unref_inode(lo, inode, n);
1460 pthread_mutex_unlock(&lo->mutex);
1463 static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1465 struct lo_data *lo = lo_data(req);
1466 struct lo_inode *inode;
1468 inode = lo_inode(req, ino);
1469 if (!inode) {
1470 return;
1473 fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n",
1474 (unsigned long long)ino, (unsigned long long)inode->nlookup,
1475 (unsigned long long)nlookup);
1477 unref_inode_lolocked(lo, inode, nlookup);
1478 lo_inode_put(lo, &inode);
1481 static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1483 lo_forget_one(req, ino, nlookup);
1484 fuse_reply_none(req);
1487 static void lo_forget_multi(fuse_req_t req, size_t count,
1488 struct fuse_forget_data *forgets)
1490 int i;
1492 for (i = 0; i < count; i++) {
1493 lo_forget_one(req, forgets[i].ino, forgets[i].nlookup);
1495 fuse_reply_none(req);
1498 static void lo_readlink(fuse_req_t req, fuse_ino_t ino)
1500 char buf[PATH_MAX + 1];
1501 int res;
1503 res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf));
1504 if (res == -1) {
1505 return (void)fuse_reply_err(req, errno);
1508 if (res == sizeof(buf)) {
1509 return (void)fuse_reply_err(req, ENAMETOOLONG);
1512 buf[res] = '\0';
1514 fuse_reply_readlink(req, buf);
1517 struct lo_dirp {
1518 gint refcount;
1519 DIR *dp;
1520 struct dirent *entry;
1521 off_t offset;
1524 static void lo_dirp_put(struct lo_dirp **dp)
1526 struct lo_dirp *d = *dp;
1528 if (!d) {
1529 return;
1531 *dp = NULL;
1533 if (g_atomic_int_dec_and_test(&d->refcount)) {
1534 closedir(d->dp);
1535 free(d);
1539 /* Call lo_dirp_put() on the return value when no longer needed */
1540 static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi)
1542 struct lo_data *lo = lo_data(req);
1543 struct lo_map_elem *elem;
1545 pthread_mutex_lock(&lo->mutex);
1546 elem = lo_map_get(&lo->dirp_map, fi->fh);
1547 if (elem) {
1548 g_atomic_int_inc(&elem->dirp->refcount);
1550 pthread_mutex_unlock(&lo->mutex);
1551 if (!elem) {
1552 return NULL;
1555 return elem->dirp;
1558 static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
1559 struct fuse_file_info *fi)
1561 int error = ENOMEM;
1562 struct lo_data *lo = lo_data(req);
1563 struct lo_dirp *d;
1564 int fd;
1565 ssize_t fh;
1567 d = calloc(1, sizeof(struct lo_dirp));
1568 if (d == NULL) {
1569 goto out_err;
1572 fd = openat(lo_fd(req, ino), ".", O_RDONLY);
1573 if (fd == -1) {
1574 goto out_errno;
1577 d->dp = fdopendir(fd);
1578 if (d->dp == NULL) {
1579 goto out_errno;
1582 d->offset = 0;
1583 d->entry = NULL;
1585 g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */
1586 pthread_mutex_lock(&lo->mutex);
1587 fh = lo_add_dirp_mapping(req, d);
1588 pthread_mutex_unlock(&lo->mutex);
1589 if (fh == -1) {
1590 goto out_err;
1593 fi->fh = fh;
1594 if (lo->cache == CACHE_ALWAYS) {
1595 fi->cache_readdir = 1;
1597 fuse_reply_open(req, fi);
1598 return;
1600 out_errno:
1601 error = errno;
1602 out_err:
1603 if (d) {
1604 if (d->dp) {
1605 closedir(d->dp);
1606 } else if (fd != -1) {
1607 close(fd);
1609 free(d);
1611 fuse_reply_err(req, error);
1614 static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
1615 off_t offset, struct fuse_file_info *fi, int plus)
1617 struct lo_data *lo = lo_data(req);
1618 struct lo_dirp *d = NULL;
1619 struct lo_inode *dinode;
1620 char *buf = NULL;
1621 char *p;
1622 size_t rem = size;
1623 int err = EBADF;
1625 dinode = lo_inode(req, ino);
1626 if (!dinode) {
1627 goto error;
1630 d = lo_dirp(req, fi);
1631 if (!d) {
1632 goto error;
1635 err = ENOMEM;
1636 buf = calloc(1, size);
1637 if (!buf) {
1638 goto error;
1640 p = buf;
1642 if (offset != d->offset) {
1643 seekdir(d->dp, offset);
1644 d->entry = NULL;
1645 d->offset = offset;
1647 while (1) {
1648 size_t entsize;
1649 off_t nextoff;
1650 const char *name;
1652 if (!d->entry) {
1653 errno = 0;
1654 d->entry = readdir(d->dp);
1655 if (!d->entry) {
1656 if (errno) { /* Error */
1657 err = errno;
1658 goto error;
1659 } else { /* End of stream */
1660 break;
1664 nextoff = d->entry->d_off;
1665 name = d->entry->d_name;
1667 fuse_ino_t entry_ino = 0;
1668 struct fuse_entry_param e = (struct fuse_entry_param){
1669 .attr.st_ino = d->entry->d_ino,
1670 .attr.st_mode = d->entry->d_type << 12,
1673 /* Hide root's parent directory */
1674 if (dinode == &lo->root && strcmp(name, "..") == 0) {
1675 e.attr.st_ino = lo->root.key.ino;
1676 e.attr.st_mode = DT_DIR << 12;
1679 if (plus) {
1680 if (!is_dot_or_dotdot(name)) {
1681 err = lo_do_lookup(req, ino, name, &e, NULL);
1682 if (err) {
1683 goto error;
1685 entry_ino = e.ino;
1688 entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff);
1689 } else {
1690 entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff);
1692 if (entsize > rem) {
1693 if (entry_ino != 0) {
1694 lo_forget_one(req, entry_ino, 1);
1696 break;
1699 p += entsize;
1700 rem -= entsize;
1702 d->entry = NULL;
1703 d->offset = nextoff;
1706 err = 0;
1707 error:
1708 lo_dirp_put(&d);
1709 lo_inode_put(lo, &dinode);
1712 * If there's an error, we can only signal it if we haven't stored
1713 * any entries yet - otherwise we'd end up with wrong lookup
1714 * counts for the entries that are already in the buffer. So we
1715 * return what we've collected until that point.
1717 if (err && rem == size) {
1718 fuse_reply_err(req, err);
1719 } else {
1720 fuse_reply_buf(req, buf, size - rem);
1722 free(buf);
1725 static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
1726 off_t offset, struct fuse_file_info *fi)
1728 lo_do_readdir(req, ino, size, offset, fi, 0);
1731 static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size,
1732 off_t offset, struct fuse_file_info *fi)
1734 lo_do_readdir(req, ino, size, offset, fi, 1);
1737 static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
1738 struct fuse_file_info *fi)
1740 struct lo_data *lo = lo_data(req);
1741 struct lo_map_elem *elem;
1742 struct lo_dirp *d;
1744 (void)ino;
1746 pthread_mutex_lock(&lo->mutex);
1747 elem = lo_map_get(&lo->dirp_map, fi->fh);
1748 if (!elem) {
1749 pthread_mutex_unlock(&lo->mutex);
1750 fuse_reply_err(req, EBADF);
1751 return;
1754 d = elem->dirp;
1755 lo_map_remove(&lo->dirp_map, fi->fh);
1756 pthread_mutex_unlock(&lo->mutex);
1758 lo_dirp_put(&d); /* paired with lo_opendir() */
1760 fuse_reply_err(req, 0);
1763 static void update_open_flags(int writeback, int allow_direct_io,
1764 struct fuse_file_info *fi)
1767 * With writeback cache, kernel may send read requests even
1768 * when userspace opened write-only
1770 if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) {
1771 fi->flags &= ~O_ACCMODE;
1772 fi->flags |= O_RDWR;
1776 * With writeback cache, O_APPEND is handled by the kernel.
1777 * This breaks atomicity (since the file may change in the
1778 * underlying filesystem, so that the kernel's idea of the
1779 * end of the file isn't accurate anymore). In this example,
1780 * we just accept that. A more rigorous filesystem may want
1781 * to return an error here
1783 if (writeback && (fi->flags & O_APPEND)) {
1784 fi->flags &= ~O_APPEND;
1788 * O_DIRECT in guest should not necessarily mean bypassing page
1789 * cache on host as well. Therefore, we discard it by default
1790 * ('-o no_allow_direct_io'). If somebody needs that behavior,
1791 * the '-o allow_direct_io' option should be set.
1793 if (!allow_direct_io) {
1794 fi->flags &= ~O_DIRECT;
1799 * Open a regular file, set up an fd mapping, and fill out the struct
1800 * fuse_file_info for it. If existing_fd is not negative, use that fd instead
1801 * opening a new one. Takes ownership of existing_fd.
1803 * Returns 0 on success or a positive errno.
1805 static int lo_do_open(struct lo_data *lo, struct lo_inode *inode,
1806 int existing_fd, struct fuse_file_info *fi)
1808 ssize_t fh;
1809 int fd = existing_fd;
1810 int err;
1811 bool cap_fsetid_dropped = false;
1812 bool kill_suidgid = lo->killpriv_v2 && fi->kill_priv;
1814 update_open_flags(lo->writeback, lo->allow_direct_io, fi);
1816 if (fd < 0) {
1817 if (kill_suidgid) {
1818 err = drop_effective_cap("FSETID", &cap_fsetid_dropped);
1819 if (err) {
1820 return err;
1824 fd = lo_inode_open(lo, inode, fi->flags);
1826 if (cap_fsetid_dropped) {
1827 if (gain_effective_cap("FSETID")) {
1828 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
1831 if (fd < 0) {
1832 return -fd;
1834 if (fi->flags & (O_TRUNC)) {
1835 int err = drop_security_capability(lo, fd);
1836 if (err) {
1837 close(fd);
1838 return err;
1843 pthread_mutex_lock(&lo->mutex);
1844 fh = lo_add_fd_mapping(lo, fd);
1845 pthread_mutex_unlock(&lo->mutex);
1846 if (fh == -1) {
1847 close(fd);
1848 return ENOMEM;
1851 fi->fh = fh;
1852 if (lo->cache == CACHE_NONE) {
1853 fi->direct_io = 1;
1854 } else if (lo->cache == CACHE_ALWAYS) {
1855 fi->keep_cache = 1;
1857 return 0;
1860 static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
1861 mode_t mode, struct fuse_file_info *fi)
1863 int fd = -1;
1864 struct lo_data *lo = lo_data(req);
1865 struct lo_inode *parent_inode;
1866 struct lo_inode *inode = NULL;
1867 struct fuse_entry_param e;
1868 int err;
1869 struct lo_cred old = {};
1871 fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)"
1872 " kill_priv=%d\n", parent, name, fi->kill_priv);
1874 if (!is_safe_path_component(name)) {
1875 fuse_reply_err(req, EINVAL);
1876 return;
1879 parent_inode = lo_inode(req, parent);
1880 if (!parent_inode) {
1881 fuse_reply_err(req, EBADF);
1882 return;
1885 err = lo_change_cred(req, &old);
1886 if (err) {
1887 goto out;
1890 update_open_flags(lo->writeback, lo->allow_direct_io, fi);
1892 /* Try to create a new file but don't open existing files */
1893 fd = openat(parent_inode->fd, name, fi->flags | O_CREAT | O_EXCL, mode);
1894 err = fd == -1 ? errno : 0;
1896 lo_restore_cred(&old);
1898 /* Ignore the error if file exists and O_EXCL was not given */
1899 if (err && (err != EEXIST || (fi->flags & O_EXCL))) {
1900 goto out;
1903 err = lo_do_lookup(req, parent, name, &e, &inode);
1904 if (err) {
1905 goto out;
1908 err = lo_do_open(lo, inode, fd, fi);
1909 fd = -1; /* lo_do_open() takes ownership of fd */
1910 if (err) {
1911 /* Undo lo_do_lookup() nlookup ref */
1912 unref_inode_lolocked(lo, inode, 1);
1915 out:
1916 lo_inode_put(lo, &inode);
1917 lo_inode_put(lo, &parent_inode);
1919 if (err) {
1920 if (fd >= 0) {
1921 close(fd);
1924 fuse_reply_err(req, err);
1925 } else {
1926 fuse_reply_create(req, &e, fi);
1930 /* Should be called with inode->plock_mutex held */
1931 static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
1932 struct lo_inode *inode,
1933 uint64_t lock_owner,
1934 pid_t pid, int *err)
1936 struct lo_inode_plock *plock;
1937 int fd;
1939 plock =
1940 g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner));
1942 if (plock) {
1943 return plock;
1946 plock = malloc(sizeof(struct lo_inode_plock));
1947 if (!plock) {
1948 *err = ENOMEM;
1949 return NULL;
1952 /* Open another instance of file which can be used for ofd locks. */
1953 /* TODO: What if file is not writable? */
1954 fd = lo_inode_open(lo, inode, O_RDWR);
1955 if (fd < 0) {
1956 *err = -fd;
1957 free(plock);
1958 return NULL;
1961 plock->lock_owner = lock_owner;
1962 plock->fd = fd;
1963 g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner),
1964 plock);
1965 return plock;
1968 static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
1969 struct flock *lock)
1971 struct lo_data *lo = lo_data(req);
1972 struct lo_inode *inode;
1973 struct lo_inode_plock *plock;
1974 int ret, saverr = 0;
1976 fuse_log(FUSE_LOG_DEBUG,
1977 "lo_getlk(ino=%" PRIu64 ", flags=%d)"
1978 " owner=0x%lx, l_type=%d l_start=0x%lx"
1979 " l_len=0x%lx\n",
1980 ino, fi->flags, fi->lock_owner, lock->l_type, lock->l_start,
1981 lock->l_len);
1983 if (!lo->posix_lock) {
1984 fuse_reply_err(req, ENOSYS);
1985 return;
1988 inode = lo_inode(req, ino);
1989 if (!inode) {
1990 fuse_reply_err(req, EBADF);
1991 return;
1994 pthread_mutex_lock(&inode->plock_mutex);
1995 plock =
1996 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
1997 if (!plock) {
1998 saverr = ret;
1999 goto out;
2002 ret = fcntl(plock->fd, F_OFD_GETLK, lock);
2003 if (ret == -1) {
2004 saverr = errno;
2007 out:
2008 pthread_mutex_unlock(&inode->plock_mutex);
2009 lo_inode_put(lo, &inode);
2011 if (saverr) {
2012 fuse_reply_err(req, saverr);
2013 } else {
2014 fuse_reply_lock(req, lock);
2018 static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
2019 struct flock *lock, int sleep)
2021 struct lo_data *lo = lo_data(req);
2022 struct lo_inode *inode;
2023 struct lo_inode_plock *plock;
2024 int ret, saverr = 0;
2026 fuse_log(FUSE_LOG_DEBUG,
2027 "lo_setlk(ino=%" PRIu64 ", flags=%d)"
2028 " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d"
2029 " l_start=0x%lx l_len=0x%lx\n",
2030 ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
2031 lock->l_whence, lock->l_start, lock->l_len);
2033 if (!lo->posix_lock) {
2034 fuse_reply_err(req, ENOSYS);
2035 return;
2038 if (sleep) {
2039 fuse_reply_err(req, EOPNOTSUPP);
2040 return;
2043 inode = lo_inode(req, ino);
2044 if (!inode) {
2045 fuse_reply_err(req, EBADF);
2046 return;
2049 pthread_mutex_lock(&inode->plock_mutex);
2050 plock =
2051 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
2053 if (!plock) {
2054 saverr = ret;
2055 goto out;
2058 /* TODO: Is it alright to modify flock? */
2059 lock->l_pid = 0;
2060 ret = fcntl(plock->fd, F_OFD_SETLK, lock);
2061 if (ret == -1) {
2062 saverr = errno;
2065 out:
2066 pthread_mutex_unlock(&inode->plock_mutex);
2067 lo_inode_put(lo, &inode);
2069 fuse_reply_err(req, saverr);
2072 static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
2073 struct fuse_file_info *fi)
2075 int res;
2076 struct lo_dirp *d;
2077 int fd;
2079 (void)ino;
2081 d = lo_dirp(req, fi);
2082 if (!d) {
2083 fuse_reply_err(req, EBADF);
2084 return;
2087 fd = dirfd(d->dp);
2088 if (datasync) {
2089 res = fdatasync(fd);
2090 } else {
2091 res = fsync(fd);
2094 lo_dirp_put(&d);
2096 fuse_reply_err(req, res == -1 ? errno : 0);
2099 static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
2101 struct lo_data *lo = lo_data(req);
2102 struct lo_inode *inode = lo_inode(req, ino);
2103 int err;
2105 fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d, kill_priv=%d)"
2106 "\n", ino, fi->flags, fi->kill_priv);
2108 if (!inode) {
2109 fuse_reply_err(req, EBADF);
2110 return;
2113 err = lo_do_open(lo, inode, -1, fi);
2114 lo_inode_put(lo, &inode);
2115 if (err) {
2116 fuse_reply_err(req, err);
2117 } else {
2118 fuse_reply_open(req, fi);
2122 static void lo_release(fuse_req_t req, fuse_ino_t ino,
2123 struct fuse_file_info *fi)
2125 struct lo_data *lo = lo_data(req);
2126 struct lo_map_elem *elem;
2127 int fd = -1;
2129 (void)ino;
2131 pthread_mutex_lock(&lo->mutex);
2132 elem = lo_map_get(&lo->fd_map, fi->fh);
2133 if (elem) {
2134 fd = elem->fd;
2135 elem = NULL;
2136 lo_map_remove(&lo->fd_map, fi->fh);
2138 pthread_mutex_unlock(&lo->mutex);
2140 close(fd);
2141 fuse_reply_err(req, 0);
2144 static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
2146 int res;
2147 (void)ino;
2148 struct lo_inode *inode;
2149 struct lo_data *lo = lo_data(req);
2151 inode = lo_inode(req, ino);
2152 if (!inode) {
2153 fuse_reply_err(req, EBADF);
2154 return;
2157 if (!S_ISREG(inode->filetype)) {
2158 lo_inode_put(lo, &inode);
2159 fuse_reply_err(req, EBADF);
2160 return;
2163 /* An fd is going away. Cleanup associated posix locks */
2164 if (lo->posix_lock) {
2165 pthread_mutex_lock(&inode->plock_mutex);
2166 g_hash_table_remove(inode->posix_locks,
2167 GUINT_TO_POINTER(fi->lock_owner));
2168 pthread_mutex_unlock(&inode->plock_mutex);
2170 res = close(dup(lo_fi_fd(req, fi)));
2171 lo_inode_put(lo, &inode);
2172 fuse_reply_err(req, res == -1 ? errno : 0);
2175 static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
2176 struct fuse_file_info *fi)
2178 struct lo_inode *inode = lo_inode(req, ino);
2179 struct lo_data *lo = lo_data(req);
2180 int res;
2181 int fd;
2183 fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino,
2184 (void *)fi);
2186 if (!inode) {
2187 fuse_reply_err(req, EBADF);
2188 return;
2191 if (!fi) {
2192 fd = lo_inode_open(lo, inode, O_RDWR);
2193 if (fd < 0) {
2194 res = -fd;
2195 goto out;
2197 } else {
2198 fd = lo_fi_fd(req, fi);
2201 if (datasync) {
2202 res = fdatasync(fd) == -1 ? errno : 0;
2203 } else {
2204 res = fsync(fd) == -1 ? errno : 0;
2206 if (!fi) {
2207 close(fd);
2209 out:
2210 lo_inode_put(lo, &inode);
2211 fuse_reply_err(req, res);
2214 static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset,
2215 struct fuse_file_info *fi)
2217 struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size);
2219 fuse_log(FUSE_LOG_DEBUG,
2220 "lo_read(ino=%" PRIu64 ", size=%zd, "
2221 "off=%lu)\n",
2222 ino, size, (unsigned long)offset);
2224 buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
2225 buf.buf[0].fd = lo_fi_fd(req, fi);
2226 buf.buf[0].pos = offset;
2228 fuse_reply_data(req, &buf);
2231 static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
2232 struct fuse_bufvec *in_buf, off_t off,
2233 struct fuse_file_info *fi)
2235 (void)ino;
2236 ssize_t res;
2237 struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf));
2238 bool cap_fsetid_dropped = false;
2240 out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
2241 out_buf.buf[0].fd = lo_fi_fd(req, fi);
2242 out_buf.buf[0].pos = off;
2244 fuse_log(FUSE_LOG_DEBUG,
2245 "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu kill_priv=%d)\n",
2246 ino, out_buf.buf[0].size, (unsigned long)off, fi->kill_priv);
2248 res = drop_security_capability(lo_data(req), out_buf.buf[0].fd);
2249 if (res) {
2250 fuse_reply_err(req, res);
2251 return;
2255 * If kill_priv is set, drop CAP_FSETID which should lead to kernel
2256 * clearing setuid/setgid on file. Note, for WRITE, we need to do
2257 * this even if killpriv_v2 is not enabled. fuse direct write path
2258 * relies on this.
2260 if (fi->kill_priv) {
2261 res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
2262 if (res != 0) {
2263 fuse_reply_err(req, res);
2264 return;
2268 res = fuse_buf_copy(&out_buf, in_buf);
2269 if (res < 0) {
2270 fuse_reply_err(req, -res);
2271 } else {
2272 fuse_reply_write(req, (size_t)res);
2275 if (cap_fsetid_dropped) {
2276 res = gain_effective_cap("FSETID");
2277 if (res) {
2278 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
2283 static void lo_statfs(fuse_req_t req, fuse_ino_t ino)
2285 int res;
2286 struct statvfs stbuf;
2288 res = fstatvfs(lo_fd(req, ino), &stbuf);
2289 if (res == -1) {
2290 fuse_reply_err(req, errno);
2291 } else {
2292 fuse_reply_statfs(req, &stbuf);
2296 static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
2297 off_t length, struct fuse_file_info *fi)
2299 int err = EOPNOTSUPP;
2300 (void)ino;
2302 #ifdef CONFIG_FALLOCATE
2303 err = fallocate(lo_fi_fd(req, fi), mode, offset, length);
2304 if (err < 0) {
2305 err = errno;
2308 #elif defined(CONFIG_POSIX_FALLOCATE)
2309 if (mode) {
2310 fuse_reply_err(req, EOPNOTSUPP);
2311 return;
2314 err = posix_fallocate(lo_fi_fd(req, fi), offset, length);
2315 #endif
2317 fuse_reply_err(req, err);
2320 static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
2321 int op)
2323 int res;
2324 (void)ino;
2326 res = flock(lo_fi_fd(req, fi), op);
2328 fuse_reply_err(req, res == -1 ? errno : 0);
/* types */

/*
 * Exit; process attribute unmodified if matched.
 * An empty key applies to all.
 */
#define XATTR_MAP_FLAG_OK      (1 <<  0)

/*
 * The attribute is unwanted;
 * EPERM on write, hidden on read.
 */
#define XATTR_MAP_FLAG_BAD     (1 <<  1)

/*
 * For attr that start with 'key' prepend 'prepend'
 * 'key' may be empty to prepend for all attrs
 * key is defined from set/remove point of view.
 * Automatically reversed on read
 */
#define XATTR_MAP_FLAG_PREFIX  (1 <<  2)

/* scopes */

/* Apply rule to get/set/remove */
#define XATTR_MAP_FLAG_CLIENT  (1 << 16)

/* Apply rule to list */
#define XATTR_MAP_FLAG_SERVER  (1 << 17)

/* Apply rule to all */
#define XATTR_MAP_FLAG_ALL     (XATTR_MAP_FLAG_CLIENT | XATTR_MAP_FLAG_SERVER)
2358 static void add_xattrmap_entry(struct lo_data *lo,
2359 const XattrMapEntry *new_entry)
2361 XattrMapEntry *res = g_realloc_n(lo->xattr_map_list,
2362 lo->xattr_map_nentries + 1,
2363 sizeof(XattrMapEntry));
2364 res[lo->xattr_map_nentries++] = *new_entry;
2366 lo->xattr_map_list = res;
2369 static void free_xattrmap(struct lo_data *lo)
2371 XattrMapEntry *map = lo->xattr_map_list;
2372 size_t i;
2374 if (!map) {
2375 return;
2378 for (i = 0; i < lo->xattr_map_nentries; i++) {
2379 g_free(map[i].key);
2380 g_free(map[i].prepend);
2383 g_free(map);
2384 lo->xattr_map_list = NULL;
2385 lo->xattr_map_nentries = -1;
2389 * Handle the 'map' type, which is sugar for a set of commands
2390 * for the common case of prefixing a subset or everything,
2391 * and allowing anything not prefixed through.
2392 * It must be the last entry in the stream, although there
2393 * can be other entries before it.
2394 * The form is:
2395 * :map:key:prefix:
2397 * key maybe empty in which case all entries are prefixed.
2399 static void parse_xattrmap_map(struct lo_data *lo,
2400 const char *rule, char sep)
2402 const char *tmp;
2403 char *key;
2404 char *prefix;
2405 XattrMapEntry tmp_entry;
2407 if (*rule != sep) {
2408 fuse_log(FUSE_LOG_ERR,
2409 "%s: Expecting '%c' after 'map' keyword, found '%c'\n",
2410 __func__, sep, *rule);
2411 exit(1);
2414 rule++;
2416 /* At start of 'key' field */
2417 tmp = strchr(rule, sep);
2418 if (!tmp) {
2419 fuse_log(FUSE_LOG_ERR,
2420 "%s: Missing '%c' at end of key field in map rule\n",
2421 __func__, sep);
2422 exit(1);
2425 key = g_strndup(rule, tmp - rule);
2426 rule = tmp + 1;
2428 /* At start of prefix field */
2429 tmp = strchr(rule, sep);
2430 if (!tmp) {
2431 fuse_log(FUSE_LOG_ERR,
2432 "%s: Missing '%c' at end of prefix field in map rule\n",
2433 __func__, sep);
2434 exit(1);
2437 prefix = g_strndup(rule, tmp - rule);
2438 rule = tmp + 1;
2441 * This should be the end of the string, we don't allow
2442 * any more commands after 'map'.
2444 if (*rule) {
2445 fuse_log(FUSE_LOG_ERR,
2446 "%s: Expecting end of command after map, found '%c'\n",
2447 __func__, *rule);
2448 exit(1);
2451 /* 1st: Prefix matches/everything */
2452 tmp_entry.flags = XATTR_MAP_FLAG_PREFIX | XATTR_MAP_FLAG_ALL;
2453 tmp_entry.key = g_strdup(key);
2454 tmp_entry.prepend = g_strdup(prefix);
2455 add_xattrmap_entry(lo, &tmp_entry);
2457 if (!*key) {
2458 /* Prefix all case */
2460 /* 2nd: Hide any non-prefixed entries on the host */
2461 tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_ALL;
2462 tmp_entry.key = g_strdup("");
2463 tmp_entry.prepend = g_strdup("");
2464 add_xattrmap_entry(lo, &tmp_entry);
2465 } else {
2466 /* Prefix matching case */
2468 /* 2nd: Hide non-prefixed but matching entries on the host */
2469 tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_SERVER;
2470 tmp_entry.key = g_strdup(""); /* Not used */
2471 tmp_entry.prepend = g_strdup(key);
2472 add_xattrmap_entry(lo, &tmp_entry);
2474 /* 3rd: Stop the client accessing prefixed attributes directly */
2475 tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_CLIENT;
2476 tmp_entry.key = g_strdup(prefix);
2477 tmp_entry.prepend = g_strdup(""); /* Not used */
2478 add_xattrmap_entry(lo, &tmp_entry);
2480 /* 4th: Everything else is OK */
2481 tmp_entry.flags = XATTR_MAP_FLAG_OK | XATTR_MAP_FLAG_ALL;
2482 tmp_entry.key = g_strdup("");
2483 tmp_entry.prepend = g_strdup("");
2484 add_xattrmap_entry(lo, &tmp_entry);
2487 g_free(key);
2488 g_free(prefix);
2491 static void parse_xattrmap(struct lo_data *lo)
2493 const char *map = lo->xattrmap;
2494 const char *tmp;
2495 int ret;
2497 lo->xattr_map_nentries = 0;
2498 while (*map) {
2499 XattrMapEntry tmp_entry;
2500 char sep;
2502 if (isspace(*map)) {
2503 map++;
2504 continue;
2506 /* The separator is the first non-space of the rule */
2507 sep = *map++;
2508 if (!sep) {
2509 break;
2512 tmp_entry.flags = 0;
2513 /* Start of 'type' */
2514 if (strstart(map, "prefix", &map)) {
2515 tmp_entry.flags |= XATTR_MAP_FLAG_PREFIX;
2516 } else if (strstart(map, "ok", &map)) {
2517 tmp_entry.flags |= XATTR_MAP_FLAG_OK;
2518 } else if (strstart(map, "bad", &map)) {
2519 tmp_entry.flags |= XATTR_MAP_FLAG_BAD;
2520 } else if (strstart(map, "map", &map)) {
2522 * map is sugar that adds a number of rules, and must be
2523 * the last entry.
2525 parse_xattrmap_map(lo, map, sep);
2526 break;
2527 } else {
2528 fuse_log(FUSE_LOG_ERR,
2529 "%s: Unexpected type;"
2530 "Expecting 'prefix', 'ok', 'bad' or 'map' in rule %zu\n",
2531 __func__, lo->xattr_map_nentries);
2532 exit(1);
2535 if (*map++ != sep) {
2536 fuse_log(FUSE_LOG_ERR,
2537 "%s: Missing '%c' at end of type field of rule %zu\n",
2538 __func__, sep, lo->xattr_map_nentries);
2539 exit(1);
2542 /* Start of 'scope' */
2543 if (strstart(map, "client", &map)) {
2544 tmp_entry.flags |= XATTR_MAP_FLAG_CLIENT;
2545 } else if (strstart(map, "server", &map)) {
2546 tmp_entry.flags |= XATTR_MAP_FLAG_SERVER;
2547 } else if (strstart(map, "all", &map)) {
2548 tmp_entry.flags |= XATTR_MAP_FLAG_ALL;
2549 } else {
2550 fuse_log(FUSE_LOG_ERR,
2551 "%s: Unexpected scope;"
2552 " Expecting 'client', 'server', or 'all', in rule %zu\n",
2553 __func__, lo->xattr_map_nentries);
2554 exit(1);
2557 if (*map++ != sep) {
2558 fuse_log(FUSE_LOG_ERR,
2559 "%s: Expecting '%c' found '%c'"
2560 " after scope in rule %zu\n",
2561 __func__, sep, *map, lo->xattr_map_nentries);
2562 exit(1);
2565 /* At start of 'key' field */
2566 tmp = strchr(map, sep);
2567 if (!tmp) {
2568 fuse_log(FUSE_LOG_ERR,
2569 "%s: Missing '%c' at end of key field of rule %zu",
2570 __func__, sep, lo->xattr_map_nentries);
2571 exit(1);
2573 tmp_entry.key = g_strndup(map, tmp - map);
2574 map = tmp + 1;
2576 /* At start of 'prepend' field */
2577 tmp = strchr(map, sep);
2578 if (!tmp) {
2579 fuse_log(FUSE_LOG_ERR,
2580 "%s: Missing '%c' at end of prepend field of rule %zu",
2581 __func__, sep, lo->xattr_map_nentries);
2582 exit(1);
2584 tmp_entry.prepend = g_strndup(map, tmp - map);
2585 map = tmp + 1;
2587 add_xattrmap_entry(lo, &tmp_entry);
2588 /* End of rule - go around again for another rule */
2591 if (!lo->xattr_map_nentries) {
2592 fuse_log(FUSE_LOG_ERR, "Empty xattr map\n");
2593 exit(1);
2596 ret = xattr_map_client(lo, "security.capability",
2597 &lo->xattr_security_capability);
2598 if (ret) {
2599 fuse_log(FUSE_LOG_ERR, "Failed to map security.capability: %s\n",
2600 strerror(ret));
2601 exit(1);
2603 if (!strcmp(lo->xattr_security_capability, "security.capability")) {
2604 /* 1-1 mapping, don't need to do anything */
2605 free(lo->xattr_security_capability);
2606 lo->xattr_security_capability = NULL;
2611 * For use with getxattr/setxattr/removexattr, where the client
2612 * gives us a name and we may need to choose a different one.
2613 * Allocates a buffer for the result placing it in *out_name.
2614 * If there's no change then *out_name is not set.
2615 * Returns 0 on success
2616 * Can return -EPERM to indicate we block a given attribute
2617 * (in which case out_name is not allocated)
2618 * Can return -ENOMEM to indicate out_name couldn't be allocated.
2620 static int xattr_map_client(const struct lo_data *lo, const char *client_name,
2621 char **out_name)
2623 size_t i;
2624 for (i = 0; i < lo->xattr_map_nentries; i++) {
2625 const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
2627 if ((cur_entry->flags & XATTR_MAP_FLAG_CLIENT) &&
2628 (strstart(client_name, cur_entry->key, NULL))) {
2629 if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
2630 return -EPERM;
2632 if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
2633 /* Unmodified name */
2634 return 0;
2636 if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
2637 *out_name = g_try_malloc(strlen(client_name) +
2638 strlen(cur_entry->prepend) + 1);
2639 if (!*out_name) {
2640 return -ENOMEM;
2642 sprintf(*out_name, "%s%s", cur_entry->prepend, client_name);
2643 return 0;
2648 return -EPERM;
2652 * For use with listxattr where the server fs gives us a name and we may need
2653 * to sanitize this for the client.
2654 * Returns a pointer to the result in *out_name
2655 * This is always the original string or the current string with some prefix
2656 * removed; no reallocation is done.
2657 * Returns 0 on success
2658 * Can return -ENODATA to indicate the name should be dropped from the list.
2660 static int xattr_map_server(const struct lo_data *lo, const char *server_name,
2661 const char **out_name)
2663 size_t i;
2664 const char *end;
2666 for (i = 0; i < lo->xattr_map_nentries; i++) {
2667 const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
2669 if ((cur_entry->flags & XATTR_MAP_FLAG_SERVER) &&
2670 (strstart(server_name, cur_entry->prepend, &end))) {
2671 if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
2672 return -ENODATA;
2674 if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
2675 *out_name = server_name;
2676 return 0;
2678 if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
2679 /* Remove prefix */
2680 *out_name = end;
2681 return 0;
2686 return -ENODATA;
2689 static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
2690 size_t size)
2692 struct lo_data *lo = lo_data(req);
2693 char *value = NULL;
2694 char procname[64];
2695 const char *name;
2696 char *mapped_name;
2697 struct lo_inode *inode;
2698 ssize_t ret;
2699 int saverr;
2700 int fd = -1;
2702 mapped_name = NULL;
2703 name = in_name;
2704 if (lo->xattrmap) {
2705 ret = xattr_map_client(lo, in_name, &mapped_name);
2706 if (ret < 0) {
2707 if (ret == -EPERM) {
2708 ret = -ENODATA;
2710 fuse_reply_err(req, -ret);
2711 return;
2713 if (mapped_name) {
2714 name = mapped_name;
2718 inode = lo_inode(req, ino);
2719 if (!inode) {
2720 fuse_reply_err(req, EBADF);
2721 g_free(mapped_name);
2722 return;
2725 saverr = ENOSYS;
2726 if (!lo_data(req)->xattr) {
2727 goto out;
2730 fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n",
2731 ino, name, size);
2733 if (size) {
2734 value = malloc(size);
2735 if (!value) {
2736 goto out_err;
2740 sprintf(procname, "%i", inode->fd);
2742 * It is not safe to open() non-regular/non-dir files in file server
2743 * unless O_PATH is used, so use that method for regular files/dir
2744 * only (as it seems giving less performance overhead).
2745 * Otherwise, call fchdir() to avoid open().
2747 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2748 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2749 if (fd < 0) {
2750 goto out_err;
2752 ret = fgetxattr(fd, name, value, size);
2753 } else {
2754 /* fchdir should not fail here */
2755 assert(fchdir(lo->proc_self_fd) == 0);
2756 ret = getxattr(procname, name, value, size);
2757 assert(fchdir(lo->root.fd) == 0);
2760 if (ret == -1) {
2761 goto out_err;
2763 if (size) {
2764 saverr = 0;
2765 if (ret == 0) {
2766 goto out;
2768 fuse_reply_buf(req, value, ret);
2769 } else {
2770 fuse_reply_xattr(req, ret);
2772 out_free:
2773 free(value);
2775 if (fd >= 0) {
2776 close(fd);
2779 lo_inode_put(lo, &inode);
2780 return;
2782 out_err:
2783 saverr = errno;
2784 out:
2785 fuse_reply_err(req, saverr);
2786 g_free(mapped_name);
2787 goto out_free;
2790 static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
2792 struct lo_data *lo = lo_data(req);
2793 char *value = NULL;
2794 char procname[64];
2795 struct lo_inode *inode;
2796 ssize_t ret;
2797 int saverr;
2798 int fd = -1;
2800 inode = lo_inode(req, ino);
2801 if (!inode) {
2802 fuse_reply_err(req, EBADF);
2803 return;
2806 saverr = ENOSYS;
2807 if (!lo_data(req)->xattr) {
2808 goto out;
2811 fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino,
2812 size);
2814 if (size) {
2815 value = malloc(size);
2816 if (!value) {
2817 goto out_err;
2821 sprintf(procname, "%i", inode->fd);
2822 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2823 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2824 if (fd < 0) {
2825 goto out_err;
2827 ret = flistxattr(fd, value, size);
2828 } else {
2829 /* fchdir should not fail here */
2830 assert(fchdir(lo->proc_self_fd) == 0);
2831 ret = listxattr(procname, value, size);
2832 assert(fchdir(lo->root.fd) == 0);
2835 if (ret == -1) {
2836 goto out_err;
2838 if (size) {
2839 saverr = 0;
2840 if (ret == 0) {
2841 goto out;
2844 if (lo->xattr_map_list) {
2846 * Map the names back, some attributes might be dropped,
2847 * some shortened, but not increased, so we shouldn't
2848 * run out of room.
2850 size_t out_index, in_index;
2851 out_index = 0;
2852 in_index = 0;
2853 while (in_index < ret) {
2854 const char *map_out;
2855 char *in_ptr = value + in_index;
2856 /* Length of current attribute name */
2857 size_t in_len = strlen(value + in_index) + 1;
2859 int mapret = xattr_map_server(lo, in_ptr, &map_out);
2860 if (mapret != -ENODATA && mapret != 0) {
2861 /* Shouldn't happen */
2862 saverr = -mapret;
2863 goto out;
2865 if (mapret == 0) {
2866 /* Either unchanged, or truncated */
2867 size_t out_len;
2868 if (map_out != in_ptr) {
2869 /* +1 copies the NIL */
2870 out_len = strlen(map_out) + 1;
2871 } else {
2872 /* No change */
2873 out_len = in_len;
2876 * Move result along, may still be needed for an unchanged
2877 * entry if a previous entry was changed.
2879 memmove(value + out_index, map_out, out_len);
2881 out_index += out_len;
2883 in_index += in_len;
2885 ret = out_index;
2886 if (ret == 0) {
2887 goto out;
2890 fuse_reply_buf(req, value, ret);
2891 } else {
2893 * xattrmap only ever shortens the result,
2894 * so we don't need to do anything clever with the
2895 * allocation length here.
2897 fuse_reply_xattr(req, ret);
2899 out_free:
2900 free(value);
2902 if (fd >= 0) {
2903 close(fd);
2906 lo_inode_put(lo, &inode);
2907 return;
2909 out_err:
2910 saverr = errno;
2911 out:
2912 fuse_reply_err(req, saverr);
2913 goto out_free;
2916 static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
2917 const char *value, size_t size, int flags)
2919 char procname[64];
2920 const char *name;
2921 char *mapped_name;
2922 struct lo_data *lo = lo_data(req);
2923 struct lo_inode *inode;
2924 ssize_t ret;
2925 int saverr;
2926 int fd = -1;
2928 mapped_name = NULL;
2929 name = in_name;
2930 if (lo->xattrmap) {
2931 ret = xattr_map_client(lo, in_name, &mapped_name);
2932 if (ret < 0) {
2933 fuse_reply_err(req, -ret);
2934 return;
2936 if (mapped_name) {
2937 name = mapped_name;
2941 inode = lo_inode(req, ino);
2942 if (!inode) {
2943 fuse_reply_err(req, EBADF);
2944 g_free(mapped_name);
2945 return;
2948 saverr = ENOSYS;
2949 if (!lo_data(req)->xattr) {
2950 goto out;
2953 fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64
2954 ", name=%s value=%s size=%zd)\n", ino, name, value, size);
2956 sprintf(procname, "%i", inode->fd);
2957 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2958 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2959 if (fd < 0) {
2960 saverr = errno;
2961 goto out;
2963 ret = fsetxattr(fd, name, value, size, flags);
2964 } else {
2965 /* fchdir should not fail here */
2966 assert(fchdir(lo->proc_self_fd) == 0);
2967 ret = setxattr(procname, name, value, size, flags);
2968 assert(fchdir(lo->root.fd) == 0);
2971 saverr = ret == -1 ? errno : 0;
2973 out:
2974 if (fd >= 0) {
2975 close(fd);
2978 lo_inode_put(lo, &inode);
2979 g_free(mapped_name);
2980 fuse_reply_err(req, saverr);
2983 static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name)
2985 char procname[64];
2986 const char *name;
2987 char *mapped_name;
2988 struct lo_data *lo = lo_data(req);
2989 struct lo_inode *inode;
2990 ssize_t ret;
2991 int saverr;
2992 int fd = -1;
2994 mapped_name = NULL;
2995 name = in_name;
2996 if (lo->xattrmap) {
2997 ret = xattr_map_client(lo, in_name, &mapped_name);
2998 if (ret < 0) {
2999 fuse_reply_err(req, -ret);
3000 return;
3002 if (mapped_name) {
3003 name = mapped_name;
3007 inode = lo_inode(req, ino);
3008 if (!inode) {
3009 fuse_reply_err(req, EBADF);
3010 g_free(mapped_name);
3011 return;
3014 saverr = ENOSYS;
3015 if (!lo_data(req)->xattr) {
3016 goto out;
3019 fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino,
3020 name);
3022 sprintf(procname, "%i", inode->fd);
3023 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
3024 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
3025 if (fd < 0) {
3026 saverr = errno;
3027 goto out;
3029 ret = fremovexattr(fd, name);
3030 } else {
3031 /* fchdir should not fail here */
3032 assert(fchdir(lo->proc_self_fd) == 0);
3033 ret = removexattr(procname, name);
3034 assert(fchdir(lo->root.fd) == 0);
3037 saverr = ret == -1 ? errno : 0;
3039 out:
3040 if (fd >= 0) {
3041 close(fd);
3044 lo_inode_put(lo, &inode);
3045 g_free(mapped_name);
3046 fuse_reply_err(req, saverr);
#ifdef HAVE_COPY_FILE_RANGE
/*
 * FUSE copy_file_range handler: copies up to len bytes between the file
 * descriptors behind fi_in and fi_out using copy_file_range(2) and replies
 * with the number of bytes copied, or with errno on failure.
 */
static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
                               struct fuse_file_info *fi_in, fuse_ino_t ino_out,
                               off_t off_out, struct fuse_file_info *fi_out,
                               size_t len, int flags)
{
    int in_fd, out_fd;
    ssize_t res;

    in_fd = lo_fi_fd(req, fi_in);
    out_fd = lo_fi_fd(req, fi_out);

    /*
     * off_t is a signed type whose width is not necessarily that of long,
     * so print it through intmax_t rather than with "%lu".
     */
    fuse_log(FUSE_LOG_DEBUG,
             "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, "
             "off=%jd, ino=%" PRIu64 "/fd=%d, "
             "off=%jd, size=%zd, flags=0x%x)\n",
             ino_in, in_fd, (intmax_t)off_in, ino_out, out_fd,
             (intmax_t)off_out, len, flags);

    res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags);
    if (res < 0) {
        fuse_reply_err(req, errno);
    } else {
        fuse_reply_write(req, res);
    }
}
#endif
3076 static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
3077 struct fuse_file_info *fi)
3079 off_t res;
3081 (void)ino;
3082 res = lseek(lo_fi_fd(req, fi), off, whence);
3083 if (res != -1) {
3084 fuse_reply_lseek(req, res);
3085 } else {
3086 fuse_reply_err(req, errno);
3090 static void lo_destroy(void *userdata)
3092 struct lo_data *lo = (struct lo_data *)userdata;
3094 pthread_mutex_lock(&lo->mutex);
3095 while (true) {
3096 GHashTableIter iter;
3097 gpointer key, value;
3099 g_hash_table_iter_init(&iter, lo->inodes);
3100 if (!g_hash_table_iter_next(&iter, &key, &value)) {
3101 break;
3104 struct lo_inode *inode = value;
3105 unref_inode(lo, inode, inode->nlookup);
3107 pthread_mutex_unlock(&lo->mutex);
3110 static struct fuse_lowlevel_ops lo_oper = {
3111 .init = lo_init,
3112 .lookup = lo_lookup,
3113 .mkdir = lo_mkdir,
3114 .mknod = lo_mknod,
3115 .symlink = lo_symlink,
3116 .link = lo_link,
3117 .unlink = lo_unlink,
3118 .rmdir = lo_rmdir,
3119 .rename = lo_rename,
3120 .forget = lo_forget,
3121 .forget_multi = lo_forget_multi,
3122 .getattr = lo_getattr,
3123 .setattr = lo_setattr,
3124 .readlink = lo_readlink,
3125 .opendir = lo_opendir,
3126 .readdir = lo_readdir,
3127 .readdirplus = lo_readdirplus,
3128 .releasedir = lo_releasedir,
3129 .fsyncdir = lo_fsyncdir,
3130 .create = lo_create,
3131 .getlk = lo_getlk,
3132 .setlk = lo_setlk,
3133 .open = lo_open,
3134 .release = lo_release,
3135 .flush = lo_flush,
3136 .fsync = lo_fsync,
3137 .read = lo_read,
3138 .write_buf = lo_write_buf,
3139 .statfs = lo_statfs,
3140 .fallocate = lo_fallocate,
3141 .flock = lo_flock,
3142 .getxattr = lo_getxattr,
3143 .listxattr = lo_listxattr,
3144 .setxattr = lo_setxattr,
3145 .removexattr = lo_removexattr,
3146 #ifdef HAVE_COPY_FILE_RANGE
3147 .copy_file_range = lo_copy_file_range,
3148 #endif
3149 .lseek = lo_lseek,
3150 .destroy = lo_destroy,
/* Write the vhost-user.json backend program capabilities to stdout */
static void print_capabilities(void)
{
    printf("{\n"
           " \"type\": \"fs\"\n"
           "}\n");
}
3162 * Drop all Linux capabilities because the wait parent process only needs to
3163 * sit in waitpid(2) and terminate.
3165 static void setup_wait_parent_capabilities(void)
3167 capng_setpid(syscall(SYS_gettid));
3168 capng_clear(CAPNG_SELECT_BOTH);
3169 capng_apply(CAPNG_SELECT_BOTH);
3173 * Move to a new mount, net, and pid namespaces to isolate this process.
3175 static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
3177 pid_t child;
3180 * Create a new pid namespace for *child* processes. We'll have to
3181 * fork in order to enter the new pid namespace. A new mount namespace
3182 * is also needed so that we can remount /proc for the new pid
3183 * namespace.
3185 * Our UNIX domain sockets have been created. Now we can move to
3186 * an empty network namespace to prevent TCP/IP and other network
3187 * activity in case this process is compromised.
3189 if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) {
3190 fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
3191 exit(1);
3194 child = fork();
3195 if (child < 0) {
3196 fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
3197 exit(1);
3199 if (child > 0) {
3200 pid_t waited;
3201 int wstatus;
3203 setup_wait_parent_capabilities();
3205 /* The parent waits for the child */
3206 do {
3207 waited = waitpid(child, &wstatus, 0);
3208 } while (waited < 0 && errno == EINTR && !se->exited);
3210 /* We were terminated by a signal, see fuse_signals.c */
3211 if (se->exited) {
3212 exit(0);
3215 if (WIFEXITED(wstatus)) {
3216 exit(WEXITSTATUS(wstatus));
3219 exit(1);
3222 /* Send us SIGTERM when the parent thread terminates, see prctl(2) */
3223 prctl(PR_SET_PDEATHSIG, SIGTERM);
3226 * If the mounts have shared propagation then we want to opt out so our
3227 * mount changes don't affect the parent mount namespace.
3229 if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) {
3230 fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
3231 exit(1);
3234 /* The child must remount /proc to use the new pid namespace */
3235 if (mount("proc", "/proc", "proc",
3236 MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
3237 fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
3238 exit(1);
3242 * We only need /proc/self/fd. Prevent ".." from accessing parent
3243 * directories of /proc/self/fd by bind-mounting it over /proc. Since / was
3244 * previously remounted with MS_REC | MS_SLAVE this mount change only
3245 * affects our process.
3247 if (mount("/proc/self/fd", "/proc", NULL, MS_BIND, NULL) < 0) {
3248 fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, MS_BIND): %m\n");
3249 exit(1);
3252 /* Get the /proc (actually /proc/self/fd, see above) file descriptor */
3253 lo->proc_self_fd = open("/proc", O_PATH);
3254 if (lo->proc_self_fd == -1) {
3255 fuse_log(FUSE_LOG_ERR, "open(/proc, O_PATH): %m\n");
3256 exit(1);
3261 * Capture the capability state, we'll need to restore this for individual
3262 * threads later; see load_capng.
3264 static void setup_capng(void)
3266 /* Note this accesses /proc so has to happen before the sandbox */
3267 if (capng_get_caps_process()) {
3268 fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n");
3269 exit(1);
3271 pthread_mutex_init(&cap.mutex, NULL);
3272 pthread_mutex_lock(&cap.mutex);
3273 cap.saved = capng_save_state();
3274 if (!cap.saved) {
3275 fuse_log(FUSE_LOG_ERR, "capng_save_state\n");
3276 exit(1);
3278 pthread_mutex_unlock(&cap.mutex);
3281 static void cleanup_capng(void)
3283 free(cap.saved);
3284 cap.saved = NULL;
3285 pthread_mutex_destroy(&cap.mutex);
3290 * Make the source directory our root so symlinks cannot escape and no other
3291 * files are accessible. Assumes unshare(CLONE_NEWNS) was already called.
3293 static void setup_mounts(const char *source)
3295 int oldroot;
3296 int newroot;
3298 if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) {
3299 fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
3300 exit(1);
3303 /* This magic is based on lxc's lxc_pivot_root() */
3304 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
3305 if (oldroot < 0) {
3306 fuse_log(FUSE_LOG_ERR, "open(/): %m\n");
3307 exit(1);
3310 newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
3311 if (newroot < 0) {
3312 fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source);
3313 exit(1);
3316 if (fchdir(newroot) < 0) {
3317 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
3318 exit(1);
3321 if (syscall(__NR_pivot_root, ".", ".") < 0) {
3322 fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n");
3323 exit(1);
3326 if (fchdir(oldroot) < 0) {
3327 fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n");
3328 exit(1);
3331 if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) {
3332 fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n");
3333 exit(1);
3336 if (umount2(".", MNT_DETACH) < 0) {
3337 fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n");
3338 exit(1);
3341 if (fchdir(newroot) < 0) {
3342 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
3343 exit(1);
3346 close(newroot);
3347 close(oldroot);
3351 * Only keep capabilities in allowlist that are needed for file system operation
3352 * The (possibly NULL) modcaps_in string passed in is free'd before exit.
3354 static void setup_capabilities(char *modcaps_in)
3356 char *modcaps = modcaps_in;
3357 pthread_mutex_lock(&cap.mutex);
3358 capng_restore_state(&cap.saved);
3361 * Add to allowlist file system-related capabilities that are needed for a
3362 * file server to act like root. Drop everything else like networking and
3363 * sysadmin capabilities.
3365 * Exclusions:
3366 * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
3367 * and we don't support that.
3368 * 2. CAP_MAC_OVERRIDE is not included because it only seems to be
3369 * used by the Smack LSM. Omit it until there is demand for it.
3371 capng_setpid(syscall(SYS_gettid));
3372 capng_clear(CAPNG_SELECT_BOTH);
3373 if (capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE,
3374 CAP_CHOWN,
3375 CAP_DAC_OVERRIDE,
3376 CAP_FOWNER,
3377 CAP_FSETID,
3378 CAP_SETGID,
3379 CAP_SETUID,
3380 CAP_MKNOD,
3381 CAP_SETFCAP,
3382 -1)) {
3383 fuse_log(FUSE_LOG_ERR, "%s: capng_updatev failed\n", __func__);
3384 exit(1);
3388 * The modcaps option is a colon separated list of caps,
3389 * each preceded by either + or -.
3391 while (modcaps) {
3392 capng_act_t action;
3393 int cap;
3395 char *next = strchr(modcaps, ':');
3396 if (next) {
3397 *next = '\0';
3398 next++;
3401 switch (modcaps[0]) {
3402 case '+':
3403 action = CAPNG_ADD;
3404 break;
3406 case '-':
3407 action = CAPNG_DROP;
3408 break;
3410 default:
3411 fuse_log(FUSE_LOG_ERR,
3412 "%s: Expecting '+'/'-' in modcaps but found '%c'\n",
3413 __func__, modcaps[0]);
3414 exit(1);
3416 cap = capng_name_to_capability(modcaps + 1);
3417 if (cap < 0) {
3418 fuse_log(FUSE_LOG_ERR, "%s: Unknown capability '%s'\n", __func__,
3419 modcaps);
3420 exit(1);
3422 if (capng_update(action, CAPNG_PERMITTED | CAPNG_EFFECTIVE, cap)) {
3423 fuse_log(FUSE_LOG_ERR, "%s: capng_update failed for '%s'\n",
3424 __func__, modcaps);
3425 exit(1);
3428 modcaps = next;
3430 g_free(modcaps_in);
3432 if (capng_apply(CAPNG_SELECT_BOTH)) {
3433 fuse_log(FUSE_LOG_ERR, "%s: capng_apply failed\n", __func__);
3434 exit(1);
3437 cap.saved = capng_save_state();
3438 if (!cap.saved) {
3439 fuse_log(FUSE_LOG_ERR, "%s: capng_save_state failed\n", __func__);
3440 exit(1);
3442 pthread_mutex_unlock(&cap.mutex);
3446 * Use chroot as a weaker sandbox for environments where the process is
3447 * launched without CAP_SYS_ADMIN.
3449 static void setup_chroot(struct lo_data *lo)
3451 lo->proc_self_fd = open("/proc/self/fd", O_PATH);
3452 if (lo->proc_self_fd == -1) {
3453 fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/fd\", O_PATH): %m\n");
3454 exit(1);
3458 * Make the shared directory the file system root so that FUSE_OPEN
3459 * (lo_open()) cannot escape the shared directory by opening a symlink.
3461 * The chroot(2) syscall is later disabled by seccomp and the
3462 * CAP_SYS_CHROOT capability is dropped so that tampering with the chroot
3463 * is not possible.
3465 * However, it's still possible to escape the chroot via lo->proc_self_fd
3466 * but that requires first gaining control of the process.
3468 if (chroot(lo->source) != 0) {
3469 fuse_log(FUSE_LOG_ERR, "chroot(\"%s\"): %m\n", lo->source);
3470 exit(1);
3473 /* Move into the chroot */
3474 if (chdir("/") != 0) {
3475 fuse_log(FUSE_LOG_ERR, "chdir(\"/\"): %m\n");
3476 exit(1);
3481 * Lock down this process to prevent access to other processes or files outside
3482 * source directory. This reduces the impact of arbitrary code execution bugs.
3484 static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
3485 bool enable_syslog)
3487 if (lo->sandbox == SANDBOX_NAMESPACE) {
3488 setup_namespaces(lo, se);
3489 setup_mounts(lo->source);
3490 } else {
3491 setup_chroot(lo);
3494 setup_seccomp(enable_syslog);
3495 setup_capabilities(g_strdup(lo->modcaps));
3498 /* Set the maximum number of open file descriptors */
3499 static void setup_nofile_rlimit(unsigned long rlimit_nofile)
3501 struct rlimit rlim = {
3502 .rlim_cur = rlimit_nofile,
3503 .rlim_max = rlimit_nofile,
3506 if (rlimit_nofile == 0) {
3507 return; /* nothing to do */
3510 if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
3511 /* Ignore SELinux denials */
3512 if (errno == EPERM) {
3513 return;
3516 fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n");
3517 exit(1);
3521 static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
3523 g_autofree char *localfmt = NULL;
3524 struct timespec ts;
3525 struct tm tm;
3526 char sec_fmt[sizeof "2020-12-07 18:17:54"];
3527 char zone_fmt[sizeof "+0100"];
3529 if (current_log_level < level) {
3530 return;
3533 if (current_log_level == FUSE_LOG_DEBUG) {
3534 if (use_syslog) {
3535 /* no timestamp needed */
3536 localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid),
3537 fmt);
3538 } else {
3539 /* try formatting a broken-down timestamp */
3540 if (clock_gettime(CLOCK_REALTIME, &ts) != -1 &&
3541 localtime_r(&ts.tv_sec, &tm) != NULL &&
3542 strftime(sec_fmt, sizeof sec_fmt, "%Y-%m-%d %H:%M:%S",
3543 &tm) != 0 &&
3544 strftime(zone_fmt, sizeof zone_fmt, "%z", &tm) != 0) {
3545 localfmt = g_strdup_printf("[%s.%02ld%s] [ID: %08ld] %s",
3546 sec_fmt,
3547 ts.tv_nsec / (10L * 1000 * 1000),
3548 zone_fmt, syscall(__NR_gettid),
3549 fmt);
3550 } else {
3551 /* fall back to a flat timestamp */
3552 localfmt = g_strdup_printf("[%" PRId64 "] [ID: %08ld] %s",
3553 get_clock(), syscall(__NR_gettid),
3554 fmt);
3557 fmt = localfmt;
3560 if (use_syslog) {
3561 int priority = LOG_ERR;
3562 switch (level) {
3563 case FUSE_LOG_EMERG:
3564 priority = LOG_EMERG;
3565 break;
3566 case FUSE_LOG_ALERT:
3567 priority = LOG_ALERT;
3568 break;
3569 case FUSE_LOG_CRIT:
3570 priority = LOG_CRIT;
3571 break;
3572 case FUSE_LOG_ERR:
3573 priority = LOG_ERR;
3574 break;
3575 case FUSE_LOG_WARNING:
3576 priority = LOG_WARNING;
3577 break;
3578 case FUSE_LOG_NOTICE:
3579 priority = LOG_NOTICE;
3580 break;
3581 case FUSE_LOG_INFO:
3582 priority = LOG_INFO;
3583 break;
3584 case FUSE_LOG_DEBUG:
3585 priority = LOG_DEBUG;
3586 break;
3588 vsyslog(priority, fmt, ap);
3589 } else {
3590 vfprintf(stderr, fmt, ap);
3594 static void setup_root(struct lo_data *lo, struct lo_inode *root)
3596 int fd, res;
3597 struct stat stat;
3598 uint64_t mnt_id;
3600 fd = open("/", O_PATH);
3601 if (fd == -1) {
3602 fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source);
3603 exit(1);
3606 res = do_statx(lo, fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
3607 &mnt_id);
3608 if (res == -1) {
3609 fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source);
3610 exit(1);
3613 root->filetype = S_IFDIR;
3614 root->fd = fd;
3615 root->key.ino = stat.st_ino;
3616 root->key.dev = stat.st_dev;
3617 root->key.mnt_id = mnt_id;
3618 root->nlookup = 2;
3619 g_atomic_int_set(&root->refcount, 2);
3620 if (lo->posix_lock) {
3621 pthread_mutex_init(&root->plock_mutex, NULL);
3622 root->posix_locks = g_hash_table_new_full(
3623 g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
3627 static guint lo_key_hash(gconstpointer key)
3629 const struct lo_key *lkey = key;
3631 return (guint)lkey->ino + (guint)lkey->dev + (guint)lkey->mnt_id;
3634 static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
3636 const struct lo_key *la = a;
3637 const struct lo_key *lb = b;
3639 return la->ino == lb->ino && la->dev == lb->dev && la->mnt_id == lb->mnt_id;
3642 static void fuse_lo_data_cleanup(struct lo_data *lo)
3644 if (lo->inodes) {
3645 g_hash_table_destroy(lo->inodes);
3648 if (lo->root.posix_locks) {
3649 g_hash_table_destroy(lo->root.posix_locks);
3651 lo_map_destroy(&lo->fd_map);
3652 lo_map_destroy(&lo->dirp_map);
3653 lo_map_destroy(&lo->ino_map);
3655 if (lo->proc_self_fd >= 0) {
3656 close(lo->proc_self_fd);
3659 if (lo->root.fd >= 0) {
3660 close(lo->root.fd);
3663 free(lo->xattrmap);
3664 free_xattrmap(lo);
3665 free(lo->xattr_security_capability);
3666 free(lo->source);
3669 int main(int argc, char *argv[])
3671 struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
3672 struct fuse_session *se;
3673 struct fuse_cmdline_opts opts;
3674 struct lo_data lo = {
3675 .sandbox = SANDBOX_NAMESPACE,
3676 .debug = 0,
3677 .writeback = 0,
3678 .posix_lock = 0,
3679 .allow_direct_io = 0,
3680 .proc_self_fd = -1,
3681 .user_killpriv_v2 = -1,
3683 struct lo_map_elem *root_elem;
3684 struct lo_map_elem *reserve_elem;
3685 int ret = -1;
3687 /* Initialize time conversion information for localtime_r(). */
3688 tzset();
3690 /* Don't mask creation mode, kernel already did that */
3691 umask(0);
3693 qemu_init_exec_dir(argv[0]);
3695 pthread_mutex_init(&lo.mutex, NULL);
3696 lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
3697 lo.root.fd = -1;
3698 lo.root.fuse_ino = FUSE_ROOT_ID;
3699 lo.cache = CACHE_AUTO;
3702 * Set up the ino map like this:
3703 * [0] Reserved (will not be used)
3704 * [1] Root inode
3706 lo_map_init(&lo.ino_map);
3707 reserve_elem = lo_map_reserve(&lo.ino_map, 0);
3708 if (!reserve_elem) {
3709 fuse_log(FUSE_LOG_ERR, "failed to alloc reserve_elem.\n");
3710 goto err_out1;
3712 reserve_elem->in_use = false;
3713 root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino);
3714 if (!root_elem) {
3715 fuse_log(FUSE_LOG_ERR, "failed to alloc root_elem.\n");
3716 goto err_out1;
3718 root_elem->inode = &lo.root;
3720 lo_map_init(&lo.dirp_map);
3721 lo_map_init(&lo.fd_map);
3723 if (fuse_parse_cmdline(&args, &opts) != 0) {
3724 goto err_out1;
3726 fuse_set_log_func(log_func);
3727 use_syslog = opts.syslog;
3728 if (use_syslog) {
3729 openlog("virtiofsd", LOG_PID, LOG_DAEMON);
3732 if (opts.show_help) {
3733 printf("usage: %s [options]\n\n", argv[0]);
3734 fuse_cmdline_help();
3735 printf(" -o source=PATH shared directory tree\n");
3736 fuse_lowlevel_help();
3737 ret = 0;
3738 goto err_out1;
3739 } else if (opts.show_version) {
3740 fuse_lowlevel_version();
3741 ret = 0;
3742 goto err_out1;
3743 } else if (opts.print_capabilities) {
3744 print_capabilities();
3745 ret = 0;
3746 goto err_out1;
3749 if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) {
3750 goto err_out1;
3753 if (opts.log_level != 0) {
3754 current_log_level = opts.log_level;
3755 } else {
3756 /* default log level is INFO */
3757 current_log_level = FUSE_LOG_INFO;
3759 lo.debug = opts.debug;
3760 if (lo.debug) {
3761 current_log_level = FUSE_LOG_DEBUG;
3763 if (lo.source) {
3764 struct stat stat;
3765 int res;
3767 res = lstat(lo.source, &stat);
3768 if (res == -1) {
3769 fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n",
3770 lo.source);
3771 exit(1);
3773 if (!S_ISDIR(stat.st_mode)) {
3774 fuse_log(FUSE_LOG_ERR, "source is not a directory\n");
3775 exit(1);
3777 } else {
3778 lo.source = strdup("/");
3779 if (!lo.source) {
3780 fuse_log(FUSE_LOG_ERR, "failed to strdup source\n");
3781 goto err_out1;
3785 if (lo.xattrmap) {
3786 parse_xattrmap(&lo);
3789 if (!lo.timeout_set) {
3790 switch (lo.cache) {
3791 case CACHE_NONE:
3792 lo.timeout = 0.0;
3793 break;
3795 case CACHE_AUTO:
3796 lo.timeout = 1.0;
3797 break;
3799 case CACHE_ALWAYS:
3800 lo.timeout = 86400.0;
3801 break;
3803 } else if (lo.timeout < 0) {
3804 fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout);
3805 exit(1);
3808 lo.use_statx = true;
3810 se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
3811 if (se == NULL) {
3812 goto err_out1;
3815 if (fuse_set_signal_handlers(se) != 0) {
3816 goto err_out2;
3819 if (fuse_session_mount(se) != 0) {
3820 goto err_out3;
3823 fuse_daemonize(opts.foreground);
3825 setup_nofile_rlimit(opts.rlimit_nofile);
3827 /* Must be before sandbox since it wants /proc */
3828 setup_capng();
3830 setup_sandbox(&lo, se, opts.syslog);
3832 setup_root(&lo, &lo.root);
3833 /* Block until ctrl+c or fusermount -u */
3834 ret = virtio_loop(se);
3836 fuse_session_unmount(se);
3837 cleanup_capng();
3838 err_out3:
3839 fuse_remove_signal_handlers(se);
3840 err_out2:
3841 fuse_session_destroy(se);
3842 err_out1:
3843 fuse_opt_free_args(&args);
3845 fuse_lo_data_cleanup(&lo);
3847 return ret ? 1 : 0;