virtiofsd: Announce sub-mount points
[qemu.git] / tools / virtiofsd / passthrough_ll.c
blob4db50046d49f88f5e3026e62813f467ca3dc8d89
1 /*
2 * FUSE: Filesystem in Userspace
3 * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
5 * This program can be distributed under the terms of the GNU GPLv2.
6 * See the file COPYING.
7 */
9 /*
11 * This file system mirrors the existing file system hierarchy of the
12 * system, starting at the root file system. This is implemented by
13 * just "passing through" all requests to the corresponding user-space
14 * libc functions. In contrast to passthrough.c and passthrough_fh.c,
15 * this implementation uses the low-level API. Its performance should
16 * be the least bad among the three, but many operations are not
17 * implemented. In particular, it is not possible to remove files (or
18 * directories) because the code necessary to defer actual removal
19 * until the file is not opened anymore would make the example much
20 * more complicated.
22 * When writeback caching is enabled (-o writeback mount option), it
23 * is only possible to write to files for which the mounting user has
24 * read permissions. This is because the writeback cache requires the
25 * kernel to be able to issue read requests for all files (which the
26 * passthrough filesystem cannot satisfy if it can't read the file in
27 * the underlying filesystem).
29 * Compile with:
31 * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
32 * passthrough_ll
34 * ## Source code ##
35 * \include passthrough_ll.c
38 #include "qemu/osdep.h"
39 #include "qemu/timer.h"
40 #include "fuse_virtio.h"
41 #include "fuse_log.h"
42 #include "fuse_lowlevel.h"
43 #include "standard-headers/linux/fuse.h"
44 #include <assert.h>
45 #include <cap-ng.h>
46 #include <dirent.h>
47 #include <errno.h>
48 #include <glib.h>
49 #include <inttypes.h>
50 #include <limits.h>
51 #include <pthread.h>
52 #include <stdbool.h>
53 #include <stddef.h>
54 #include <stdio.h>
55 #include <stdlib.h>
56 #include <string.h>
57 #include <sys/file.h>
58 #include <sys/mount.h>
59 #include <sys/prctl.h>
60 #include <sys/resource.h>
61 #include <sys/syscall.h>
62 #include <sys/types.h>
63 #include <sys/wait.h>
64 #include <sys/xattr.h>
65 #include <syslog.h>
66 #include <unistd.h>
68 #include "qemu/cutils.h"
69 #include "passthrough_helpers.h"
70 #include "passthrough_seccomp.h"
/*
 * POSIX lock bookkeeping: one record per lock owner of an inode.
 */
struct lo_inode_plock {
    /* Identifier of the lock owner this record belongs to */
    uint64_t lock_owner;
    /* fd for OFD locks */
    int fd;
};
/*
 * One slot of a struct lo_map.
 *
 * While in_use is true, the union holds the mapped object (which member is
 * valid depends on the map the slot belongs to). While in_use is false, the
 * slot is on the map's free list and `freelist` is the index of the next free
 * slot, or -1 at the end of the list.
 */
struct lo_map_elem {
    union {
        struct lo_inode *inode;
        struct lo_dirp *dirp;
        int fd;
        ssize_t freelist;
    };
    bool in_use;
};
/*
 * Maps FUSE fh or ino values to internal objects.
 *
 * The key is the index into `elems`. Unused slots are chained through
 * their `freelist` member, starting at `freelist` here (-1 when empty).
 */
struct lo_map {
    struct lo_map_elem *elems; /* array of nelems slots */
    size_t nelems;             /* number of slots in elems */
    ssize_t freelist;          /* index of first free slot, or -1 */
};
/* Hash key identifying a host inode: inode number plus device number. */
struct lo_key {
    ino_t ino;
    dev_t dev;
};
100 struct lo_inode {
101 int fd;
104 * Atomic reference count for this object. The nlookup field holds a
105 * reference and release it when nlookup reaches 0.
107 gint refcount;
109 struct lo_key key;
112 * This counter keeps the inode alive during the FUSE session.
113 * Incremented when the FUSE inode number is sent in a reply
114 * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is
115 * released by requests like FUSE_FORGET, FUSE_RMDIR, FUSE_RENAME, etc.
117 * Note that this value is untrusted because the client can manipulate
118 * it arbitrarily using FUSE_FORGET requests.
120 * Protected by lo->mutex.
122 uint64_t nlookup;
124 fuse_ino_t fuse_ino;
125 pthread_mutex_t plock_mutex;
126 GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
128 mode_t filetype;
131 * So we can detect crossmount roots
132 * (As such, this only needs to be valid for directories. Note
133 * that files can have multiple parents due to hard links, and so
134 * their parent_dev may fluctuate.)
136 dev_t parent_dev;
/* Effective credentials saved while temporarily acting as the FUSE caller. */
struct lo_cred {
    uid_t euid;
    gid_t egid;
};
/* Values stored in lo_data.cache by the cache=none|auto|always options. */
enum {
    CACHE_NONE,
    CACHE_AUTO,
    CACHE_ALWAYS,
};
/* Values stored in lo_data.sandbox by the sandbox=namespace|chroot options. */
enum {
    SANDBOX_NAMESPACE,
    SANDBOX_CHROOT,
};
/* One entry of the xattr mapping list (see lo_data.xattr_map_list). */
typedef struct xattr_map_entry {
    char *key;
    char *prepend;
    unsigned int flags;
} XattrMapEntry;
161 struct lo_data {
162 pthread_mutex_t mutex;
163 int sandbox;
164 int debug;
165 int writeback;
166 int flock;
167 int posix_lock;
168 int xattr;
169 char *xattrmap;
170 char *source;
171 char *modcaps;
172 double timeout;
173 int cache;
174 int timeout_set;
175 int readdirplus_set;
176 int readdirplus_clear;
177 int announce_submounts;
178 int allow_direct_io;
179 struct lo_inode root;
180 GHashTable *inodes; /* protected by lo->mutex */
181 struct lo_map ino_map; /* protected by lo->mutex */
182 struct lo_map dirp_map; /* protected by lo->mutex */
183 struct lo_map fd_map; /* protected by lo->mutex */
184 XattrMapEntry *xattr_map_list;
185 size_t xattr_map_nentries;
187 /* An O_PATH file descriptor to /proc/self/fd/ */
188 int proc_self_fd;
191 static const struct fuse_opt lo_opts[] = {
192 { "sandbox=namespace",
193 offsetof(struct lo_data, sandbox),
194 SANDBOX_NAMESPACE },
195 { "sandbox=chroot",
196 offsetof(struct lo_data, sandbox),
197 SANDBOX_CHROOT },
198 { "writeback", offsetof(struct lo_data, writeback), 1 },
199 { "no_writeback", offsetof(struct lo_data, writeback), 0 },
200 { "source=%s", offsetof(struct lo_data, source), 0 },
201 { "flock", offsetof(struct lo_data, flock), 1 },
202 { "no_flock", offsetof(struct lo_data, flock), 0 },
203 { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
204 { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
205 { "xattr", offsetof(struct lo_data, xattr), 1 },
206 { "no_xattr", offsetof(struct lo_data, xattr), 0 },
207 { "xattrmap=%s", offsetof(struct lo_data, xattrmap), 0 },
208 { "modcaps=%s", offsetof(struct lo_data, modcaps), 0 },
209 { "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
210 { "timeout=", offsetof(struct lo_data, timeout_set), 1 },
211 { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
212 { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
213 { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
214 { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
215 { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
216 { "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 },
217 { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 },
218 { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 },
219 FUSE_OPT_END
/* Logging configuration shared by the whole daemon */
static bool use_syslog = false;
static int current_log_level;

/* Forward declarations for helpers defined further down in this file */
static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
                                 uint64_t n);
static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st);

/* Saved cap-ng state together with the mutex that guards it */
static struct {
    pthread_mutex_t mutex;
    void *saved;
} cap;

/* That we loaded cap-ng in the current thread from the saved */
static __thread bool cap_loaded = false;
/* Return 1 if @name is exactly "." or "..", 0 otherwise. */
static int is_dot_or_dotdot(const char *name)
{
    if (name[0] != '.') {
        return 0;
    }
    if (name[1] == '\0') {
        return 1;
    }
    return name[1] == '.' && name[2] == '\0';
}
/*
 * Is `path` a single path component that is not "." or ".."?
 * Returns non-zero if so, 0 if it contains a '/' or is "." / "..".
 */
static int is_safe_path_component(const char *path)
{
    return strchr(path, '/') == NULL && !is_dot_or_dotdot(path);
}
251 static struct lo_data *lo_data(fuse_req_t req)
253 return (struct lo_data *)fuse_req_userdata(req);
257 * Load capng's state from our saved state if the current thread
258 * hadn't previously been loaded.
259 * returns 0 on success
261 static int load_capng(void)
263 if (!cap_loaded) {
264 pthread_mutex_lock(&cap.mutex);
265 capng_restore_state(&cap.saved);
267 * restore_state free's the saved copy
268 * so make another.
270 cap.saved = capng_save_state();
271 if (!cap.saved) {
272 pthread_mutex_unlock(&cap.mutex);
273 fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
274 return -EINVAL;
276 pthread_mutex_unlock(&cap.mutex);
279 * We want to use the loaded state for our pid,
280 * not the original
282 capng_setpid(syscall(SYS_gettid));
283 cap_loaded = true;
285 return 0;
289 * Helpers for dropping and regaining effective capabilities. Returns 0
290 * on success, error otherwise
292 static int drop_effective_cap(const char *cap_name, bool *cap_dropped)
294 int cap, ret;
296 cap = capng_name_to_capability(cap_name);
297 if (cap < 0) {
298 ret = errno;
299 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
300 cap_name, strerror(errno));
301 goto out;
304 if (load_capng()) {
305 ret = errno;
306 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
307 goto out;
310 /* We dont have this capability in effective set already. */
311 if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) {
312 ret = 0;
313 goto out;
316 if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) {
317 ret = errno;
318 fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n");
319 goto out;
322 if (capng_apply(CAPNG_SELECT_CAPS)) {
323 ret = errno;
324 fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n");
325 goto out;
328 ret = 0;
329 if (cap_dropped) {
330 *cap_dropped = true;
333 out:
334 return ret;
337 static int gain_effective_cap(const char *cap_name)
339 int cap;
340 int ret = 0;
342 cap = capng_name_to_capability(cap_name);
343 if (cap < 0) {
344 ret = errno;
345 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
346 cap_name, strerror(errno));
347 goto out;
350 if (load_capng()) {
351 ret = errno;
352 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
353 goto out;
356 if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) {
357 ret = errno;
358 fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n");
359 goto out;
362 if (capng_apply(CAPNG_SELECT_CAPS)) {
363 ret = errno;
364 fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n");
365 goto out;
367 ret = 0;
369 out:
370 return ret;
373 static void lo_map_init(struct lo_map *map)
375 map->elems = NULL;
376 map->nelems = 0;
377 map->freelist = -1;
380 static void lo_map_destroy(struct lo_map *map)
382 free(map->elems);
385 static int lo_map_grow(struct lo_map *map, size_t new_nelems)
387 struct lo_map_elem *new_elems;
388 size_t i;
390 if (new_nelems <= map->nelems) {
391 return 1;
394 new_elems = realloc(map->elems, sizeof(map->elems[0]) * new_nelems);
395 if (!new_elems) {
396 return 0;
399 for (i = map->nelems; i < new_nelems; i++) {
400 new_elems[i].freelist = i + 1;
401 new_elems[i].in_use = false;
403 new_elems[new_nelems - 1].freelist = -1;
405 map->elems = new_elems;
406 map->freelist = map->nelems;
407 map->nelems = new_nelems;
408 return 1;
411 static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map)
413 struct lo_map_elem *elem;
415 if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) {
416 return NULL;
419 elem = &map->elems[map->freelist];
420 map->freelist = elem->freelist;
422 elem->in_use = true;
424 return elem;
427 static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key)
429 ssize_t *prev;
431 if (!lo_map_grow(map, key + 1)) {
432 return NULL;
435 for (prev = &map->freelist; *prev != -1;
436 prev = &map->elems[*prev].freelist) {
437 if (*prev == key) {
438 struct lo_map_elem *elem = &map->elems[key];
440 *prev = elem->freelist;
441 elem->in_use = true;
442 return elem;
445 return NULL;
448 static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key)
450 if (key >= map->nelems) {
451 return NULL;
453 if (!map->elems[key].in_use) {
454 return NULL;
456 return &map->elems[key];
459 static void lo_map_remove(struct lo_map *map, size_t key)
461 struct lo_map_elem *elem;
463 if (key >= map->nelems) {
464 return;
467 elem = &map->elems[key];
468 if (!elem->in_use) {
469 return;
472 elem->in_use = false;
474 elem->freelist = map->freelist;
475 map->freelist = key;
478 /* Assumes lo->mutex is held */
479 static ssize_t lo_add_fd_mapping(fuse_req_t req, int fd)
481 struct lo_map_elem *elem;
483 elem = lo_map_alloc_elem(&lo_data(req)->fd_map);
484 if (!elem) {
485 return -1;
488 elem->fd = fd;
489 return elem - lo_data(req)->fd_map.elems;
492 /* Assumes lo->mutex is held */
493 static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp)
495 struct lo_map_elem *elem;
497 elem = lo_map_alloc_elem(&lo_data(req)->dirp_map);
498 if (!elem) {
499 return -1;
502 elem->dirp = dirp;
503 return elem - lo_data(req)->dirp_map.elems;
506 /* Assumes lo->mutex is held */
507 static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode)
509 struct lo_map_elem *elem;
511 elem = lo_map_alloc_elem(&lo_data(req)->ino_map);
512 if (!elem) {
513 return -1;
516 elem->inode = inode;
517 return elem - lo_data(req)->ino_map.elems;
520 static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
522 struct lo_inode *inode = *inodep;
524 if (!inode) {
525 return;
528 *inodep = NULL;
530 if (g_atomic_int_dec_and_test(&inode->refcount)) {
531 close(inode->fd);
532 free(inode);
536 /* Caller must release refcount using lo_inode_put() */
537 static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino)
539 struct lo_data *lo = lo_data(req);
540 struct lo_map_elem *elem;
542 pthread_mutex_lock(&lo->mutex);
543 elem = lo_map_get(&lo->ino_map, ino);
544 if (elem) {
545 g_atomic_int_inc(&elem->inode->refcount);
547 pthread_mutex_unlock(&lo->mutex);
549 if (!elem) {
550 return NULL;
553 return elem->inode;
557 * TODO Remove this helper and force callers to hold an inode refcount until
558 * they are done with the fd. This will be done in a later patch to make
559 * review easier.
561 static int lo_fd(fuse_req_t req, fuse_ino_t ino)
563 struct lo_inode *inode = lo_inode(req, ino);
564 int fd;
566 if (!inode) {
567 return -1;
570 fd = inode->fd;
571 lo_inode_put(lo_data(req), &inode);
572 return fd;
575 static void lo_init(void *userdata, struct fuse_conn_info *conn)
577 struct lo_data *lo = (struct lo_data *)userdata;
579 if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) {
580 conn->want |= FUSE_CAP_EXPORT_SUPPORT;
583 if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
584 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n");
585 conn->want |= FUSE_CAP_WRITEBACK_CACHE;
587 if (conn->capable & FUSE_CAP_FLOCK_LOCKS) {
588 if (lo->flock) {
589 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
590 conn->want |= FUSE_CAP_FLOCK_LOCKS;
591 } else {
592 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n");
593 conn->want &= ~FUSE_CAP_FLOCK_LOCKS;
597 if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
598 if (lo->posix_lock) {
599 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
600 conn->want |= FUSE_CAP_POSIX_LOCKS;
601 } else {
602 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
603 conn->want &= ~FUSE_CAP_POSIX_LOCKS;
607 if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
608 lo->readdirplus_clear) {
609 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
610 conn->want &= ~FUSE_CAP_READDIRPLUS;
615 * Call fstatat() and set st_rdev whenever a directory's st_dev
616 * differs from the rparent's st_dev (@parent_dev). This will
617 * announce submounts to the FUSE client (unless @announce_submounts
618 * is false).
620 static int do_fstatat(int dirfd, const char *pathname, struct stat *statbuf,
621 int flags, dev_t parent_dev, uint32_t *fuse_attr_flags)
623 int res = fstatat(dirfd, pathname, statbuf, flags);
624 if (res == -1) {
625 return res;
628 if (statbuf->st_dev != parent_dev && S_ISDIR(statbuf->st_mode) &&
629 fuse_attr_flags)
631 *fuse_attr_flags |= FUSE_ATTR_SUBMOUNT;
634 return 0;
637 static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
638 struct fuse_file_info *fi)
640 int res;
641 struct stat buf;
642 struct lo_data *lo = lo_data(req);
643 struct lo_inode *inode = lo_inode(req, ino);
644 uint32_t fuse_attr_flags = 0;
646 (void)fi;
648 res = do_fstatat(inode->fd, "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
649 inode->parent_dev, &fuse_attr_flags);
650 lo_inode_put(lo, &inode);
651 if (res == -1) {
652 return (void)fuse_reply_err(req, errno);
655 if (!lo->announce_submounts) {
656 fuse_attr_flags &= ~FUSE_ATTR_SUBMOUNT;
659 fuse_reply_attr_with_flags(req, &buf, lo->timeout, fuse_attr_flags);
662 static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi)
664 struct lo_data *lo = lo_data(req);
665 struct lo_map_elem *elem;
667 pthread_mutex_lock(&lo->mutex);
668 elem = lo_map_get(&lo->fd_map, fi->fh);
669 pthread_mutex_unlock(&lo->mutex);
671 if (!elem) {
672 return -1;
675 return elem->fd;
678 static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
679 int valid, struct fuse_file_info *fi)
681 int saverr;
682 char procname[64];
683 struct lo_data *lo = lo_data(req);
684 struct lo_inode *inode;
685 int ifd;
686 int res;
687 int fd = -1;
689 inode = lo_inode(req, ino);
690 if (!inode) {
691 fuse_reply_err(req, EBADF);
692 return;
695 ifd = inode->fd;
697 /* If fi->fh is invalid we'll report EBADF later */
698 if (fi) {
699 fd = lo_fi_fd(req, fi);
702 if (valid & FUSE_SET_ATTR_MODE) {
703 if (fi) {
704 res = fchmod(fd, attr->st_mode);
705 } else {
706 sprintf(procname, "%i", ifd);
707 res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0);
709 if (res == -1) {
710 goto out_err;
713 if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
714 uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1;
715 gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1;
717 res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
718 if (res == -1) {
719 goto out_err;
722 if (valid & FUSE_SET_ATTR_SIZE) {
723 int truncfd;
725 if (fi) {
726 truncfd = fd;
727 } else {
728 sprintf(procname, "%i", ifd);
729 truncfd = openat(lo->proc_self_fd, procname, O_RDWR);
730 if (truncfd < 0) {
731 goto out_err;
735 res = ftruncate(truncfd, attr->st_size);
736 if (!fi) {
737 saverr = errno;
738 close(truncfd);
739 errno = saverr;
741 if (res == -1) {
742 goto out_err;
745 if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) {
746 struct timespec tv[2];
748 tv[0].tv_sec = 0;
749 tv[1].tv_sec = 0;
750 tv[0].tv_nsec = UTIME_OMIT;
751 tv[1].tv_nsec = UTIME_OMIT;
753 if (valid & FUSE_SET_ATTR_ATIME_NOW) {
754 tv[0].tv_nsec = UTIME_NOW;
755 } else if (valid & FUSE_SET_ATTR_ATIME) {
756 tv[0] = attr->st_atim;
759 if (valid & FUSE_SET_ATTR_MTIME_NOW) {
760 tv[1].tv_nsec = UTIME_NOW;
761 } else if (valid & FUSE_SET_ATTR_MTIME) {
762 tv[1] = attr->st_mtim;
765 if (fi) {
766 res = futimens(fd, tv);
767 } else {
768 sprintf(procname, "%i", inode->fd);
769 res = utimensat(lo->proc_self_fd, procname, tv, 0);
771 if (res == -1) {
772 goto out_err;
775 lo_inode_put(lo, &inode);
777 return lo_getattr(req, ino, fi);
779 out_err:
780 saverr = errno;
781 lo_inode_put(lo, &inode);
782 fuse_reply_err(req, saverr);
785 static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st)
787 struct lo_inode *p;
788 struct lo_key key = {
789 .ino = st->st_ino,
790 .dev = st->st_dev,
793 pthread_mutex_lock(&lo->mutex);
794 p = g_hash_table_lookup(lo->inodes, &key);
795 if (p) {
796 assert(p->nlookup > 0);
797 p->nlookup++;
798 g_atomic_int_inc(&p->refcount);
800 pthread_mutex_unlock(&lo->mutex);
802 return p;
805 /* value_destroy_func for posix_locks GHashTable */
806 static void posix_locks_value_destroy(gpointer data)
808 struct lo_inode_plock *plock = data;
811 * We had used open() for locks and had only one fd. So
812 * closing this fd should release all OFD locks.
814 close(plock->fd);
815 free(plock);
819 * Increments nlookup and caller must release refcount using
820 * lo_inode_put(&parent).
822 static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
823 struct fuse_entry_param *e)
825 int newfd;
826 int res;
827 int saverr;
828 struct lo_data *lo = lo_data(req);
829 struct lo_inode *inode = NULL;
830 struct lo_inode *dir = lo_inode(req, parent);
833 * name_to_handle_at() and open_by_handle_at() can reach here with fuse
834 * mount point in guest, but we don't have its inode info in the
835 * ino_map.
837 if (!dir) {
838 return ENOENT;
841 memset(e, 0, sizeof(*e));
842 e->attr_timeout = lo->timeout;
843 e->entry_timeout = lo->timeout;
845 /* Do not allow escaping root directory */
846 if (dir == &lo->root && strcmp(name, "..") == 0) {
847 name = ".";
850 newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
851 if (newfd == -1) {
852 goto out_err;
855 res = do_fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
856 dir->key.dev, &e->attr_flags);
857 if (res == -1) {
858 goto out_err;
861 if (!lo->announce_submounts) {
862 e->attr_flags &= ~FUSE_ATTR_SUBMOUNT;
865 inode = lo_find(lo, &e->attr);
866 if (inode) {
867 close(newfd);
868 } else {
869 inode = calloc(1, sizeof(struct lo_inode));
870 if (!inode) {
871 goto out_err;
874 /* cache only filetype */
875 inode->filetype = (e->attr.st_mode & S_IFMT);
878 * One for the caller and one for nlookup (released in
879 * unref_inode_lolocked())
881 g_atomic_int_set(&inode->refcount, 2);
883 inode->nlookup = 1;
884 inode->fd = newfd;
885 inode->key.ino = e->attr.st_ino;
886 inode->key.dev = e->attr.st_dev;
887 pthread_mutex_init(&inode->plock_mutex, NULL);
888 inode->posix_locks = g_hash_table_new_full(
889 g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
891 pthread_mutex_lock(&lo->mutex);
892 inode->fuse_ino = lo_add_inode_mapping(req, inode);
893 g_hash_table_insert(lo->inodes, &inode->key, inode);
894 pthread_mutex_unlock(&lo->mutex);
896 inode->parent_dev = dir->key.dev;
897 e->ino = inode->fuse_ino;
898 lo_inode_put(lo, &inode);
899 lo_inode_put(lo, &dir);
901 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
902 name, (unsigned long long)e->ino);
904 return 0;
906 out_err:
907 saverr = errno;
908 if (newfd != -1) {
909 close(newfd);
911 lo_inode_put(lo, &inode);
912 lo_inode_put(lo, &dir);
913 return saverr;
916 static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
918 struct fuse_entry_param e;
919 int err;
921 fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent,
922 name);
925 * Don't use is_safe_path_component(), allow "." and ".." for NFS export
926 * support.
928 if (strchr(name, '/')) {
929 fuse_reply_err(req, EINVAL);
930 return;
933 err = lo_do_lookup(req, parent, name, &e);
934 if (err) {
935 fuse_reply_err(req, err);
936 } else {
937 fuse_reply_entry(req, &e);
/*
 * On some archs, setres*id is limited to 2^16 but they
 * provide setres*id32 variants that allow 2^32.
 * Others just let setres*id do 2^32 anyway.
 *
 * OURSYS_setresgid / OURSYS_setresuid pick the 32 bit variant whenever the
 * platform defines one.
 */
#if defined(SYS_setresgid32)
#define OURSYS_setresgid SYS_setresgid32
#else
#define OURSYS_setresgid SYS_setresgid
#endif

#if defined(SYS_setresuid32)
#define OURSYS_setresuid SYS_setresuid32
#else
#define OURSYS_setresuid SYS_setresuid
#endif
959 * Change to uid/gid of caller so that file is created with
960 * ownership of caller.
961 * TODO: What about selinux context?
963 static int lo_change_cred(fuse_req_t req, struct lo_cred *old)
965 int res;
967 old->euid = geteuid();
968 old->egid = getegid();
970 res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1);
971 if (res == -1) {
972 return errno;
975 res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1);
976 if (res == -1) {
977 int errno_save = errno;
979 syscall(OURSYS_setresgid, -1, old->egid, -1);
980 return errno_save;
983 return 0;
986 /* Regain Privileges */
987 static void lo_restore_cred(struct lo_cred *old)
989 int res;
991 res = syscall(OURSYS_setresuid, -1, old->euid, -1);
992 if (res == -1) {
993 fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid);
994 exit(1);
997 res = syscall(OURSYS_setresgid, -1, old->egid, -1);
998 if (res == -1) {
999 fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid);
1000 exit(1);
1004 static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
1005 const char *name, mode_t mode, dev_t rdev,
1006 const char *link)
1008 int res;
1009 int saverr;
1010 struct lo_data *lo = lo_data(req);
1011 struct lo_inode *dir;
1012 struct fuse_entry_param e;
1013 struct lo_cred old = {};
1015 if (!is_safe_path_component(name)) {
1016 fuse_reply_err(req, EINVAL);
1017 return;
1020 dir = lo_inode(req, parent);
1021 if (!dir) {
1022 fuse_reply_err(req, EBADF);
1023 return;
1026 saverr = lo_change_cred(req, &old);
1027 if (saverr) {
1028 goto out;
1031 res = mknod_wrapper(dir->fd, name, link, mode, rdev);
1033 saverr = errno;
1035 lo_restore_cred(&old);
1037 if (res == -1) {
1038 goto out;
1041 saverr = lo_do_lookup(req, parent, name, &e);
1042 if (saverr) {
1043 goto out;
1046 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1047 name, (unsigned long long)e.ino);
1049 fuse_reply_entry(req, &e);
1050 lo_inode_put(lo, &dir);
1051 return;
1053 out:
1054 lo_inode_put(lo, &dir);
1055 fuse_reply_err(req, saverr);
1058 static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
1059 mode_t mode, dev_t rdev)
1061 lo_mknod_symlink(req, parent, name, mode, rdev, NULL);
1064 static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
1065 mode_t mode)
1067 lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL);
1070 static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent,
1071 const char *name)
1073 lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link);
1076 static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
1077 const char *name)
1079 int res;
1080 struct lo_data *lo = lo_data(req);
1081 struct lo_inode *parent_inode;
1082 struct lo_inode *inode;
1083 struct fuse_entry_param e;
1084 char procname[64];
1085 int saverr;
1087 if (!is_safe_path_component(name)) {
1088 fuse_reply_err(req, EINVAL);
1089 return;
1092 parent_inode = lo_inode(req, parent);
1093 inode = lo_inode(req, ino);
1094 if (!parent_inode || !inode) {
1095 errno = EBADF;
1096 goto out_err;
1099 memset(&e, 0, sizeof(struct fuse_entry_param));
1100 e.attr_timeout = lo->timeout;
1101 e.entry_timeout = lo->timeout;
1103 sprintf(procname, "%i", inode->fd);
1104 res = linkat(lo->proc_self_fd, procname, parent_inode->fd, name,
1105 AT_SYMLINK_FOLLOW);
1106 if (res == -1) {
1107 goto out_err;
1110 res = do_fstatat(inode->fd, "", &e.attr,
1111 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
1112 parent_inode->key.dev, &e.attr_flags);
1113 if (res == -1) {
1114 goto out_err;
1117 if (!lo->announce_submounts) {
1118 e.attr_flags &= ~FUSE_ATTR_SUBMOUNT;
1121 pthread_mutex_lock(&lo->mutex);
1122 inode->nlookup++;
1123 pthread_mutex_unlock(&lo->mutex);
1124 e.ino = inode->fuse_ino;
1126 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1127 name, (unsigned long long)e.ino);
1130 * No need to update inode->parent_dev, because
1131 * (1) We cannot, the inode now has more than one parent,
1132 * (2) Directories cannot have more than one parent, so link()
1133 * does not work for them; but parent_dev only needs to be
1134 * valid for directories.
1137 fuse_reply_entry(req, &e);
1138 lo_inode_put(lo, &parent_inode);
1139 lo_inode_put(lo, &inode);
1140 return;
1142 out_err:
1143 saverr = errno;
1144 lo_inode_put(lo, &parent_inode);
1145 lo_inode_put(lo, &inode);
1146 fuse_reply_err(req, saverr);
1149 /* Increments nlookup and caller must release refcount using lo_inode_put() */
1150 static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
1151 const char *name)
1153 int res;
1154 struct stat attr;
1155 struct lo_data *lo = lo_data(req);
1156 struct lo_inode *dir = lo_inode(req, parent);
1158 if (!dir) {
1159 return NULL;
1162 res = do_fstatat(dir->fd, name, &attr,
1163 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW, dir->key.dev, NULL);
1164 lo_inode_put(lo, &dir);
1165 if (res == -1) {
1166 return NULL;
1169 return lo_find(lo, &attr);
1172 static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
1174 int res;
1175 struct lo_inode *inode;
1176 struct lo_data *lo = lo_data(req);
1178 if (!is_safe_path_component(name)) {
1179 fuse_reply_err(req, EINVAL);
1180 return;
1183 inode = lookup_name(req, parent, name);
1184 if (!inode) {
1185 fuse_reply_err(req, EIO);
1186 return;
1189 res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR);
1191 fuse_reply_err(req, res == -1 ? errno : 0);
1192 unref_inode_lolocked(lo, inode, 1);
1193 lo_inode_put(lo, &inode);
1196 static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
1197 fuse_ino_t newparent, const char *newname,
1198 unsigned int flags)
1200 int res;
1201 struct lo_inode *parent_inode;
1202 struct lo_inode *newparent_inode;
1203 struct lo_inode *oldinode = NULL;
1204 struct lo_inode *newinode = NULL;
1205 struct lo_data *lo = lo_data(req);
1207 if (!is_safe_path_component(name) || !is_safe_path_component(newname)) {
1208 fuse_reply_err(req, EINVAL);
1209 return;
1212 parent_inode = lo_inode(req, parent);
1213 newparent_inode = lo_inode(req, newparent);
1214 if (!parent_inode || !newparent_inode) {
1215 fuse_reply_err(req, EBADF);
1216 goto out;
1219 oldinode = lookup_name(req, parent, name);
1220 newinode = lookup_name(req, newparent, newname);
1222 if (!oldinode) {
1223 fuse_reply_err(req, EIO);
1224 goto out;
1227 if (flags) {
1228 #ifndef SYS_renameat2
1229 fuse_reply_err(req, EINVAL);
1230 #else
1231 res = syscall(SYS_renameat2, parent_inode->fd, name,
1232 newparent_inode->fd, newname, flags);
1233 if (res == -1 && errno == ENOSYS) {
1234 fuse_reply_err(req, EINVAL);
1235 } else {
1236 fuse_reply_err(req, res == -1 ? errno : 0);
1238 #endif
1239 goto out;
1242 res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);
1244 fuse_reply_err(req, res == -1 ? errno : 0);
1245 out:
1246 unref_inode_lolocked(lo, oldinode, 1);
1247 unref_inode_lolocked(lo, newinode, 1);
1248 lo_inode_put(lo, &oldinode);
1249 lo_inode_put(lo, &newinode);
1250 lo_inode_put(lo, &parent_inode);
1251 lo_inode_put(lo, &newparent_inode);
1254 static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
1256 int res;
1257 struct lo_inode *inode;
1258 struct lo_data *lo = lo_data(req);
1260 if (!is_safe_path_component(name)) {
1261 fuse_reply_err(req, EINVAL);
1262 return;
1265 inode = lookup_name(req, parent, name);
1266 if (!inode) {
1267 fuse_reply_err(req, EIO);
1268 return;
1271 res = unlinkat(lo_fd(req, parent), name, 0);
1273 fuse_reply_err(req, res == -1 ? errno : 0);
1274 unref_inode_lolocked(lo, inode, 1);
1275 lo_inode_put(lo, &inode);
1278 /* To be called with lo->mutex held */
1279 static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
1281 if (!inode) {
1282 return;
1285 assert(inode->nlookup >= n);
1286 inode->nlookup -= n;
1287 if (!inode->nlookup) {
1288 lo_map_remove(&lo->ino_map, inode->fuse_ino);
1289 g_hash_table_remove(lo->inodes, &inode->key);
1290 if (g_hash_table_size(inode->posix_locks)) {
1291 fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
1293 g_hash_table_destroy(inode->posix_locks);
1294 pthread_mutex_destroy(&inode->plock_mutex);
1296 /* Drop our refcount from lo_do_lookup() */
1297 lo_inode_put(lo, &inode);
1301 static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
1302 uint64_t n)
1304 if (!inode) {
1305 return;
1308 pthread_mutex_lock(&lo->mutex);
1309 unref_inode(lo, inode, n);
1310 pthread_mutex_unlock(&lo->mutex);
1313 static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1315 struct lo_data *lo = lo_data(req);
1316 struct lo_inode *inode;
1318 inode = lo_inode(req, ino);
1319 if (!inode) {
1320 return;
1323 fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n",
1324 (unsigned long long)ino, (unsigned long long)inode->nlookup,
1325 (unsigned long long)nlookup);
1327 unref_inode_lolocked(lo, inode, nlookup);
1328 lo_inode_put(lo, &inode);
1331 static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1333 lo_forget_one(req, ino, nlookup);
1334 fuse_reply_none(req);
1337 static void lo_forget_multi(fuse_req_t req, size_t count,
1338 struct fuse_forget_data *forgets)
1340 int i;
1342 for (i = 0; i < count; i++) {
1343 lo_forget_one(req, forgets[i].ino, forgets[i].nlookup);
1345 fuse_reply_none(req);
1348 static void lo_readlink(fuse_req_t req, fuse_ino_t ino)
1350 char buf[PATH_MAX + 1];
1351 int res;
1353 res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf));
1354 if (res == -1) {
1355 return (void)fuse_reply_err(req, errno);
1358 if (res == sizeof(buf)) {
1359 return (void)fuse_reply_err(req, ENAMETOOLONG);
1362 buf[res] = '\0';
1364 fuse_reply_readlink(req, buf);
1367 struct lo_dirp {
1368 gint refcount;
1369 DIR *dp;
1370 struct dirent *entry;
1371 off_t offset;
1374 static void lo_dirp_put(struct lo_dirp **dp)
1376 struct lo_dirp *d = *dp;
1378 if (!d) {
1379 return;
1381 *dp = NULL;
1383 if (g_atomic_int_dec_and_test(&d->refcount)) {
1384 closedir(d->dp);
1385 free(d);
1389 /* Call lo_dirp_put() on the return value when no longer needed */
1390 static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi)
1392 struct lo_data *lo = lo_data(req);
1393 struct lo_map_elem *elem;
1395 pthread_mutex_lock(&lo->mutex);
1396 elem = lo_map_get(&lo->dirp_map, fi->fh);
1397 if (elem) {
1398 g_atomic_int_inc(&elem->dirp->refcount);
1400 pthread_mutex_unlock(&lo->mutex);
1401 if (!elem) {
1402 return NULL;
1405 return elem->dirp;
1408 static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
1409 struct fuse_file_info *fi)
1411 int error = ENOMEM;
1412 struct lo_data *lo = lo_data(req);
1413 struct lo_dirp *d;
1414 int fd;
1415 ssize_t fh;
1417 d = calloc(1, sizeof(struct lo_dirp));
1418 if (d == NULL) {
1419 goto out_err;
1422 fd = openat(lo_fd(req, ino), ".", O_RDONLY);
1423 if (fd == -1) {
1424 goto out_errno;
1427 d->dp = fdopendir(fd);
1428 if (d->dp == NULL) {
1429 goto out_errno;
1432 d->offset = 0;
1433 d->entry = NULL;
1435 g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */
1436 pthread_mutex_lock(&lo->mutex);
1437 fh = lo_add_dirp_mapping(req, d);
1438 pthread_mutex_unlock(&lo->mutex);
1439 if (fh == -1) {
1440 goto out_err;
1443 fi->fh = fh;
1444 if (lo->cache == CACHE_ALWAYS) {
1445 fi->cache_readdir = 1;
1447 fuse_reply_open(req, fi);
1448 return;
1450 out_errno:
1451 error = errno;
1452 out_err:
1453 if (d) {
1454 if (d->dp) {
1455 closedir(d->dp);
1456 } else if (fd != -1) {
1457 close(fd);
1459 free(d);
1461 fuse_reply_err(req, error);
1464 static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
1465 off_t offset, struct fuse_file_info *fi, int plus)
1467 struct lo_data *lo = lo_data(req);
1468 struct lo_dirp *d = NULL;
1469 struct lo_inode *dinode;
1470 char *buf = NULL;
1471 char *p;
1472 size_t rem = size;
1473 int err = EBADF;
1475 dinode = lo_inode(req, ino);
1476 if (!dinode) {
1477 goto error;
1480 d = lo_dirp(req, fi);
1481 if (!d) {
1482 goto error;
1485 err = ENOMEM;
1486 buf = calloc(1, size);
1487 if (!buf) {
1488 goto error;
1490 p = buf;
1492 if (offset != d->offset) {
1493 seekdir(d->dp, offset);
1494 d->entry = NULL;
1495 d->offset = offset;
1497 while (1) {
1498 size_t entsize;
1499 off_t nextoff;
1500 const char *name;
1502 if (!d->entry) {
1503 errno = 0;
1504 d->entry = readdir(d->dp);
1505 if (!d->entry) {
1506 if (errno) { /* Error */
1507 err = errno;
1508 goto error;
1509 } else { /* End of stream */
1510 break;
1514 nextoff = d->entry->d_off;
1515 name = d->entry->d_name;
1517 fuse_ino_t entry_ino = 0;
1518 struct fuse_entry_param e = (struct fuse_entry_param){
1519 .attr.st_ino = d->entry->d_ino,
1520 .attr.st_mode = d->entry->d_type << 12,
1523 /* Hide root's parent directory */
1524 if (dinode == &lo->root && strcmp(name, "..") == 0) {
1525 e.attr.st_ino = lo->root.key.ino;
1526 e.attr.st_mode = DT_DIR << 12;
1529 if (plus) {
1530 if (!is_dot_or_dotdot(name)) {
1531 err = lo_do_lookup(req, ino, name, &e);
1532 if (err) {
1533 goto error;
1535 entry_ino = e.ino;
1538 entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff);
1539 } else {
1540 entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff);
1542 if (entsize > rem) {
1543 if (entry_ino != 0) {
1544 lo_forget_one(req, entry_ino, 1);
1546 break;
1549 p += entsize;
1550 rem -= entsize;
1552 d->entry = NULL;
1553 d->offset = nextoff;
1556 err = 0;
1557 error:
1558 lo_dirp_put(&d);
1559 lo_inode_put(lo, &dinode);
1562 * If there's an error, we can only signal it if we haven't stored
1563 * any entries yet - otherwise we'd end up with wrong lookup
1564 * counts for the entries that are already in the buffer. So we
1565 * return what we've collected until that point.
1567 if (err && rem == size) {
1568 fuse_reply_err(req, err);
1569 } else {
1570 fuse_reply_buf(req, buf, size - rem);
1572 free(buf);
1575 static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
1576 off_t offset, struct fuse_file_info *fi)
1578 lo_do_readdir(req, ino, size, offset, fi, 0);
1581 static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size,
1582 off_t offset, struct fuse_file_info *fi)
1584 lo_do_readdir(req, ino, size, offset, fi, 1);
1587 static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
1588 struct fuse_file_info *fi)
1590 struct lo_data *lo = lo_data(req);
1591 struct lo_map_elem *elem;
1592 struct lo_dirp *d;
1594 (void)ino;
1596 pthread_mutex_lock(&lo->mutex);
1597 elem = lo_map_get(&lo->dirp_map, fi->fh);
1598 if (!elem) {
1599 pthread_mutex_unlock(&lo->mutex);
1600 fuse_reply_err(req, EBADF);
1601 return;
1604 d = elem->dirp;
1605 lo_map_remove(&lo->dirp_map, fi->fh);
1606 pthread_mutex_unlock(&lo->mutex);
1608 lo_dirp_put(&d); /* paired with lo_opendir() */
1610 fuse_reply_err(req, 0);
1613 static void update_open_flags(int writeback, int allow_direct_io,
1614 struct fuse_file_info *fi)
1617 * With writeback cache, kernel may send read requests even
1618 * when userspace opened write-only
1620 if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) {
1621 fi->flags &= ~O_ACCMODE;
1622 fi->flags |= O_RDWR;
1626 * With writeback cache, O_APPEND is handled by the kernel.
1627 * This breaks atomicity (since the file may change in the
1628 * underlying filesystem, so that the kernel's idea of the
1629 * end of the file isn't accurate anymore). In this example,
1630 * we just accept that. A more rigorous filesystem may want
1631 * to return an error here
1633 if (writeback && (fi->flags & O_APPEND)) {
1634 fi->flags &= ~O_APPEND;
1638 * O_DIRECT in guest should not necessarily mean bypassing page
1639 * cache on host as well. Therefore, we discard it by default
1640 * ('-o no_allow_direct_io'). If somebody needs that behavior,
1641 * the '-o allow_direct_io' option should be set.
1643 if (!allow_direct_io) {
1644 fi->flags &= ~O_DIRECT;
1648 static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
1649 mode_t mode, struct fuse_file_info *fi)
1651 int fd;
1652 struct lo_data *lo = lo_data(req);
1653 struct lo_inode *parent_inode;
1654 struct fuse_entry_param e;
1655 int err;
1656 struct lo_cred old = {};
1658 fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", parent,
1659 name);
1661 if (!is_safe_path_component(name)) {
1662 fuse_reply_err(req, EINVAL);
1663 return;
1666 parent_inode = lo_inode(req, parent);
1667 if (!parent_inode) {
1668 fuse_reply_err(req, EBADF);
1669 return;
1672 err = lo_change_cred(req, &old);
1673 if (err) {
1674 goto out;
1677 update_open_flags(lo->writeback, lo->allow_direct_io, fi);
1679 fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW,
1680 mode);
1681 err = fd == -1 ? errno : 0;
1682 lo_restore_cred(&old);
1684 if (!err) {
1685 ssize_t fh;
1687 pthread_mutex_lock(&lo->mutex);
1688 fh = lo_add_fd_mapping(req, fd);
1689 pthread_mutex_unlock(&lo->mutex);
1690 if (fh == -1) {
1691 close(fd);
1692 err = ENOMEM;
1693 goto out;
1696 fi->fh = fh;
1697 err = lo_do_lookup(req, parent, name, &e);
1699 if (lo->cache == CACHE_NONE) {
1700 fi->direct_io = 1;
1701 } else if (lo->cache == CACHE_ALWAYS) {
1702 fi->keep_cache = 1;
1705 out:
1706 lo_inode_put(lo, &parent_inode);
1708 if (err) {
1709 fuse_reply_err(req, err);
1710 } else {
1711 fuse_reply_create(req, &e, fi);
1715 /* Should be called with inode->plock_mutex held */
1716 static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
1717 struct lo_inode *inode,
1718 uint64_t lock_owner,
1719 pid_t pid, int *err)
1721 struct lo_inode_plock *plock;
1722 char procname[64];
1723 int fd;
1725 plock =
1726 g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner));
1728 if (plock) {
1729 return plock;
1732 plock = malloc(sizeof(struct lo_inode_plock));
1733 if (!plock) {
1734 *err = ENOMEM;
1735 return NULL;
1738 /* Open another instance of file which can be used for ofd locks. */
1739 sprintf(procname, "%i", inode->fd);
1741 /* TODO: What if file is not writable? */
1742 fd = openat(lo->proc_self_fd, procname, O_RDWR);
1743 if (fd == -1) {
1744 *err = errno;
1745 free(plock);
1746 return NULL;
1749 plock->lock_owner = lock_owner;
1750 plock->fd = fd;
1751 g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner),
1752 plock);
1753 return plock;
1756 static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
1757 struct flock *lock)
1759 struct lo_data *lo = lo_data(req);
1760 struct lo_inode *inode;
1761 struct lo_inode_plock *plock;
1762 int ret, saverr = 0;
1764 fuse_log(FUSE_LOG_DEBUG,
1765 "lo_getlk(ino=%" PRIu64 ", flags=%d)"
1766 " owner=0x%lx, l_type=%d l_start=0x%lx"
1767 " l_len=0x%lx\n",
1768 ino, fi->flags, fi->lock_owner, lock->l_type, lock->l_start,
1769 lock->l_len);
1771 inode = lo_inode(req, ino);
1772 if (!inode) {
1773 fuse_reply_err(req, EBADF);
1774 return;
1777 pthread_mutex_lock(&inode->plock_mutex);
1778 plock =
1779 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
1780 if (!plock) {
1781 saverr = ret;
1782 goto out;
1785 ret = fcntl(plock->fd, F_OFD_GETLK, lock);
1786 if (ret == -1) {
1787 saverr = errno;
1790 out:
1791 pthread_mutex_unlock(&inode->plock_mutex);
1792 lo_inode_put(lo, &inode);
1794 if (saverr) {
1795 fuse_reply_err(req, saverr);
1796 } else {
1797 fuse_reply_lock(req, lock);
1801 static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
1802 struct flock *lock, int sleep)
1804 struct lo_data *lo = lo_data(req);
1805 struct lo_inode *inode;
1806 struct lo_inode_plock *plock;
1807 int ret, saverr = 0;
1809 fuse_log(FUSE_LOG_DEBUG,
1810 "lo_setlk(ino=%" PRIu64 ", flags=%d)"
1811 " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d"
1812 " l_start=0x%lx l_len=0x%lx\n",
1813 ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
1814 lock->l_whence, lock->l_start, lock->l_len);
1816 if (sleep) {
1817 fuse_reply_err(req, EOPNOTSUPP);
1818 return;
1821 inode = lo_inode(req, ino);
1822 if (!inode) {
1823 fuse_reply_err(req, EBADF);
1824 return;
1827 pthread_mutex_lock(&inode->plock_mutex);
1828 plock =
1829 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
1831 if (!plock) {
1832 saverr = ret;
1833 goto out;
1836 /* TODO: Is it alright to modify flock? */
1837 lock->l_pid = 0;
1838 ret = fcntl(plock->fd, F_OFD_SETLK, lock);
1839 if (ret == -1) {
1840 saverr = errno;
1843 out:
1844 pthread_mutex_unlock(&inode->plock_mutex);
1845 lo_inode_put(lo, &inode);
1847 fuse_reply_err(req, saverr);
1850 static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
1851 struct fuse_file_info *fi)
1853 int res;
1854 struct lo_dirp *d;
1855 int fd;
1857 (void)ino;
1859 d = lo_dirp(req, fi);
1860 if (!d) {
1861 fuse_reply_err(req, EBADF);
1862 return;
1865 fd = dirfd(d->dp);
1866 if (datasync) {
1867 res = fdatasync(fd);
1868 } else {
1869 res = fsync(fd);
1872 lo_dirp_put(&d);
1874 fuse_reply_err(req, res == -1 ? errno : 0);
1877 static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
1879 int fd;
1880 ssize_t fh;
1881 char buf[64];
1882 struct lo_data *lo = lo_data(req);
1884 fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino,
1885 fi->flags);
1887 update_open_flags(lo->writeback, lo->allow_direct_io, fi);
1889 sprintf(buf, "%i", lo_fd(req, ino));
1890 fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW);
1891 if (fd == -1) {
1892 return (void)fuse_reply_err(req, errno);
1895 pthread_mutex_lock(&lo->mutex);
1896 fh = lo_add_fd_mapping(req, fd);
1897 pthread_mutex_unlock(&lo->mutex);
1898 if (fh == -1) {
1899 close(fd);
1900 fuse_reply_err(req, ENOMEM);
1901 return;
1904 fi->fh = fh;
1905 if (lo->cache == CACHE_NONE) {
1906 fi->direct_io = 1;
1907 } else if (lo->cache == CACHE_ALWAYS) {
1908 fi->keep_cache = 1;
1910 fuse_reply_open(req, fi);
1913 static void lo_release(fuse_req_t req, fuse_ino_t ino,
1914 struct fuse_file_info *fi)
1916 struct lo_data *lo = lo_data(req);
1917 struct lo_map_elem *elem;
1918 int fd = -1;
1920 (void)ino;
1922 pthread_mutex_lock(&lo->mutex);
1923 elem = lo_map_get(&lo->fd_map, fi->fh);
1924 if (elem) {
1925 fd = elem->fd;
1926 elem = NULL;
1927 lo_map_remove(&lo->fd_map, fi->fh);
1929 pthread_mutex_unlock(&lo->mutex);
1931 close(fd);
1932 fuse_reply_err(req, 0);
1935 static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
1937 int res;
1938 (void)ino;
1939 struct lo_inode *inode;
1941 inode = lo_inode(req, ino);
1942 if (!inode) {
1943 fuse_reply_err(req, EBADF);
1944 return;
1947 /* An fd is going away. Cleanup associated posix locks */
1948 pthread_mutex_lock(&inode->plock_mutex);
1949 g_hash_table_remove(inode->posix_locks, GUINT_TO_POINTER(fi->lock_owner));
1950 pthread_mutex_unlock(&inode->plock_mutex);
1952 res = close(dup(lo_fi_fd(req, fi)));
1953 lo_inode_put(lo_data(req), &inode);
1954 fuse_reply_err(req, res == -1 ? errno : 0);
1957 static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
1958 struct fuse_file_info *fi)
1960 int res;
1961 int fd;
1962 char *buf;
1964 fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino,
1965 (void *)fi);
1967 if (!fi) {
1968 struct lo_data *lo = lo_data(req);
1970 res = asprintf(&buf, "%i", lo_fd(req, ino));
1971 if (res == -1) {
1972 return (void)fuse_reply_err(req, errno);
1975 fd = openat(lo->proc_self_fd, buf, O_RDWR);
1976 free(buf);
1977 if (fd == -1) {
1978 return (void)fuse_reply_err(req, errno);
1980 } else {
1981 fd = lo_fi_fd(req, fi);
1984 if (datasync) {
1985 res = fdatasync(fd);
1986 } else {
1987 res = fsync(fd);
1989 if (!fi) {
1990 close(fd);
1992 fuse_reply_err(req, res == -1 ? errno : 0);
1995 static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset,
1996 struct fuse_file_info *fi)
1998 struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size);
2000 fuse_log(FUSE_LOG_DEBUG,
2001 "lo_read(ino=%" PRIu64 ", size=%zd, "
2002 "off=%lu)\n",
2003 ino, size, (unsigned long)offset);
2005 buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
2006 buf.buf[0].fd = lo_fi_fd(req, fi);
2007 buf.buf[0].pos = offset;
2009 fuse_reply_data(req, &buf);
2012 static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
2013 struct fuse_bufvec *in_buf, off_t off,
2014 struct fuse_file_info *fi)
2016 (void)ino;
2017 ssize_t res;
2018 struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf));
2019 bool cap_fsetid_dropped = false;
2021 out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
2022 out_buf.buf[0].fd = lo_fi_fd(req, fi);
2023 out_buf.buf[0].pos = off;
2025 fuse_log(FUSE_LOG_DEBUG,
2026 "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino,
2027 out_buf.buf[0].size, (unsigned long)off);
2030 * If kill_priv is set, drop CAP_FSETID which should lead to kernel
2031 * clearing setuid/setgid on file.
2033 if (fi->kill_priv) {
2034 res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
2035 if (res != 0) {
2036 fuse_reply_err(req, res);
2037 return;
2041 res = fuse_buf_copy(&out_buf, in_buf);
2042 if (res < 0) {
2043 fuse_reply_err(req, -res);
2044 } else {
2045 fuse_reply_write(req, (size_t)res);
2048 if (cap_fsetid_dropped) {
2049 res = gain_effective_cap("FSETID");
2050 if (res) {
2051 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
2056 static void lo_statfs(fuse_req_t req, fuse_ino_t ino)
2058 int res;
2059 struct statvfs stbuf;
2061 res = fstatvfs(lo_fd(req, ino), &stbuf);
2062 if (res == -1) {
2063 fuse_reply_err(req, errno);
2064 } else {
2065 fuse_reply_statfs(req, &stbuf);
2069 static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
2070 off_t length, struct fuse_file_info *fi)
2072 int err = EOPNOTSUPP;
2073 (void)ino;
2075 #ifdef CONFIG_FALLOCATE
2076 err = fallocate(lo_fi_fd(req, fi), mode, offset, length);
2077 if (err < 0) {
2078 err = errno;
2081 #elif defined(CONFIG_POSIX_FALLOCATE)
2082 if (mode) {
2083 fuse_reply_err(req, EOPNOTSUPP);
2084 return;
2087 err = posix_fallocate(lo_fi_fd(req, fi), offset, length);
2088 #endif
2090 fuse_reply_err(req, err);
2093 static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
2094 int op)
2096 int res;
2097 (void)ino;
2099 res = flock(lo_fi_fd(req, fi), op);
2101 fuse_reply_err(req, res == -1 ? errno : 0);
/* types */

/*
 * Stop processing rules and leave the attribute name unmodified when the
 * rule matches.  An empty key matches every attribute.
 */
#define XATTR_MAP_FLAG_OK      (1 <<  0)

/*
 * The attribute is unwanted:
 * EPERM on write, hidden on read.
 */
#define XATTR_MAP_FLAG_BAD     (1 <<  1)

/*
 * For attributes that start with 'key', prepend 'prepend'.
 * 'key' may be empty, in which case 'prepend' is applied to all attributes.
 * The key is defined from the set/remove point of view and is
 * automatically reversed on read.
 */
#define XATTR_MAP_FLAG_PREFIX  (1 <<  2)

/* scopes */

/* Apply rule to get/set/remove */
#define XATTR_MAP_FLAG_CLIENT  (1 << 16)
/* Apply rule to list */
#define XATTR_MAP_FLAG_SERVER  (1 << 17)
/* Apply rule to all */
#define XATTR_MAP_FLAG_ALL   (XATTR_MAP_FLAG_SERVER | XATTR_MAP_FLAG_CLIENT)
2131 static void add_xattrmap_entry(struct lo_data *lo,
2132 const XattrMapEntry *new_entry)
2134 XattrMapEntry *res = g_realloc_n(lo->xattr_map_list,
2135 lo->xattr_map_nentries + 1,
2136 sizeof(XattrMapEntry));
2137 res[lo->xattr_map_nentries++] = *new_entry;
2139 lo->xattr_map_list = res;
2142 static void free_xattrmap(struct lo_data *lo)
2144 XattrMapEntry *map = lo->xattr_map_list;
2145 size_t i;
2147 if (!map) {
2148 return;
2151 for (i = 0; i < lo->xattr_map_nentries; i++) {
2152 g_free(map[i].key);
2153 g_free(map[i].prepend);
2156 g_free(map);
2157 lo->xattr_map_list = NULL;
2158 lo->xattr_map_nentries = -1;
2162 * Handle the 'map' type, which is sugar for a set of commands
2163 * for the common case of prefixing a subset or everything,
2164 * and allowing anything not prefixed through.
2165 * It must be the last entry in the stream, although there
2166 * can be other entries before it.
2167 * The form is:
2168 * :map:key:prefix:
2170 * key maybe empty in which case all entries are prefixed.
2172 static void parse_xattrmap_map(struct lo_data *lo,
2173 const char *rule, char sep)
2175 const char *tmp;
2176 char *key;
2177 char *prefix;
2178 XattrMapEntry tmp_entry;
2180 if (*rule != sep) {
2181 fuse_log(FUSE_LOG_ERR,
2182 "%s: Expecting '%c' after 'map' keyword, found '%c'\n",
2183 __func__, sep, *rule);
2184 exit(1);
2187 rule++;
2189 /* At start of 'key' field */
2190 tmp = strchr(rule, sep);
2191 if (!tmp) {
2192 fuse_log(FUSE_LOG_ERR,
2193 "%s: Missing '%c' at end of key field in map rule\n",
2194 __func__, sep);
2195 exit(1);
2198 key = g_strndup(rule, tmp - rule);
2199 rule = tmp + 1;
2201 /* At start of prefix field */
2202 tmp = strchr(rule, sep);
2203 if (!tmp) {
2204 fuse_log(FUSE_LOG_ERR,
2205 "%s: Missing '%c' at end of prefix field in map rule\n",
2206 __func__, sep);
2207 exit(1);
2210 prefix = g_strndup(rule, tmp - rule);
2211 rule = tmp + 1;
2214 * This should be the end of the string, we don't allow
2215 * any more commands after 'map'.
2217 if (*rule) {
2218 fuse_log(FUSE_LOG_ERR,
2219 "%s: Expecting end of command after map, found '%c'\n",
2220 __func__, *rule);
2221 exit(1);
2224 /* 1st: Prefix matches/everything */
2225 tmp_entry.flags = XATTR_MAP_FLAG_PREFIX | XATTR_MAP_FLAG_ALL;
2226 tmp_entry.key = g_strdup(key);
2227 tmp_entry.prepend = g_strdup(prefix);
2228 add_xattrmap_entry(lo, &tmp_entry);
2230 if (!*key) {
2231 /* Prefix all case */
2233 /* 2nd: Hide any non-prefixed entries on the host */
2234 tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_ALL;
2235 tmp_entry.key = g_strdup("");
2236 tmp_entry.prepend = g_strdup("");
2237 add_xattrmap_entry(lo, &tmp_entry);
2238 } else {
2239 /* Prefix matching case */
2241 /* 2nd: Hide non-prefixed but matching entries on the host */
2242 tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_SERVER;
2243 tmp_entry.key = g_strdup(""); /* Not used */
2244 tmp_entry.prepend = g_strdup(key);
2245 add_xattrmap_entry(lo, &tmp_entry);
2247 /* 3rd: Stop the client accessing prefixed attributes directly */
2248 tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_CLIENT;
2249 tmp_entry.key = g_strdup(prefix);
2250 tmp_entry.prepend = g_strdup(""); /* Not used */
2251 add_xattrmap_entry(lo, &tmp_entry);
2253 /* 4th: Everything else is OK */
2254 tmp_entry.flags = XATTR_MAP_FLAG_OK | XATTR_MAP_FLAG_ALL;
2255 tmp_entry.key = g_strdup("");
2256 tmp_entry.prepend = g_strdup("");
2257 add_xattrmap_entry(lo, &tmp_entry);
2260 g_free(key);
2261 g_free(prefix);
2264 static void parse_xattrmap(struct lo_data *lo)
2266 const char *map = lo->xattrmap;
2267 const char *tmp;
2269 lo->xattr_map_nentries = 0;
2270 while (*map) {
2271 XattrMapEntry tmp_entry;
2272 char sep;
2274 if (isspace(*map)) {
2275 map++;
2276 continue;
2278 /* The separator is the first non-space of the rule */
2279 sep = *map++;
2280 if (!sep) {
2281 break;
2284 tmp_entry.flags = 0;
2285 /* Start of 'type' */
2286 if (strstart(map, "prefix", &map)) {
2287 tmp_entry.flags |= XATTR_MAP_FLAG_PREFIX;
2288 } else if (strstart(map, "ok", &map)) {
2289 tmp_entry.flags |= XATTR_MAP_FLAG_OK;
2290 } else if (strstart(map, "bad", &map)) {
2291 tmp_entry.flags |= XATTR_MAP_FLAG_BAD;
2292 } else if (strstart(map, "map", &map)) {
2294 * map is sugar that adds a number of rules, and must be
2295 * the last entry.
2297 parse_xattrmap_map(lo, map, sep);
2298 return;
2299 } else {
2300 fuse_log(FUSE_LOG_ERR,
2301 "%s: Unexpected type;"
2302 "Expecting 'prefix', 'ok', 'bad' or 'map' in rule %zu\n",
2303 __func__, lo->xattr_map_nentries);
2304 exit(1);
2307 if (*map++ != sep) {
2308 fuse_log(FUSE_LOG_ERR,
2309 "%s: Missing '%c' at end of type field of rule %zu\n",
2310 __func__, sep, lo->xattr_map_nentries);
2311 exit(1);
2314 /* Start of 'scope' */
2315 if (strstart(map, "client", &map)) {
2316 tmp_entry.flags |= XATTR_MAP_FLAG_CLIENT;
2317 } else if (strstart(map, "server", &map)) {
2318 tmp_entry.flags |= XATTR_MAP_FLAG_SERVER;
2319 } else if (strstart(map, "all", &map)) {
2320 tmp_entry.flags |= XATTR_MAP_FLAG_ALL;
2321 } else {
2322 fuse_log(FUSE_LOG_ERR,
2323 "%s: Unexpected scope;"
2324 " Expecting 'client', 'server', or 'all', in rule %zu\n",
2325 __func__, lo->xattr_map_nentries);
2326 exit(1);
2329 if (*map++ != sep) {
2330 fuse_log(FUSE_LOG_ERR,
2331 "%s: Expecting '%c' found '%c'"
2332 " after scope in rule %zu\n",
2333 __func__, sep, *map, lo->xattr_map_nentries);
2334 exit(1);
2337 /* At start of 'key' field */
2338 tmp = strchr(map, sep);
2339 if (!tmp) {
2340 fuse_log(FUSE_LOG_ERR,
2341 "%s: Missing '%c' at end of key field of rule %zu",
2342 __func__, sep, lo->xattr_map_nentries);
2343 exit(1);
2345 tmp_entry.key = g_strndup(map, tmp - map);
2346 map = tmp + 1;
2348 /* At start of 'prepend' field */
2349 tmp = strchr(map, sep);
2350 if (!tmp) {
2351 fuse_log(FUSE_LOG_ERR,
2352 "%s: Missing '%c' at end of prepend field of rule %zu",
2353 __func__, sep, lo->xattr_map_nentries);
2354 exit(1);
2356 tmp_entry.prepend = g_strndup(map, tmp - map);
2357 map = tmp + 1;
2359 add_xattrmap_entry(lo, &tmp_entry);
2360 /* End of rule - go around again for another rule */
2363 if (!lo->xattr_map_nentries) {
2364 fuse_log(FUSE_LOG_ERR, "Empty xattr map\n");
2365 exit(1);
2370 * For use with getxattr/setxattr/removexattr, where the client
2371 * gives us a name and we may need to choose a different one.
2372 * Allocates a buffer for the result placing it in *out_name.
2373 * If there's no change then *out_name is not set.
2374 * Returns 0 on success
2375 * Can return -EPERM to indicate we block a given attribute
2376 * (in which case out_name is not allocated)
2377 * Can return -ENOMEM to indicate out_name couldn't be allocated.
2379 static int xattr_map_client(const struct lo_data *lo, const char *client_name,
2380 char **out_name)
2382 size_t i;
2383 for (i = 0; i < lo->xattr_map_nentries; i++) {
2384 const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
2386 if ((cur_entry->flags & XATTR_MAP_FLAG_CLIENT) &&
2387 (strstart(client_name, cur_entry->key, NULL))) {
2388 if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
2389 return -EPERM;
2391 if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
2392 /* Unmodified name */
2393 return 0;
2395 if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
2396 *out_name = g_try_malloc(strlen(client_name) +
2397 strlen(cur_entry->prepend) + 1);
2398 if (!*out_name) {
2399 return -ENOMEM;
2401 sprintf(*out_name, "%s%s", cur_entry->prepend, client_name);
2402 return 0;
2407 return -EPERM;
2411 * For use with listxattr where the server fs gives us a name and we may need
2412 * to sanitize this for the client.
2413 * Returns a pointer to the result in *out_name
2414 * This is always the original string or the current string with some prefix
2415 * removed; no reallocation is done.
2416 * Returns 0 on success
2417 * Can return -ENODATA to indicate the name should be dropped from the list.
2419 static int xattr_map_server(const struct lo_data *lo, const char *server_name,
2420 const char **out_name)
2422 size_t i;
2423 const char *end;
2425 for (i = 0; i < lo->xattr_map_nentries; i++) {
2426 const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
2428 if ((cur_entry->flags & XATTR_MAP_FLAG_SERVER) &&
2429 (strstart(server_name, cur_entry->prepend, &end))) {
2430 if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
2431 return -ENODATA;
2433 if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
2434 *out_name = server_name;
2435 return 0;
2437 if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
2438 /* Remove prefix */
2439 *out_name = end;
2440 return 0;
2445 return -ENODATA;
2448 static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
2449 size_t size)
2451 struct lo_data *lo = lo_data(req);
2452 char *value = NULL;
2453 char procname[64];
2454 const char *name;
2455 char *mapped_name;
2456 struct lo_inode *inode;
2457 ssize_t ret;
2458 int saverr;
2459 int fd = -1;
2461 mapped_name = NULL;
2462 name = in_name;
2463 if (lo->xattrmap) {
2464 ret = xattr_map_client(lo, in_name, &mapped_name);
2465 if (ret < 0) {
2466 if (ret == -EPERM) {
2467 ret = -ENODATA;
2469 fuse_reply_err(req, -ret);
2470 return;
2472 if (mapped_name) {
2473 name = mapped_name;
2477 inode = lo_inode(req, ino);
2478 if (!inode) {
2479 fuse_reply_err(req, EBADF);
2480 g_free(mapped_name);
2481 return;
2484 saverr = ENOSYS;
2485 if (!lo_data(req)->xattr) {
2486 goto out;
2489 fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n",
2490 ino, name, size);
2492 if (size) {
2493 value = malloc(size);
2494 if (!value) {
2495 goto out_err;
2499 sprintf(procname, "%i", inode->fd);
2501 * It is not safe to open() non-regular/non-dir files in file server
2502 * unless O_PATH is used, so use that method for regular files/dir
2503 * only (as it seems giving less performance overhead).
2504 * Otherwise, call fchdir() to avoid open().
2506 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2507 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2508 if (fd < 0) {
2509 goto out_err;
2511 ret = fgetxattr(fd, name, value, size);
2512 } else {
2513 /* fchdir should not fail here */
2514 assert(fchdir(lo->proc_self_fd) == 0);
2515 ret = getxattr(procname, name, value, size);
2516 assert(fchdir(lo->root.fd) == 0);
2519 if (ret == -1) {
2520 goto out_err;
2522 if (size) {
2523 saverr = 0;
2524 if (ret == 0) {
2525 goto out;
2527 fuse_reply_buf(req, value, ret);
2528 } else {
2529 fuse_reply_xattr(req, ret);
2531 out_free:
2532 free(value);
2534 if (fd >= 0) {
2535 close(fd);
2538 lo_inode_put(lo, &inode);
2539 return;
2541 out_err:
2542 saverr = errno;
2543 out:
2544 fuse_reply_err(req, saverr);
2545 g_free(mapped_name);
2546 goto out_free;
2549 static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
2551 struct lo_data *lo = lo_data(req);
2552 char *value = NULL;
2553 char procname[64];
2554 struct lo_inode *inode;
2555 ssize_t ret;
2556 int saverr;
2557 int fd = -1;
2559 inode = lo_inode(req, ino);
2560 if (!inode) {
2561 fuse_reply_err(req, EBADF);
2562 return;
2565 saverr = ENOSYS;
2566 if (!lo_data(req)->xattr) {
2567 goto out;
2570 fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino,
2571 size);
2573 if (size) {
2574 value = malloc(size);
2575 if (!value) {
2576 goto out_err;
2580 sprintf(procname, "%i", inode->fd);
2581 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2582 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2583 if (fd < 0) {
2584 goto out_err;
2586 ret = flistxattr(fd, value, size);
2587 } else {
2588 /* fchdir should not fail here */
2589 assert(fchdir(lo->proc_self_fd) == 0);
2590 ret = listxattr(procname, value, size);
2591 assert(fchdir(lo->root.fd) == 0);
2594 if (ret == -1) {
2595 goto out_err;
2597 if (size) {
2598 saverr = 0;
2599 if (ret == 0) {
2600 goto out;
2603 if (lo->xattr_map_list) {
2605 * Map the names back, some attributes might be dropped,
2606 * some shortened, but not increased, so we shouldn't
2607 * run out of room.
2609 size_t out_index, in_index;
2610 out_index = 0;
2611 in_index = 0;
2612 while (in_index < ret) {
2613 const char *map_out;
2614 char *in_ptr = value + in_index;
2615 /* Length of current attribute name */
2616 size_t in_len = strlen(value + in_index) + 1;
2618 int mapret = xattr_map_server(lo, in_ptr, &map_out);
2619 if (mapret != -ENODATA && mapret != 0) {
2620 /* Shouldn't happen */
2621 saverr = -mapret;
2622 goto out;
2624 if (mapret == 0) {
2625 /* Either unchanged, or truncated */
2626 size_t out_len;
2627 if (map_out != in_ptr) {
2628 /* +1 copies the NIL */
2629 out_len = strlen(map_out) + 1;
2630 } else {
2631 /* No change */
2632 out_len = in_len;
2635 * Move result along, may still be needed for an unchanged
2636 * entry if a previous entry was changed.
2638 memmove(value + out_index, map_out, out_len);
2640 out_index += out_len;
2642 in_index += in_len;
2644 ret = out_index;
2645 if (ret == 0) {
2646 goto out;
2649 fuse_reply_buf(req, value, ret);
2650 } else {
2652 * xattrmap only ever shortens the result,
2653 * so we don't need to do anything clever with the
2654 * allocation length here.
2656 fuse_reply_xattr(req, ret);
2658 out_free:
2659 free(value);
2661 if (fd >= 0) {
2662 close(fd);
2665 lo_inode_put(lo, &inode);
2666 return;
2668 out_err:
2669 saverr = errno;
2670 out:
2671 fuse_reply_err(req, saverr);
2672 goto out_free;
2675 static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
2676 const char *value, size_t size, int flags)
2678 char procname[64];
2679 const char *name;
2680 char *mapped_name;
2681 struct lo_data *lo = lo_data(req);
2682 struct lo_inode *inode;
2683 ssize_t ret;
2684 int saverr;
2685 int fd = -1;
2687 mapped_name = NULL;
2688 name = in_name;
2689 if (lo->xattrmap) {
2690 ret = xattr_map_client(lo, in_name, &mapped_name);
2691 if (ret < 0) {
2692 fuse_reply_err(req, -ret);
2693 return;
2695 if (mapped_name) {
2696 name = mapped_name;
2700 inode = lo_inode(req, ino);
2701 if (!inode) {
2702 fuse_reply_err(req, EBADF);
2703 g_free(mapped_name);
2704 return;
2707 saverr = ENOSYS;
2708 if (!lo_data(req)->xattr) {
2709 goto out;
2712 fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64
2713 ", name=%s value=%s size=%zd)\n", ino, name, value, size);
2715 sprintf(procname, "%i", inode->fd);
2716 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2717 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2718 if (fd < 0) {
2719 saverr = errno;
2720 goto out;
2722 ret = fsetxattr(fd, name, value, size, flags);
2723 } else {
2724 /* fchdir should not fail here */
2725 assert(fchdir(lo->proc_self_fd) == 0);
2726 ret = setxattr(procname, name, value, size, flags);
2727 assert(fchdir(lo->root.fd) == 0);
2730 saverr = ret == -1 ? errno : 0;
2732 out:
2733 if (fd >= 0) {
2734 close(fd);
2737 lo_inode_put(lo, &inode);
2738 g_free(mapped_name);
2739 fuse_reply_err(req, saverr);
2742 static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name)
2744 char procname[64];
2745 const char *name;
2746 char *mapped_name;
2747 struct lo_data *lo = lo_data(req);
2748 struct lo_inode *inode;
2749 ssize_t ret;
2750 int saverr;
2751 int fd = -1;
2753 mapped_name = NULL;
2754 name = in_name;
2755 if (lo->xattrmap) {
2756 ret = xattr_map_client(lo, in_name, &mapped_name);
2757 if (ret < 0) {
2758 fuse_reply_err(req, -ret);
2759 return;
2761 if (mapped_name) {
2762 name = mapped_name;
2766 inode = lo_inode(req, ino);
2767 if (!inode) {
2768 fuse_reply_err(req, EBADF);
2769 g_free(mapped_name);
2770 return;
2773 saverr = ENOSYS;
2774 if (!lo_data(req)->xattr) {
2775 goto out;
2778 fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino,
2779 name);
2781 sprintf(procname, "%i", inode->fd);
2782 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2783 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2784 if (fd < 0) {
2785 saverr = errno;
2786 goto out;
2788 ret = fremovexattr(fd, name);
2789 } else {
2790 /* fchdir should not fail here */
2791 assert(fchdir(lo->proc_self_fd) == 0);
2792 ret = removexattr(procname, name);
2793 assert(fchdir(lo->root.fd) == 0);
2796 saverr = ret == -1 ? errno : 0;
2798 out:
2799 if (fd >= 0) {
2800 close(fd);
2803 lo_inode_put(lo, &inode);
2804 g_free(mapped_name);
2805 fuse_reply_err(req, saverr);
#ifdef HAVE_COPY_FILE_RANGE
/*
 * FUSE copy_file_range handler: copy up to @len bytes from the file behind
 * @fi_in at offset @off_in to the file behind @fi_out at offset @off_out,
 * using copy_file_range() on the host descriptors returned by lo_fi_fd().
 *
 * Replies with the number of bytes copied, or with errno when the host call
 * fails.
 */
static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
                               struct fuse_file_info *fi_in, fuse_ino_t ino_out,
                               off_t off_out, struct fuse_file_info *fi_out,
                               size_t len, int flags)
{
    int in_fd, out_fd;
    ssize_t res;

    in_fd = lo_fi_fd(req, fi_in);
    out_fd = lo_fi_fd(req, fi_out);

    /*
     * off_t is a signed type whose width is not necessarily that of long,
     * so it is widened to intmax_t for printing; size_t takes %zu.
     */
    fuse_log(FUSE_LOG_DEBUG,
             "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, "
             "off=%jd, ino=%" PRIu64 "/fd=%d, "
             "off=%jd, size=%zu, flags=0x%x)\n",
             ino_in, in_fd, (intmax_t)off_in, ino_out, out_fd,
             (intmax_t)off_out, len, (unsigned int)flags);

    res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags);
    if (res < 0) {
        fuse_reply_err(req, errno);
    } else {
        fuse_reply_write(req, res);
    }
}
#endif
2835 static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
2836 struct fuse_file_info *fi)
2838 off_t res;
2840 (void)ino;
2841 res = lseek(lo_fi_fd(req, fi), off, whence);
2842 if (res != -1) {
2843 fuse_reply_lseek(req, res);
2844 } else {
2845 fuse_reply_err(req, errno);
2849 static void lo_destroy(void *userdata)
2851 struct lo_data *lo = (struct lo_data *)userdata;
2853 pthread_mutex_lock(&lo->mutex);
2854 while (true) {
2855 GHashTableIter iter;
2856 gpointer key, value;
2858 g_hash_table_iter_init(&iter, lo->inodes);
2859 if (!g_hash_table_iter_next(&iter, &key, &value)) {
2860 break;
2863 struct lo_inode *inode = value;
2864 unref_inode(lo, inode, inode->nlookup);
2866 pthread_mutex_unlock(&lo->mutex);
2869 static struct fuse_lowlevel_ops lo_oper = {
2870 .init = lo_init,
2871 .lookup = lo_lookup,
2872 .mkdir = lo_mkdir,
2873 .mknod = lo_mknod,
2874 .symlink = lo_symlink,
2875 .link = lo_link,
2876 .unlink = lo_unlink,
2877 .rmdir = lo_rmdir,
2878 .rename = lo_rename,
2879 .forget = lo_forget,
2880 .forget_multi = lo_forget_multi,
2881 .getattr = lo_getattr,
2882 .setattr = lo_setattr,
2883 .readlink = lo_readlink,
2884 .opendir = lo_opendir,
2885 .readdir = lo_readdir,
2886 .readdirplus = lo_readdirplus,
2887 .releasedir = lo_releasedir,
2888 .fsyncdir = lo_fsyncdir,
2889 .create = lo_create,
2890 .getlk = lo_getlk,
2891 .setlk = lo_setlk,
2892 .open = lo_open,
2893 .release = lo_release,
2894 .flush = lo_flush,
2895 .fsync = lo_fsync,
2896 .read = lo_read,
2897 .write_buf = lo_write_buf,
2898 .statfs = lo_statfs,
2899 .fallocate = lo_fallocate,
2900 .flock = lo_flock,
2901 .getxattr = lo_getxattr,
2902 .listxattr = lo_listxattr,
2903 .setxattr = lo_setxattr,
2904 .removexattr = lo_removexattr,
2905 #ifdef HAVE_COPY_FILE_RANGE
2906 .copy_file_range = lo_copy_file_range,
2907 #endif
2908 .lseek = lo_lseek,
2909 .destroy = lo_destroy,
/*
 * Write the vhost-user.json backend program capabilities to stdout as a
 * small JSON object announcing backend type "fs".
 */
static void print_capabilities(void)
{
    printf("{\n"
           " \"type\": \"fs\"\n"
           "}\n");
}
2921 * Drop all Linux capabilities because the wait parent process only needs to
2922 * sit in waitpid(2) and terminate.
2924 static void setup_wait_parent_capabilities(void)
2926 capng_setpid(syscall(SYS_gettid));
2927 capng_clear(CAPNG_SELECT_BOTH);
2928 capng_apply(CAPNG_SELECT_BOTH);
2932 * Move to a new mount, net, and pid namespaces to isolate this process.
2934 static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
2936 pid_t child;
2939 * Create a new pid namespace for *child* processes. We'll have to
2940 * fork in order to enter the new pid namespace. A new mount namespace
2941 * is also needed so that we can remount /proc for the new pid
2942 * namespace.
2944 * Our UNIX domain sockets have been created. Now we can move to
2945 * an empty network namespace to prevent TCP/IP and other network
2946 * activity in case this process is compromised.
2948 if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) {
2949 fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
2950 exit(1);
2953 child = fork();
2954 if (child < 0) {
2955 fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
2956 exit(1);
2958 if (child > 0) {
2959 pid_t waited;
2960 int wstatus;
2962 setup_wait_parent_capabilities();
2964 /* The parent waits for the child */
2965 do {
2966 waited = waitpid(child, &wstatus, 0);
2967 } while (waited < 0 && errno == EINTR && !se->exited);
2969 /* We were terminated by a signal, see fuse_signals.c */
2970 if (se->exited) {
2971 exit(0);
2974 if (WIFEXITED(wstatus)) {
2975 exit(WEXITSTATUS(wstatus));
2978 exit(1);
2981 /* Send us SIGTERM when the parent thread terminates, see prctl(2) */
2982 prctl(PR_SET_PDEATHSIG, SIGTERM);
2985 * If the mounts have shared propagation then we want to opt out so our
2986 * mount changes don't affect the parent mount namespace.
2988 if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) {
2989 fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
2990 exit(1);
2993 /* The child must remount /proc to use the new pid namespace */
2994 if (mount("proc", "/proc", "proc",
2995 MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
2996 fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
2997 exit(1);
3001 * We only need /proc/self/fd. Prevent ".." from accessing parent
3002 * directories of /proc/self/fd by bind-mounting it over /proc. Since / was
3003 * previously remounted with MS_REC | MS_SLAVE this mount change only
3004 * affects our process.
3006 if (mount("/proc/self/fd", "/proc", NULL, MS_BIND, NULL) < 0) {
3007 fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, MS_BIND): %m\n");
3008 exit(1);
3011 /* Get the /proc (actually /proc/self/fd, see above) file descriptor */
3012 lo->proc_self_fd = open("/proc", O_PATH);
3013 if (lo->proc_self_fd == -1) {
3014 fuse_log(FUSE_LOG_ERR, "open(/proc, O_PATH): %m\n");
3015 exit(1);
3020 * Capture the capability state, we'll need to restore this for individual
3021 * threads later; see load_capng.
3023 static void setup_capng(void)
3025 /* Note this accesses /proc so has to happen before the sandbox */
3026 if (capng_get_caps_process()) {
3027 fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n");
3028 exit(1);
3030 pthread_mutex_init(&cap.mutex, NULL);
3031 pthread_mutex_lock(&cap.mutex);
3032 cap.saved = capng_save_state();
3033 if (!cap.saved) {
3034 fuse_log(FUSE_LOG_ERR, "capng_save_state\n");
3035 exit(1);
3037 pthread_mutex_unlock(&cap.mutex);
3040 static void cleanup_capng(void)
3042 free(cap.saved);
3043 cap.saved = NULL;
3044 pthread_mutex_destroy(&cap.mutex);
3049 * Make the source directory our root so symlinks cannot escape and no other
3050 * files are accessible. Assumes unshare(CLONE_NEWNS) was already called.
3052 static void setup_mounts(const char *source)
3054 int oldroot;
3055 int newroot;
3057 if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) {
3058 fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
3059 exit(1);
3062 /* This magic is based on lxc's lxc_pivot_root() */
3063 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
3064 if (oldroot < 0) {
3065 fuse_log(FUSE_LOG_ERR, "open(/): %m\n");
3066 exit(1);
3069 newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
3070 if (newroot < 0) {
3071 fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source);
3072 exit(1);
3075 if (fchdir(newroot) < 0) {
3076 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
3077 exit(1);
3080 if (syscall(__NR_pivot_root, ".", ".") < 0) {
3081 fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n");
3082 exit(1);
3085 if (fchdir(oldroot) < 0) {
3086 fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n");
3087 exit(1);
3090 if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) {
3091 fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n");
3092 exit(1);
3095 if (umount2(".", MNT_DETACH) < 0) {
3096 fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n");
3097 exit(1);
3100 if (fchdir(newroot) < 0) {
3101 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
3102 exit(1);
3105 close(newroot);
3106 close(oldroot);
3110 * Only keep whitelisted capabilities that are needed for file system operation
3111 * The (possibly NULL) modcaps_in string passed in is free'd before exit.
3113 static void setup_capabilities(char *modcaps_in)
3115 char *modcaps = modcaps_in;
3116 pthread_mutex_lock(&cap.mutex);
3117 capng_restore_state(&cap.saved);
3120 * Whitelist file system-related capabilities that are needed for a file
3121 * server to act like root. Drop everything else like networking and
3122 * sysadmin capabilities.
3124 * Exclusions:
3125 * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
3126 * and we don't support that.
3127 * 2. CAP_MAC_OVERRIDE is not included because it only seems to be
3128 * used by the Smack LSM. Omit it until there is demand for it.
3130 capng_setpid(syscall(SYS_gettid));
3131 capng_clear(CAPNG_SELECT_BOTH);
3132 if (capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE,
3133 CAP_CHOWN,
3134 CAP_DAC_OVERRIDE,
3135 CAP_FOWNER,
3136 CAP_FSETID,
3137 CAP_SETGID,
3138 CAP_SETUID,
3139 CAP_MKNOD,
3140 CAP_SETFCAP,
3141 -1)) {
3142 fuse_log(FUSE_LOG_ERR, "%s: capng_updatev failed\n", __func__);
3143 exit(1);
3147 * The modcaps option is a colon separated list of caps,
3148 * each preceded by either + or -.
3150 while (modcaps) {
3151 capng_act_t action;
3152 int cap;
3154 char *next = strchr(modcaps, ':');
3155 if (next) {
3156 *next = '\0';
3157 next++;
3160 switch (modcaps[0]) {
3161 case '+':
3162 action = CAPNG_ADD;
3163 break;
3165 case '-':
3166 action = CAPNG_DROP;
3167 break;
3169 default:
3170 fuse_log(FUSE_LOG_ERR,
3171 "%s: Expecting '+'/'-' in modcaps but found '%c'\n",
3172 __func__, modcaps[0]);
3173 exit(1);
3175 cap = capng_name_to_capability(modcaps + 1);
3176 if (cap < 0) {
3177 fuse_log(FUSE_LOG_ERR, "%s: Unknown capability '%s'\n", __func__,
3178 modcaps);
3179 exit(1);
3181 if (capng_update(action, CAPNG_PERMITTED | CAPNG_EFFECTIVE, cap)) {
3182 fuse_log(FUSE_LOG_ERR, "%s: capng_update failed for '%s'\n",
3183 __func__, modcaps);
3184 exit(1);
3187 modcaps = next;
3189 g_free(modcaps_in);
3191 if (capng_apply(CAPNG_SELECT_BOTH)) {
3192 fuse_log(FUSE_LOG_ERR, "%s: capng_apply failed\n", __func__);
3193 exit(1);
3196 cap.saved = capng_save_state();
3197 if (!cap.saved) {
3198 fuse_log(FUSE_LOG_ERR, "%s: capng_save_state failed\n", __func__);
3199 exit(1);
3201 pthread_mutex_unlock(&cap.mutex);
3205 * Use chroot as a weaker sandbox for environments where the process is
3206 * launched without CAP_SYS_ADMIN.
3208 static void setup_chroot(struct lo_data *lo)
3210 lo->proc_self_fd = open("/proc/self/fd", O_PATH);
3211 if (lo->proc_self_fd == -1) {
3212 fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/fd\", O_PATH): %m\n");
3213 exit(1);
3217 * Make the shared directory the file system root so that FUSE_OPEN
3218 * (lo_open()) cannot escape the shared directory by opening a symlink.
3220 * The chroot(2) syscall is later disabled by seccomp and the
3221 * CAP_SYS_CHROOT capability is dropped so that tampering with the chroot
3222 * is not possible.
3224 * However, it's still possible to escape the chroot via lo->proc_self_fd
3225 * but that requires first gaining control of the process.
3227 if (chroot(lo->source) != 0) {
3228 fuse_log(FUSE_LOG_ERR, "chroot(\"%s\"): %m\n", lo->source);
3229 exit(1);
3232 /* Move into the chroot */
3233 if (chdir("/") != 0) {
3234 fuse_log(FUSE_LOG_ERR, "chdir(\"/\"): %m\n");
3235 exit(1);
3240 * Lock down this process to prevent access to other processes or files outside
3241 * source directory. This reduces the impact of arbitrary code execution bugs.
3243 static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
3244 bool enable_syslog)
3246 if (lo->sandbox == SANDBOX_NAMESPACE) {
3247 setup_namespaces(lo, se);
3248 setup_mounts(lo->source);
3249 } else {
3250 setup_chroot(lo);
3253 setup_seccomp(enable_syslog);
3254 setup_capabilities(g_strdup(lo->modcaps));
3257 /* Set the maximum number of open file descriptors */
3258 static void setup_nofile_rlimit(unsigned long rlimit_nofile)
3260 struct rlimit rlim = {
3261 .rlim_cur = rlimit_nofile,
3262 .rlim_max = rlimit_nofile,
3265 if (rlimit_nofile == 0) {
3266 return; /* nothing to do */
3269 if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
3270 /* Ignore SELinux denials */
3271 if (errno == EPERM) {
3272 return;
3275 fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n");
3276 exit(1);
3280 static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
3282 g_autofree char *localfmt = NULL;
3284 if (current_log_level < level) {
3285 return;
3288 if (current_log_level == FUSE_LOG_DEBUG) {
3289 if (!use_syslog) {
3290 localfmt = g_strdup_printf("[%" PRId64 "] [ID: %08ld] %s",
3291 get_clock(), syscall(__NR_gettid), fmt);
3292 } else {
3293 localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid),
3294 fmt);
3296 fmt = localfmt;
3299 if (use_syslog) {
3300 int priority = LOG_ERR;
3301 switch (level) {
3302 case FUSE_LOG_EMERG:
3303 priority = LOG_EMERG;
3304 break;
3305 case FUSE_LOG_ALERT:
3306 priority = LOG_ALERT;
3307 break;
3308 case FUSE_LOG_CRIT:
3309 priority = LOG_CRIT;
3310 break;
3311 case FUSE_LOG_ERR:
3312 priority = LOG_ERR;
3313 break;
3314 case FUSE_LOG_WARNING:
3315 priority = LOG_WARNING;
3316 break;
3317 case FUSE_LOG_NOTICE:
3318 priority = LOG_NOTICE;
3319 break;
3320 case FUSE_LOG_INFO:
3321 priority = LOG_INFO;
3322 break;
3323 case FUSE_LOG_DEBUG:
3324 priority = LOG_DEBUG;
3325 break;
3327 vsyslog(priority, fmt, ap);
3328 } else {
3329 vfprintf(stderr, fmt, ap);
3333 static void setup_root(struct lo_data *lo, struct lo_inode *root)
3335 int fd, res;
3336 struct stat stat;
3338 fd = open("/", O_PATH);
3339 if (fd == -1) {
3340 fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source);
3341 exit(1);
3344 res = fstatat(fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
3345 if (res == -1) {
3346 fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source);
3347 exit(1);
3350 root->filetype = S_IFDIR;
3351 root->fd = fd;
3352 root->key.ino = stat.st_ino;
3353 root->key.dev = stat.st_dev;
3354 root->nlookup = 2;
3355 g_atomic_int_set(&root->refcount, 2);
3358 static guint lo_key_hash(gconstpointer key)
3360 const struct lo_key *lkey = key;
3362 return (guint)lkey->ino + (guint)lkey->dev;
3365 static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
3367 const struct lo_key *la = a;
3368 const struct lo_key *lb = b;
3370 return la->ino == lb->ino && la->dev == lb->dev;
3373 static void fuse_lo_data_cleanup(struct lo_data *lo)
3375 if (lo->inodes) {
3376 g_hash_table_destroy(lo->inodes);
3378 lo_map_destroy(&lo->fd_map);
3379 lo_map_destroy(&lo->dirp_map);
3380 lo_map_destroy(&lo->ino_map);
3382 if (lo->proc_self_fd >= 0) {
3383 close(lo->proc_self_fd);
3386 if (lo->root.fd >= 0) {
3387 close(lo->root.fd);
3390 free(lo->xattrmap);
3391 free_xattrmap(lo);
3392 free(lo->source);
3395 int main(int argc, char *argv[])
3397 struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
3398 struct fuse_session *se;
3399 struct fuse_cmdline_opts opts;
3400 struct lo_data lo = {
3401 .sandbox = SANDBOX_NAMESPACE,
3402 .debug = 0,
3403 .writeback = 0,
3404 .posix_lock = 0,
3405 .allow_direct_io = 0,
3406 .proc_self_fd = -1,
3408 struct lo_map_elem *root_elem;
3409 int ret = -1;
3411 /* Don't mask creation mode, kernel already did that */
3412 umask(0);
3414 qemu_init_exec_dir(argv[0]);
3416 pthread_mutex_init(&lo.mutex, NULL);
3417 lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
3418 lo.root.fd = -1;
3419 lo.root.fuse_ino = FUSE_ROOT_ID;
3420 lo.cache = CACHE_AUTO;
3423 * Set up the ino map like this:
3424 * [0] Reserved (will not be used)
3425 * [1] Root inode
3427 lo_map_init(&lo.ino_map);
3428 lo_map_reserve(&lo.ino_map, 0)->in_use = false;
3429 root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino);
3430 root_elem->inode = &lo.root;
3432 lo_map_init(&lo.dirp_map);
3433 lo_map_init(&lo.fd_map);
3435 if (fuse_parse_cmdline(&args, &opts) != 0) {
3436 goto err_out1;
3438 fuse_set_log_func(log_func);
3439 use_syslog = opts.syslog;
3440 if (use_syslog) {
3441 openlog("virtiofsd", LOG_PID, LOG_DAEMON);
3444 if (opts.show_help) {
3445 printf("usage: %s [options]\n\n", argv[0]);
3446 fuse_cmdline_help();
3447 printf(" -o source=PATH shared directory tree\n");
3448 fuse_lowlevel_help();
3449 ret = 0;
3450 goto err_out1;
3451 } else if (opts.show_version) {
3452 fuse_lowlevel_version();
3453 ret = 0;
3454 goto err_out1;
3455 } else if (opts.print_capabilities) {
3456 print_capabilities();
3457 ret = 0;
3458 goto err_out1;
3461 if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) {
3462 goto err_out1;
3465 if (opts.log_level != 0) {
3466 current_log_level = opts.log_level;
3467 } else {
3468 /* default log level is INFO */
3469 current_log_level = FUSE_LOG_INFO;
3471 lo.debug = opts.debug;
3472 if (lo.debug) {
3473 current_log_level = FUSE_LOG_DEBUG;
3475 if (lo.source) {
3476 struct stat stat;
3477 int res;
3479 res = lstat(lo.source, &stat);
3480 if (res == -1) {
3481 fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n",
3482 lo.source);
3483 exit(1);
3485 if (!S_ISDIR(stat.st_mode)) {
3486 fuse_log(FUSE_LOG_ERR, "source is not a directory\n");
3487 exit(1);
3489 } else {
3490 lo.source = strdup("/");
3493 if (lo.xattrmap) {
3494 parse_xattrmap(&lo);
3497 if (!lo.timeout_set) {
3498 switch (lo.cache) {
3499 case CACHE_NONE:
3500 lo.timeout = 0.0;
3501 break;
3503 case CACHE_AUTO:
3504 lo.timeout = 1.0;
3505 break;
3507 case CACHE_ALWAYS:
3508 lo.timeout = 86400.0;
3509 break;
3511 } else if (lo.timeout < 0) {
3512 fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout);
3513 exit(1);
3516 se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
3517 if (se == NULL) {
3518 goto err_out1;
3521 if (fuse_set_signal_handlers(se) != 0) {
3522 goto err_out2;
3525 if (fuse_session_mount(se) != 0) {
3526 goto err_out3;
3529 fuse_daemonize(opts.foreground);
3531 setup_nofile_rlimit(opts.rlimit_nofile);
3533 /* Must be before sandbox since it wants /proc */
3534 setup_capng();
3536 setup_sandbox(&lo, se, opts.syslog);
3538 setup_root(&lo, &lo.root);
3539 /* Block until ctrl+c or fusermount -u */
3540 ret = virtio_loop(se);
3542 fuse_session_unmount(se);
3543 cleanup_capng();
3544 err_out3:
3545 fuse_remove_signal_handlers(se);
3546 err_out2:
3547 fuse_session_destroy(se);
3548 err_out1:
3549 fuse_opt_free_args(&args);
3551 fuse_lo_data_cleanup(&lo);
3553 return ret ? 1 : 0;