2 * FUSE: Filesystem in Userspace
3 * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
5 * This program can be distributed under the terms of the GNU GPLv2.
6 * See the file COPYING.
11 * This file system mirrors the existing file system hierarchy of the
12 * system, starting at the root file system. This is implemented by
13 * just "passing through" all requests to the corresponding user-space
14 * libc functions. In contrast to passthrough.c and passthrough_fh.c,
15 * this implementation uses the low-level API. Its performance should
16 * be the least bad among the three, but many operations are not
17 * implemented. In particular, it is not possible to remove files (or
18 * directories) because the code necessary to defer actual removal
19 * until the file is not opened anymore would make the example much
22 * When writeback caching is enabled (-o writeback mount option), it
23 * is only possible to write to files for which the mounting user has
24 * read permissions. This is because the writeback cache requires the
25 * kernel to be able to issue read requests for all files (which the
26 * passthrough filesystem cannot satisfy if it can't read the file in
27 * the underlying filesystem).
31 * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
35 * \include passthrough_ll.c
38 #include "qemu/osdep.h"
39 #include "qemu/timer.h"
40 #include "fuse_virtio.h"
42 #include "fuse_lowlevel.h"
43 #include "standard-headers/linux/fuse.h"
58 #include <sys/mount.h>
59 #include <sys/prctl.h>
60 #include <sys/resource.h>
61 #include <sys/syscall.h>
62 #include <sys/types.h>
64 #include <sys/xattr.h>
68 #include "qemu/cutils.h"
69 #include "passthrough_helpers.h"
70 #include "passthrough_seccomp.h"
72 /* Keep track of inode posix locks for each owner. */
73 struct lo_inode_plock
{
75 int fd
; /* fd for OFD locks */
80 struct lo_inode
*inode
;
88 /* Maps FUSE fh or ino values to internal objects */
90 struct lo_map_elem
*elems
;
104 * Atomic reference count for this object. The nlookup field holds a
105 * reference and releases it when nlookup reaches 0.
112 * This counter keeps the inode alive during the FUSE session.
113 * Incremented when the FUSE inode number is sent in a reply
114 * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is
115 * released by requests like FUSE_FORGET, FUSE_RMDIR, FUSE_RENAME, etc.
117 * Note that this value is untrusted because the client can manipulate
118 * it arbitrarily using FUSE_FORGET requests.
120 * Protected by lo->mutex.
125 pthread_mutex_t plock_mutex
;
126 GHashTable
*posix_locks
; /* protected by lo_inode->plock_mutex */
131 * So we can detect crossmount roots
132 * (As such, this only needs to be valid for directories. Note
133 * that files can have multiple parents due to hard links, and so
134 * their parent_dev may fluctuate.)
155 typedef struct xattr_map_entry
{
162 pthread_mutex_t mutex
;
176 int readdirplus_clear
;
177 int announce_submounts
;
179 struct lo_inode root
;
180 GHashTable
*inodes
; /* protected by lo->mutex */
181 struct lo_map ino_map
; /* protected by lo->mutex */
182 struct lo_map dirp_map
; /* protected by lo->mutex */
183 struct lo_map fd_map
; /* protected by lo->mutex */
184 XattrMapEntry
*xattr_map_list
;
185 size_t xattr_map_nentries
;
187 /* An O_PATH file descriptor to /proc/self/fd/ */
191 static const struct fuse_opt lo_opts
[] = {
192 { "sandbox=namespace",
193 offsetof(struct lo_data
, sandbox
),
196 offsetof(struct lo_data
, sandbox
),
198 { "writeback", offsetof(struct lo_data
, writeback
), 1 },
199 { "no_writeback", offsetof(struct lo_data
, writeback
), 0 },
200 { "source=%s", offsetof(struct lo_data
, source
), 0 },
201 { "flock", offsetof(struct lo_data
, flock
), 1 },
202 { "no_flock", offsetof(struct lo_data
, flock
), 0 },
203 { "posix_lock", offsetof(struct lo_data
, posix_lock
), 1 },
204 { "no_posix_lock", offsetof(struct lo_data
, posix_lock
), 0 },
205 { "xattr", offsetof(struct lo_data
, xattr
), 1 },
206 { "no_xattr", offsetof(struct lo_data
, xattr
), 0 },
207 { "xattrmap=%s", offsetof(struct lo_data
, xattrmap
), 0 },
208 { "modcaps=%s", offsetof(struct lo_data
, modcaps
), 0 },
209 { "timeout=%lf", offsetof(struct lo_data
, timeout
), 0 },
210 { "timeout=", offsetof(struct lo_data
, timeout_set
), 1 },
211 { "cache=none", offsetof(struct lo_data
, cache
), CACHE_NONE
},
212 { "cache=auto", offsetof(struct lo_data
, cache
), CACHE_AUTO
},
213 { "cache=always", offsetof(struct lo_data
, cache
), CACHE_ALWAYS
},
214 { "readdirplus", offsetof(struct lo_data
, readdirplus_set
), 1 },
215 { "no_readdirplus", offsetof(struct lo_data
, readdirplus_clear
), 1 },
216 { "announce_submounts", offsetof(struct lo_data
, announce_submounts
), 1 },
217 { "allow_direct_io", offsetof(struct lo_data
, allow_direct_io
), 1 },
218 { "no_allow_direct_io", offsetof(struct lo_data
, allow_direct_io
), 0 },
221 static bool use_syslog
= false;
222 static int current_log_level
;
223 static void unref_inode_lolocked(struct lo_data
*lo
, struct lo_inode
*inode
,
227 pthread_mutex_t mutex
;
230 /* Whether cap-ng state has been loaded in the current thread from the saved copy */
231 static __thread
bool cap_loaded
= 0;
233 static struct lo_inode
*lo_find(struct lo_data
*lo
, struct stat
*st
);
/*
 * Return non-zero when `name` is exactly "." or "..", zero otherwise.
 *
 * `name` must be a NUL-terminated string.  The && / || chain short-circuits,
 * so no byte after the terminator is ever read.
 */
static int is_dot_or_dotdot(const char *name)
{
    return name[0] == '.' &&
           (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'));
}
/*
 * Is `path` a single path component that is not "." or ".."?
 *
 * Returns non-zero if so.  A string containing '/' is more than one
 * component and is rejected; "." and ".." are rejected via
 * is_dot_or_dotdot().
 */
static int is_safe_path_component(const char *path)
{
    if (strchr(path, '/')) {
        return 0;
    }

    return !is_dot_or_dotdot(path);
}
251 static struct lo_data
*lo_data(fuse_req_t req
)
253 return (struct lo_data
*)fuse_req_userdata(req
);
257 * Load capng's state from our saved state if the current thread
258 * hadn't previously been loaded.
259 * returns 0 on success
261 static int load_capng(void)
264 pthread_mutex_lock(&cap
.mutex
);
265 capng_restore_state(&cap
.saved
);
267 * restore_state frees the saved copy
270 cap
.saved
= capng_save_state();
272 pthread_mutex_unlock(&cap
.mutex
);
273 fuse_log(FUSE_LOG_ERR
, "capng_save_state (thread)\n");
276 pthread_mutex_unlock(&cap
.mutex
);
279 * We want to use the loaded state for our pid,
282 capng_setpid(syscall(SYS_gettid
));
289 * Helpers for dropping and regaining effective capabilities. Returns 0
290 * on success, error otherwise
292 static int drop_effective_cap(const char *cap_name
, bool *cap_dropped
)
296 cap
= capng_name_to_capability(cap_name
);
299 fuse_log(FUSE_LOG_ERR
, "capng_name_to_capability(%s) failed:%s\n",
300 cap_name
, strerror(errno
));
306 fuse_log(FUSE_LOG_ERR
, "load_capng() failed\n");
310 /* The capability is already absent from the effective set. */
311 if (!capng_have_capability(CAPNG_EFFECTIVE
, cap
)) {
316 if (capng_update(CAPNG_DROP
, CAPNG_EFFECTIVE
, cap
)) {
318 fuse_log(FUSE_LOG_ERR
, "capng_update(DROP,) failed\n");
322 if (capng_apply(CAPNG_SELECT_CAPS
)) {
324 fuse_log(FUSE_LOG_ERR
, "drop:capng_apply() failed\n");
337 static int gain_effective_cap(const char *cap_name
)
342 cap
= capng_name_to_capability(cap_name
);
345 fuse_log(FUSE_LOG_ERR
, "capng_name_to_capability(%s) failed:%s\n",
346 cap_name
, strerror(errno
));
352 fuse_log(FUSE_LOG_ERR
, "load_capng() failed\n");
356 if (capng_update(CAPNG_ADD
, CAPNG_EFFECTIVE
, cap
)) {
358 fuse_log(FUSE_LOG_ERR
, "capng_update(ADD,) failed\n");
362 if (capng_apply(CAPNG_SELECT_CAPS
)) {
364 fuse_log(FUSE_LOG_ERR
, "gain:capng_apply() failed\n");
373 static void lo_map_init(struct lo_map
*map
)
380 static void lo_map_destroy(struct lo_map
*map
)
385 static int lo_map_grow(struct lo_map
*map
, size_t new_nelems
)
387 struct lo_map_elem
*new_elems
;
390 if (new_nelems
<= map
->nelems
) {
394 new_elems
= realloc(map
->elems
, sizeof(map
->elems
[0]) * new_nelems
);
399 for (i
= map
->nelems
; i
< new_nelems
; i
++) {
400 new_elems
[i
].freelist
= i
+ 1;
401 new_elems
[i
].in_use
= false;
403 new_elems
[new_nelems
- 1].freelist
= -1;
405 map
->elems
= new_elems
;
406 map
->freelist
= map
->nelems
;
407 map
->nelems
= new_nelems
;
411 static struct lo_map_elem
*lo_map_alloc_elem(struct lo_map
*map
)
413 struct lo_map_elem
*elem
;
415 if (map
->freelist
== -1 && !lo_map_grow(map
, map
->nelems
+ 256)) {
419 elem
= &map
->elems
[map
->freelist
];
420 map
->freelist
= elem
->freelist
;
427 static struct lo_map_elem
*lo_map_reserve(struct lo_map
*map
, size_t key
)
431 if (!lo_map_grow(map
, key
+ 1)) {
435 for (prev
= &map
->freelist
; *prev
!= -1;
436 prev
= &map
->elems
[*prev
].freelist
) {
438 struct lo_map_elem
*elem
= &map
->elems
[key
];
440 *prev
= elem
->freelist
;
448 static struct lo_map_elem
*lo_map_get(struct lo_map
*map
, size_t key
)
450 if (key
>= map
->nelems
) {
453 if (!map
->elems
[key
].in_use
) {
456 return &map
->elems
[key
];
459 static void lo_map_remove(struct lo_map
*map
, size_t key
)
461 struct lo_map_elem
*elem
;
463 if (key
>= map
->nelems
) {
467 elem
= &map
->elems
[key
];
472 elem
->in_use
= false;
474 elem
->freelist
= map
->freelist
;
478 /* Assumes lo->mutex is held */
479 static ssize_t
lo_add_fd_mapping(fuse_req_t req
, int fd
)
481 struct lo_map_elem
*elem
;
483 elem
= lo_map_alloc_elem(&lo_data(req
)->fd_map
);
489 return elem
- lo_data(req
)->fd_map
.elems
;
492 /* Assumes lo->mutex is held */
493 static ssize_t
lo_add_dirp_mapping(fuse_req_t req
, struct lo_dirp
*dirp
)
495 struct lo_map_elem
*elem
;
497 elem
= lo_map_alloc_elem(&lo_data(req
)->dirp_map
);
503 return elem
- lo_data(req
)->dirp_map
.elems
;
506 /* Assumes lo->mutex is held */
507 static ssize_t
lo_add_inode_mapping(fuse_req_t req
, struct lo_inode
*inode
)
509 struct lo_map_elem
*elem
;
511 elem
= lo_map_alloc_elem(&lo_data(req
)->ino_map
);
517 return elem
- lo_data(req
)->ino_map
.elems
;
520 static void lo_inode_put(struct lo_data
*lo
, struct lo_inode
**inodep
)
522 struct lo_inode
*inode
= *inodep
;
530 if (g_atomic_int_dec_and_test(&inode
->refcount
)) {
536 /* Caller must release refcount using lo_inode_put() */
537 static struct lo_inode
*lo_inode(fuse_req_t req
, fuse_ino_t ino
)
539 struct lo_data
*lo
= lo_data(req
);
540 struct lo_map_elem
*elem
;
542 pthread_mutex_lock(&lo
->mutex
);
543 elem
= lo_map_get(&lo
->ino_map
, ino
);
545 g_atomic_int_inc(&elem
->inode
->refcount
);
547 pthread_mutex_unlock(&lo
->mutex
);
557 * TODO Remove this helper and force callers to hold an inode refcount until
558 * they are done with the fd. This will be done in a later patch to make
561 static int lo_fd(fuse_req_t req
, fuse_ino_t ino
)
563 struct lo_inode
*inode
= lo_inode(req
, ino
);
571 lo_inode_put(lo_data(req
), &inode
);
575 static void lo_init(void *userdata
, struct fuse_conn_info
*conn
)
577 struct lo_data
*lo
= (struct lo_data
*)userdata
;
579 if (conn
->capable
& FUSE_CAP_EXPORT_SUPPORT
) {
580 conn
->want
|= FUSE_CAP_EXPORT_SUPPORT
;
583 if (lo
->writeback
&& conn
->capable
& FUSE_CAP_WRITEBACK_CACHE
) {
584 fuse_log(FUSE_LOG_DEBUG
, "lo_init: activating writeback\n");
585 conn
->want
|= FUSE_CAP_WRITEBACK_CACHE
;
587 if (conn
->capable
& FUSE_CAP_FLOCK_LOCKS
) {
589 fuse_log(FUSE_LOG_DEBUG
, "lo_init: activating flock locks\n");
590 conn
->want
|= FUSE_CAP_FLOCK_LOCKS
;
592 fuse_log(FUSE_LOG_DEBUG
, "lo_init: disabling flock locks\n");
593 conn
->want
&= ~FUSE_CAP_FLOCK_LOCKS
;
597 if (conn
->capable
& FUSE_CAP_POSIX_LOCKS
) {
598 if (lo
->posix_lock
) {
599 fuse_log(FUSE_LOG_DEBUG
, "lo_init: activating posix locks\n");
600 conn
->want
|= FUSE_CAP_POSIX_LOCKS
;
602 fuse_log(FUSE_LOG_DEBUG
, "lo_init: disabling posix locks\n");
603 conn
->want
&= ~FUSE_CAP_POSIX_LOCKS
;
607 if ((lo
->cache
== CACHE_NONE
&& !lo
->readdirplus_set
) ||
608 lo
->readdirplus_clear
) {
609 fuse_log(FUSE_LOG_DEBUG
, "lo_init: disabling readdirplus\n");
610 conn
->want
&= ~FUSE_CAP_READDIRPLUS
;
615 * Call fstatat() and set st_rdev whenever a directory's st_dev
616 * differs from the rparent's st_dev (@parent_dev). This will
617 * announce submounts to the FUSE client (unless @announce_submounts
620 static int do_fstatat(int dirfd
, const char *pathname
, struct stat
*statbuf
,
621 int flags
, dev_t parent_dev
, uint32_t *fuse_attr_flags
)
623 int res
= fstatat(dirfd
, pathname
, statbuf
, flags
);
628 if (statbuf
->st_dev
!= parent_dev
&& S_ISDIR(statbuf
->st_mode
) &&
631 *fuse_attr_flags
|= FUSE_ATTR_SUBMOUNT
;
637 static void lo_getattr(fuse_req_t req
, fuse_ino_t ino
,
638 struct fuse_file_info
*fi
)
642 struct lo_data
*lo
= lo_data(req
);
643 struct lo_inode
*inode
= lo_inode(req
, ino
);
644 uint32_t fuse_attr_flags
= 0;
648 res
= do_fstatat(inode
->fd
, "", &buf
, AT_EMPTY_PATH
| AT_SYMLINK_NOFOLLOW
,
649 inode
->parent_dev
, &fuse_attr_flags
);
650 lo_inode_put(lo
, &inode
);
652 return (void)fuse_reply_err(req
, errno
);
655 if (!lo
->announce_submounts
) {
656 fuse_attr_flags
&= ~FUSE_ATTR_SUBMOUNT
;
659 fuse_reply_attr_with_flags(req
, &buf
, lo
->timeout
, fuse_attr_flags
);
662 static int lo_fi_fd(fuse_req_t req
, struct fuse_file_info
*fi
)
664 struct lo_data
*lo
= lo_data(req
);
665 struct lo_map_elem
*elem
;
667 pthread_mutex_lock(&lo
->mutex
);
668 elem
= lo_map_get(&lo
->fd_map
, fi
->fh
);
669 pthread_mutex_unlock(&lo
->mutex
);
678 static void lo_setattr(fuse_req_t req
, fuse_ino_t ino
, struct stat
*attr
,
679 int valid
, struct fuse_file_info
*fi
)
683 struct lo_data
*lo
= lo_data(req
);
684 struct lo_inode
*inode
;
689 inode
= lo_inode(req
, ino
);
691 fuse_reply_err(req
, EBADF
);
697 /* If fi->fh is invalid we'll report EBADF later */
699 fd
= lo_fi_fd(req
, fi
);
702 if (valid
& FUSE_SET_ATTR_MODE
) {
704 res
= fchmod(fd
, attr
->st_mode
);
706 sprintf(procname
, "%i", ifd
);
707 res
= fchmodat(lo
->proc_self_fd
, procname
, attr
->st_mode
, 0);
713 if (valid
& (FUSE_SET_ATTR_UID
| FUSE_SET_ATTR_GID
)) {
714 uid_t uid
= (valid
& FUSE_SET_ATTR_UID
) ? attr
->st_uid
: (uid_t
)-1;
715 gid_t gid
= (valid
& FUSE_SET_ATTR_GID
) ? attr
->st_gid
: (gid_t
)-1;
717 res
= fchownat(ifd
, "", uid
, gid
, AT_EMPTY_PATH
| AT_SYMLINK_NOFOLLOW
);
722 if (valid
& FUSE_SET_ATTR_SIZE
) {
728 sprintf(procname
, "%i", ifd
);
729 truncfd
= openat(lo
->proc_self_fd
, procname
, O_RDWR
);
735 res
= ftruncate(truncfd
, attr
->st_size
);
745 if (valid
& (FUSE_SET_ATTR_ATIME
| FUSE_SET_ATTR_MTIME
)) {
746 struct timespec tv
[2];
750 tv
[0].tv_nsec
= UTIME_OMIT
;
751 tv
[1].tv_nsec
= UTIME_OMIT
;
753 if (valid
& FUSE_SET_ATTR_ATIME_NOW
) {
754 tv
[0].tv_nsec
= UTIME_NOW
;
755 } else if (valid
& FUSE_SET_ATTR_ATIME
) {
756 tv
[0] = attr
->st_atim
;
759 if (valid
& FUSE_SET_ATTR_MTIME_NOW
) {
760 tv
[1].tv_nsec
= UTIME_NOW
;
761 } else if (valid
& FUSE_SET_ATTR_MTIME
) {
762 tv
[1] = attr
->st_mtim
;
766 res
= futimens(fd
, tv
);
768 sprintf(procname
, "%i", inode
->fd
);
769 res
= utimensat(lo
->proc_self_fd
, procname
, tv
, 0);
775 lo_inode_put(lo
, &inode
);
777 return lo_getattr(req
, ino
, fi
);
781 lo_inode_put(lo
, &inode
);
782 fuse_reply_err(req
, saverr
);
785 static struct lo_inode
*lo_find(struct lo_data
*lo
, struct stat
*st
)
788 struct lo_key key
= {
793 pthread_mutex_lock(&lo
->mutex
);
794 p
= g_hash_table_lookup(lo
->inodes
, &key
);
796 assert(p
->nlookup
> 0);
798 g_atomic_int_inc(&p
->refcount
);
800 pthread_mutex_unlock(&lo
->mutex
);
805 /* value_destroy_func for posix_locks GHashTable */
806 static void posix_locks_value_destroy(gpointer data
)
808 struct lo_inode_plock
*plock
= data
;
811 * We had used open() for locks and had only one fd. So
812 * closing this fd should release all OFD locks.
819 * Increments nlookup and caller must release refcount using
820 * lo_inode_put(&parent).
822 static int lo_do_lookup(fuse_req_t req
, fuse_ino_t parent
, const char *name
,
823 struct fuse_entry_param
*e
)
828 struct lo_data
*lo
= lo_data(req
);
829 struct lo_inode
*inode
= NULL
;
830 struct lo_inode
*dir
= lo_inode(req
, parent
);
833 * name_to_handle_at() and open_by_handle_at() can reach here with fuse
834 * mount point in guest, but we don't have its inode info in the
841 memset(e
, 0, sizeof(*e
));
842 e
->attr_timeout
= lo
->timeout
;
843 e
->entry_timeout
= lo
->timeout
;
845 /* Do not allow escaping root directory */
846 if (dir
== &lo
->root
&& strcmp(name
, "..") == 0) {
850 newfd
= openat(dir
->fd
, name
, O_PATH
| O_NOFOLLOW
);
855 res
= do_fstatat(newfd
, "", &e
->attr
, AT_EMPTY_PATH
| AT_SYMLINK_NOFOLLOW
,
856 dir
->key
.dev
, &e
->attr_flags
);
861 if (!lo
->announce_submounts
) {
862 e
->attr_flags
&= ~FUSE_ATTR_SUBMOUNT
;
865 inode
= lo_find(lo
, &e
->attr
);
869 inode
= calloc(1, sizeof(struct lo_inode
));
874 /* cache only filetype */
875 inode
->filetype
= (e
->attr
.st_mode
& S_IFMT
);
878 * One for the caller and one for nlookup (released in
879 * unref_inode_lolocked())
881 g_atomic_int_set(&inode
->refcount
, 2);
885 inode
->key
.ino
= e
->attr
.st_ino
;
886 inode
->key
.dev
= e
->attr
.st_dev
;
887 pthread_mutex_init(&inode
->plock_mutex
, NULL
);
888 inode
->posix_locks
= g_hash_table_new_full(
889 g_direct_hash
, g_direct_equal
, NULL
, posix_locks_value_destroy
);
891 pthread_mutex_lock(&lo
->mutex
);
892 inode
->fuse_ino
= lo_add_inode_mapping(req
, inode
);
893 g_hash_table_insert(lo
->inodes
, &inode
->key
, inode
);
894 pthread_mutex_unlock(&lo
->mutex
);
896 inode
->parent_dev
= dir
->key
.dev
;
897 e
->ino
= inode
->fuse_ino
;
898 lo_inode_put(lo
, &inode
);
899 lo_inode_put(lo
, &dir
);
901 fuse_log(FUSE_LOG_DEBUG
, " %lli/%s -> %lli\n", (unsigned long long)parent
,
902 name
, (unsigned long long)e
->ino
);
911 lo_inode_put(lo
, &inode
);
912 lo_inode_put(lo
, &dir
);
916 static void lo_lookup(fuse_req_t req
, fuse_ino_t parent
, const char *name
)
918 struct fuse_entry_param e
;
921 fuse_log(FUSE_LOG_DEBUG
, "lo_lookup(parent=%" PRIu64
", name=%s)\n", parent
,
925 * Don't use is_safe_path_component(), allow "." and ".." for NFS export
928 if (strchr(name
, '/')) {
929 fuse_reply_err(req
, EINVAL
);
933 err
= lo_do_lookup(req
, parent
, name
, &e
);
935 fuse_reply_err(req
, err
);
937 fuse_reply_entry(req
, &e
);
/*
 * On some archs, setres*id is limited to 2^16 but they
 * provide setres*id32 variants that allow 2^32.
 * Others just let setres*id do 2^32 anyway.
 *
 * OURSYS_setresgid / OURSYS_setresuid select the *32 syscall number when
 * the platform headers define one, and the plain number otherwise.
 */
#ifdef SYS_setresgid32
#define OURSYS_setresgid SYS_setresgid32
#else
#define OURSYS_setresgid SYS_setresgid
#endif

#ifdef SYS_setresuid32
#define OURSYS_setresuid SYS_setresuid32
#else
#define OURSYS_setresuid SYS_setresuid
#endif
959 * Change to uid/gid of caller so that file is created with
960 * ownership of caller.
961 * TODO: What about selinux context?
963 static int lo_change_cred(fuse_req_t req
, struct lo_cred
*old
)
967 old
->euid
= geteuid();
968 old
->egid
= getegid();
970 res
= syscall(OURSYS_setresgid
, -1, fuse_req_ctx(req
)->gid
, -1);
975 res
= syscall(OURSYS_setresuid
, -1, fuse_req_ctx(req
)->uid
, -1);
977 int errno_save
= errno
;
979 syscall(OURSYS_setresgid
, -1, old
->egid
, -1);
986 /* Regain Privileges */
987 static void lo_restore_cred(struct lo_cred
*old
)
991 res
= syscall(OURSYS_setresuid
, -1, old
->euid
, -1);
993 fuse_log(FUSE_LOG_ERR
, "seteuid(%u): %m\n", old
->euid
);
997 res
= syscall(OURSYS_setresgid
, -1, old
->egid
, -1);
999 fuse_log(FUSE_LOG_ERR
, "setegid(%u): %m\n", old
->egid
);
1004 static void lo_mknod_symlink(fuse_req_t req
, fuse_ino_t parent
,
1005 const char *name
, mode_t mode
, dev_t rdev
,
1010 struct lo_data
*lo
= lo_data(req
);
1011 struct lo_inode
*dir
;
1012 struct fuse_entry_param e
;
1013 struct lo_cred old
= {};
1015 if (!is_safe_path_component(name
)) {
1016 fuse_reply_err(req
, EINVAL
);
1020 dir
= lo_inode(req
, parent
);
1022 fuse_reply_err(req
, EBADF
);
1026 saverr
= lo_change_cred(req
, &old
);
1031 res
= mknod_wrapper(dir
->fd
, name
, link
, mode
, rdev
);
1035 lo_restore_cred(&old
);
1041 saverr
= lo_do_lookup(req
, parent
, name
, &e
);
1046 fuse_log(FUSE_LOG_DEBUG
, " %lli/%s -> %lli\n", (unsigned long long)parent
,
1047 name
, (unsigned long long)e
.ino
);
1049 fuse_reply_entry(req
, &e
);
1050 lo_inode_put(lo
, &dir
);
1054 lo_inode_put(lo
, &dir
);
1055 fuse_reply_err(req
, saverr
);
1058 static void lo_mknod(fuse_req_t req
, fuse_ino_t parent
, const char *name
,
1059 mode_t mode
, dev_t rdev
)
1061 lo_mknod_symlink(req
, parent
, name
, mode
, rdev
, NULL
);
1064 static void lo_mkdir(fuse_req_t req
, fuse_ino_t parent
, const char *name
,
1067 lo_mknod_symlink(req
, parent
, name
, S_IFDIR
| mode
, 0, NULL
);
1070 static void lo_symlink(fuse_req_t req
, const char *link
, fuse_ino_t parent
,
1073 lo_mknod_symlink(req
, parent
, name
, S_IFLNK
, 0, link
);
1076 static void lo_link(fuse_req_t req
, fuse_ino_t ino
, fuse_ino_t parent
,
1080 struct lo_data
*lo
= lo_data(req
);
1081 struct lo_inode
*parent_inode
;
1082 struct lo_inode
*inode
;
1083 struct fuse_entry_param e
;
1087 if (!is_safe_path_component(name
)) {
1088 fuse_reply_err(req
, EINVAL
);
1092 parent_inode
= lo_inode(req
, parent
);
1093 inode
= lo_inode(req
, ino
);
1094 if (!parent_inode
|| !inode
) {
1099 memset(&e
, 0, sizeof(struct fuse_entry_param
));
1100 e
.attr_timeout
= lo
->timeout
;
1101 e
.entry_timeout
= lo
->timeout
;
1103 sprintf(procname
, "%i", inode
->fd
);
1104 res
= linkat(lo
->proc_self_fd
, procname
, parent_inode
->fd
, name
,
1110 res
= do_fstatat(inode
->fd
, "", &e
.attr
,
1111 AT_EMPTY_PATH
| AT_SYMLINK_NOFOLLOW
,
1112 parent_inode
->key
.dev
, &e
.attr_flags
);
1117 if (!lo
->announce_submounts
) {
1118 e
.attr_flags
&= ~FUSE_ATTR_SUBMOUNT
;
1121 pthread_mutex_lock(&lo
->mutex
);
1123 pthread_mutex_unlock(&lo
->mutex
);
1124 e
.ino
= inode
->fuse_ino
;
1126 fuse_log(FUSE_LOG_DEBUG
, " %lli/%s -> %lli\n", (unsigned long long)parent
,
1127 name
, (unsigned long long)e
.ino
);
1130 * No need to update inode->parent_dev, because
1131 * (1) We cannot, the inode now has more than one parent,
1132 * (2) Directories cannot have more than one parent, so link()
1133 * does not work for them; but parent_dev only needs to be
1134 * valid for directories.
1137 fuse_reply_entry(req
, &e
);
1138 lo_inode_put(lo
, &parent_inode
);
1139 lo_inode_put(lo
, &inode
);
1144 lo_inode_put(lo
, &parent_inode
);
1145 lo_inode_put(lo
, &inode
);
1146 fuse_reply_err(req
, saverr
);
1149 /* Increments nlookup and caller must release refcount using lo_inode_put() */
1150 static struct lo_inode
*lookup_name(fuse_req_t req
, fuse_ino_t parent
,
1155 struct lo_data
*lo
= lo_data(req
);
1156 struct lo_inode
*dir
= lo_inode(req
, parent
);
1162 res
= do_fstatat(dir
->fd
, name
, &attr
,
1163 AT_EMPTY_PATH
| AT_SYMLINK_NOFOLLOW
, dir
->key
.dev
, NULL
);
1164 lo_inode_put(lo
, &dir
);
1169 return lo_find(lo
, &attr
);
1172 static void lo_rmdir(fuse_req_t req
, fuse_ino_t parent
, const char *name
)
1175 struct lo_inode
*inode
;
1176 struct lo_data
*lo
= lo_data(req
);
1178 if (!is_safe_path_component(name
)) {
1179 fuse_reply_err(req
, EINVAL
);
1183 inode
= lookup_name(req
, parent
, name
);
1185 fuse_reply_err(req
, EIO
);
1189 res
= unlinkat(lo_fd(req
, parent
), name
, AT_REMOVEDIR
);
1191 fuse_reply_err(req
, res
== -1 ? errno
: 0);
1192 unref_inode_lolocked(lo
, inode
, 1);
1193 lo_inode_put(lo
, &inode
);
1196 static void lo_rename(fuse_req_t req
, fuse_ino_t parent
, const char *name
,
1197 fuse_ino_t newparent
, const char *newname
,
1201 struct lo_inode
*parent_inode
;
1202 struct lo_inode
*newparent_inode
;
1203 struct lo_inode
*oldinode
= NULL
;
1204 struct lo_inode
*newinode
= NULL
;
1205 struct lo_data
*lo
= lo_data(req
);
1207 if (!is_safe_path_component(name
) || !is_safe_path_component(newname
)) {
1208 fuse_reply_err(req
, EINVAL
);
1212 parent_inode
= lo_inode(req
, parent
);
1213 newparent_inode
= lo_inode(req
, newparent
);
1214 if (!parent_inode
|| !newparent_inode
) {
1215 fuse_reply_err(req
, EBADF
);
1219 oldinode
= lookup_name(req
, parent
, name
);
1220 newinode
= lookup_name(req
, newparent
, newname
);
1223 fuse_reply_err(req
, EIO
);
1228 #ifndef SYS_renameat2
1229 fuse_reply_err(req
, EINVAL
);
1231 res
= syscall(SYS_renameat2
, parent_inode
->fd
, name
,
1232 newparent_inode
->fd
, newname
, flags
);
1233 if (res
== -1 && errno
== ENOSYS
) {
1234 fuse_reply_err(req
, EINVAL
);
1236 fuse_reply_err(req
, res
== -1 ? errno
: 0);
1242 res
= renameat(parent_inode
->fd
, name
, newparent_inode
->fd
, newname
);
1244 fuse_reply_err(req
, res
== -1 ? errno
: 0);
1246 unref_inode_lolocked(lo
, oldinode
, 1);
1247 unref_inode_lolocked(lo
, newinode
, 1);
1248 lo_inode_put(lo
, &oldinode
);
1249 lo_inode_put(lo
, &newinode
);
1250 lo_inode_put(lo
, &parent_inode
);
1251 lo_inode_put(lo
, &newparent_inode
);
1254 static void lo_unlink(fuse_req_t req
, fuse_ino_t parent
, const char *name
)
1257 struct lo_inode
*inode
;
1258 struct lo_data
*lo
= lo_data(req
);
1260 if (!is_safe_path_component(name
)) {
1261 fuse_reply_err(req
, EINVAL
);
1265 inode
= lookup_name(req
, parent
, name
);
1267 fuse_reply_err(req
, EIO
);
1271 res
= unlinkat(lo_fd(req
, parent
), name
, 0);
1273 fuse_reply_err(req
, res
== -1 ? errno
: 0);
1274 unref_inode_lolocked(lo
, inode
, 1);
1275 lo_inode_put(lo
, &inode
);
1278 /* To be called with lo->mutex held */
1279 static void unref_inode(struct lo_data
*lo
, struct lo_inode
*inode
, uint64_t n
)
1285 assert(inode
->nlookup
>= n
);
1286 inode
->nlookup
-= n
;
1287 if (!inode
->nlookup
) {
1288 lo_map_remove(&lo
->ino_map
, inode
->fuse_ino
);
1289 g_hash_table_remove(lo
->inodes
, &inode
->key
);
1290 if (g_hash_table_size(inode
->posix_locks
)) {
1291 fuse_log(FUSE_LOG_WARNING
, "Hash table is not empty\n");
1293 g_hash_table_destroy(inode
->posix_locks
);
1294 pthread_mutex_destroy(&inode
->plock_mutex
);
1296 /* Drop our refcount from lo_do_lookup() */
1297 lo_inode_put(lo
, &inode
);
1301 static void unref_inode_lolocked(struct lo_data
*lo
, struct lo_inode
*inode
,
1308 pthread_mutex_lock(&lo
->mutex
);
1309 unref_inode(lo
, inode
, n
);
1310 pthread_mutex_unlock(&lo
->mutex
);
1313 static void lo_forget_one(fuse_req_t req
, fuse_ino_t ino
, uint64_t nlookup
)
1315 struct lo_data
*lo
= lo_data(req
);
1316 struct lo_inode
*inode
;
1318 inode
= lo_inode(req
, ino
);
1323 fuse_log(FUSE_LOG_DEBUG
, " forget %lli %lli -%lli\n",
1324 (unsigned long long)ino
, (unsigned long long)inode
->nlookup
,
1325 (unsigned long long)nlookup
);
1327 unref_inode_lolocked(lo
, inode
, nlookup
);
1328 lo_inode_put(lo
, &inode
);
1331 static void lo_forget(fuse_req_t req
, fuse_ino_t ino
, uint64_t nlookup
)
1333 lo_forget_one(req
, ino
, nlookup
);
1334 fuse_reply_none(req
);
1337 static void lo_forget_multi(fuse_req_t req
, size_t count
,
1338 struct fuse_forget_data
*forgets
)
1342 for (i
= 0; i
< count
; i
++) {
1343 lo_forget_one(req
, forgets
[i
].ino
, forgets
[i
].nlookup
);
1345 fuse_reply_none(req
);
1348 static void lo_readlink(fuse_req_t req
, fuse_ino_t ino
)
1350 char buf
[PATH_MAX
+ 1];
1353 res
= readlinkat(lo_fd(req
, ino
), "", buf
, sizeof(buf
));
1355 return (void)fuse_reply_err(req
, errno
);
1358 if (res
== sizeof(buf
)) {
1359 return (void)fuse_reply_err(req
, ENAMETOOLONG
);
1364 fuse_reply_readlink(req
, buf
);
1370 struct dirent
*entry
;
1374 static void lo_dirp_put(struct lo_dirp
**dp
)
1376 struct lo_dirp
*d
= *dp
;
1383 if (g_atomic_int_dec_and_test(&d
->refcount
)) {
1389 /* Call lo_dirp_put() on the return value when no longer needed */
1390 static struct lo_dirp
*lo_dirp(fuse_req_t req
, struct fuse_file_info
*fi
)
1392 struct lo_data
*lo
= lo_data(req
);
1393 struct lo_map_elem
*elem
;
1395 pthread_mutex_lock(&lo
->mutex
);
1396 elem
= lo_map_get(&lo
->dirp_map
, fi
->fh
);
1398 g_atomic_int_inc(&elem
->dirp
->refcount
);
1400 pthread_mutex_unlock(&lo
->mutex
);
1408 static void lo_opendir(fuse_req_t req
, fuse_ino_t ino
,
1409 struct fuse_file_info
*fi
)
1412 struct lo_data
*lo
= lo_data(req
);
1417 d
= calloc(1, sizeof(struct lo_dirp
));
1422 fd
= openat(lo_fd(req
, ino
), ".", O_RDONLY
);
1427 d
->dp
= fdopendir(fd
);
1428 if (d
->dp
== NULL
) {
1435 g_atomic_int_set(&d
->refcount
, 1); /* paired with lo_releasedir() */
1436 pthread_mutex_lock(&lo
->mutex
);
1437 fh
= lo_add_dirp_mapping(req
, d
);
1438 pthread_mutex_unlock(&lo
->mutex
);
1444 if (lo
->cache
== CACHE_ALWAYS
) {
1445 fi
->cache_readdir
= 1;
1447 fuse_reply_open(req
, fi
);
1456 } else if (fd
!= -1) {
1461 fuse_reply_err(req
, error
);
1464 static void lo_do_readdir(fuse_req_t req
, fuse_ino_t ino
, size_t size
,
1465 off_t offset
, struct fuse_file_info
*fi
, int plus
)
1467 struct lo_data
*lo
= lo_data(req
);
1468 struct lo_dirp
*d
= NULL
;
1469 struct lo_inode
*dinode
;
1475 dinode
= lo_inode(req
, ino
);
1480 d
= lo_dirp(req
, fi
);
1486 buf
= calloc(1, size
);
1492 if (offset
!= d
->offset
) {
1493 seekdir(d
->dp
, offset
);
1504 d
->entry
= readdir(d
->dp
);
1506 if (errno
) { /* Error */
1509 } else { /* End of stream */
1514 nextoff
= d
->entry
->d_off
;
1515 name
= d
->entry
->d_name
;
1517 fuse_ino_t entry_ino
= 0;
1518 struct fuse_entry_param e
= (struct fuse_entry_param
){
1519 .attr
.st_ino
= d
->entry
->d_ino
,
1520 .attr
.st_mode
= d
->entry
->d_type
<< 12,
1523 /* Hide root's parent directory */
1524 if (dinode
== &lo
->root
&& strcmp(name
, "..") == 0) {
1525 e
.attr
.st_ino
= lo
->root
.key
.ino
;
1526 e
.attr
.st_mode
= DT_DIR
<< 12;
1530 if (!is_dot_or_dotdot(name
)) {
1531 err
= lo_do_lookup(req
, ino
, name
, &e
);
1538 entsize
= fuse_add_direntry_plus(req
, p
, rem
, name
, &e
, nextoff
);
1540 entsize
= fuse_add_direntry(req
, p
, rem
, name
, &e
.attr
, nextoff
);
1542 if (entsize
> rem
) {
1543 if (entry_ino
!= 0) {
1544 lo_forget_one(req
, entry_ino
, 1);
1553 d
->offset
= nextoff
;
1559 lo_inode_put(lo
, &dinode
);
1562 * If there's an error, we can only signal it if we haven't stored
1563 * any entries yet - otherwise we'd end up with wrong lookup
1564 * counts for the entries that are already in the buffer. So we
1565 * return what we've collected until that point.
1567 if (err
&& rem
== size
) {
1568 fuse_reply_err(req
, err
);
1570 fuse_reply_buf(req
, buf
, size
- rem
);
1575 static void lo_readdir(fuse_req_t req
, fuse_ino_t ino
, size_t size
,
1576 off_t offset
, struct fuse_file_info
*fi
)
1578 lo_do_readdir(req
, ino
, size
, offset
, fi
, 0);
1581 static void lo_readdirplus(fuse_req_t req
, fuse_ino_t ino
, size_t size
,
1582 off_t offset
, struct fuse_file_info
*fi
)
1584 lo_do_readdir(req
, ino
, size
, offset
, fi
, 1);
1587 static void lo_releasedir(fuse_req_t req
, fuse_ino_t ino
,
1588 struct fuse_file_info
*fi
)
1590 struct lo_data
*lo
= lo_data(req
);
1591 struct lo_map_elem
*elem
;
1596 pthread_mutex_lock(&lo
->mutex
);
1597 elem
= lo_map_get(&lo
->dirp_map
, fi
->fh
);
1599 pthread_mutex_unlock(&lo
->mutex
);
1600 fuse_reply_err(req
, EBADF
);
1605 lo_map_remove(&lo
->dirp_map
, fi
->fh
);
1606 pthread_mutex_unlock(&lo
->mutex
);
1608 lo_dirp_put(&d
); /* paired with lo_opendir() */
1610 fuse_reply_err(req
, 0);
1613 static void update_open_flags(int writeback
, int allow_direct_io
,
1614 struct fuse_file_info
*fi
)
1617 * With writeback cache, kernel may send read requests even
1618 * when userspace opened write-only
1620 if (writeback
&& (fi
->flags
& O_ACCMODE
) == O_WRONLY
) {
1621 fi
->flags
&= ~O_ACCMODE
;
1622 fi
->flags
|= O_RDWR
;
1626 * With writeback cache, O_APPEND is handled by the kernel.
1627 * This breaks atomicity (since the file may change in the
1628 * underlying filesystem, so that the kernel's idea of the
1629 * end of the file isn't accurate anymore). In this example,
1630 * we just accept that. A more rigorous filesystem may want
1631 * to return an error here
1633 if (writeback
&& (fi
->flags
& O_APPEND
)) {
1634 fi
->flags
&= ~O_APPEND
;
1638 * O_DIRECT in guest should not necessarily mean bypassing page
1639 * cache on host as well. Therefore, we discard it by default
1640 * ('-o no_allow_direct_io'). If somebody needs that behavior,
1641 * the '-o allow_direct_io' option should be set.
1643 if (!allow_direct_io
) {
1644 fi
->flags
&= ~O_DIRECT
;
1648 static void lo_create(fuse_req_t req
, fuse_ino_t parent
, const char *name
,
1649 mode_t mode
, struct fuse_file_info
*fi
)
1652 struct lo_data
*lo
= lo_data(req
);
1653 struct lo_inode
*parent_inode
;
1654 struct fuse_entry_param e
;
1656 struct lo_cred old
= {};
1658 fuse_log(FUSE_LOG_DEBUG
, "lo_create(parent=%" PRIu64
", name=%s)\n", parent
,
1661 if (!is_safe_path_component(name
)) {
1662 fuse_reply_err(req
, EINVAL
);
1666 parent_inode
= lo_inode(req
, parent
);
1667 if (!parent_inode
) {
1668 fuse_reply_err(req
, EBADF
);
1672 err
= lo_change_cred(req
, &old
);
1677 update_open_flags(lo
->writeback
, lo
->allow_direct_io
, fi
);
1679 fd
= openat(parent_inode
->fd
, name
, (fi
->flags
| O_CREAT
) & ~O_NOFOLLOW
,
1681 err
= fd
== -1 ? errno
: 0;
1682 lo_restore_cred(&old
);
1687 pthread_mutex_lock(&lo
->mutex
);
1688 fh
= lo_add_fd_mapping(req
, fd
);
1689 pthread_mutex_unlock(&lo
->mutex
);
1697 err
= lo_do_lookup(req
, parent
, name
, &e
);
1699 if (lo
->cache
== CACHE_NONE
) {
1701 } else if (lo
->cache
== CACHE_ALWAYS
) {
1706 lo_inode_put(lo
, &parent_inode
);
1709 fuse_reply_err(req
, err
);
1711 fuse_reply_create(req
, &e
, fi
);
1715 /* Should be called with inode->plock_mutex held */
1716 static struct lo_inode_plock
*lookup_create_plock_ctx(struct lo_data
*lo
,
1717 struct lo_inode
*inode
,
1718 uint64_t lock_owner
,
1719 pid_t pid
, int *err
)
1721 struct lo_inode_plock
*plock
;
1726 g_hash_table_lookup(inode
->posix_locks
, GUINT_TO_POINTER(lock_owner
));
1732 plock
= malloc(sizeof(struct lo_inode_plock
));
1738 /* Open another instance of file which can be used for ofd locks. */
1739 sprintf(procname
, "%i", inode
->fd
);
1741 /* TODO: What if file is not writable? */
1742 fd
= openat(lo
->proc_self_fd
, procname
, O_RDWR
);
1749 plock
->lock_owner
= lock_owner
;
1751 g_hash_table_insert(inode
->posix_locks
, GUINT_TO_POINTER(plock
->lock_owner
),
1756 static void lo_getlk(fuse_req_t req
, fuse_ino_t ino
, struct fuse_file_info
*fi
,
1759 struct lo_data
*lo
= lo_data(req
);
1760 struct lo_inode
*inode
;
1761 struct lo_inode_plock
*plock
;
1762 int ret
, saverr
= 0;
1764 fuse_log(FUSE_LOG_DEBUG
,
1765 "lo_getlk(ino=%" PRIu64
", flags=%d)"
1766 " owner=0x%lx, l_type=%d l_start=0x%lx"
1768 ino
, fi
->flags
, fi
->lock_owner
, lock
->l_type
, lock
->l_start
,
1771 inode
= lo_inode(req
, ino
);
1773 fuse_reply_err(req
, EBADF
);
1777 pthread_mutex_lock(&inode
->plock_mutex
);
1779 lookup_create_plock_ctx(lo
, inode
, fi
->lock_owner
, lock
->l_pid
, &ret
);
1785 ret
= fcntl(plock
->fd
, F_OFD_GETLK
, lock
);
1791 pthread_mutex_unlock(&inode
->plock_mutex
);
1792 lo_inode_put(lo
, &inode
);
1795 fuse_reply_err(req
, saverr
);
1797 fuse_reply_lock(req
, lock
);
1801 static void lo_setlk(fuse_req_t req
, fuse_ino_t ino
, struct fuse_file_info
*fi
,
1802 struct flock
*lock
, int sleep
)
1804 struct lo_data
*lo
= lo_data(req
);
1805 struct lo_inode
*inode
;
1806 struct lo_inode_plock
*plock
;
1807 int ret
, saverr
= 0;
1809 fuse_log(FUSE_LOG_DEBUG
,
1810 "lo_setlk(ino=%" PRIu64
", flags=%d)"
1811 " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d"
1812 " l_start=0x%lx l_len=0x%lx\n",
1813 ino
, fi
->flags
, lock
->l_type
, lock
->l_pid
, fi
->lock_owner
, sleep
,
1814 lock
->l_whence
, lock
->l_start
, lock
->l_len
);
1817 fuse_reply_err(req
, EOPNOTSUPP
);
1821 inode
= lo_inode(req
, ino
);
1823 fuse_reply_err(req
, EBADF
);
1827 pthread_mutex_lock(&inode
->plock_mutex
);
1829 lookup_create_plock_ctx(lo
, inode
, fi
->lock_owner
, lock
->l_pid
, &ret
);
1836 /* TODO: Is it alright to modify flock? */
1838 ret
= fcntl(plock
->fd
, F_OFD_SETLK
, lock
);
1844 pthread_mutex_unlock(&inode
->plock_mutex
);
1845 lo_inode_put(lo
, &inode
);
1847 fuse_reply_err(req
, saverr
);
1850 static void lo_fsyncdir(fuse_req_t req
, fuse_ino_t ino
, int datasync
,
1851 struct fuse_file_info
*fi
)
1859 d
= lo_dirp(req
, fi
);
1861 fuse_reply_err(req
, EBADF
);
1867 res
= fdatasync(fd
);
1874 fuse_reply_err(req
, res
== -1 ? errno
: 0);
1877 static void lo_open(fuse_req_t req
, fuse_ino_t ino
, struct fuse_file_info
*fi
)
1882 struct lo_data
*lo
= lo_data(req
);
1884 fuse_log(FUSE_LOG_DEBUG
, "lo_open(ino=%" PRIu64
", flags=%d)\n", ino
,
1887 update_open_flags(lo
->writeback
, lo
->allow_direct_io
, fi
);
1889 sprintf(buf
, "%i", lo_fd(req
, ino
));
1890 fd
= openat(lo
->proc_self_fd
, buf
, fi
->flags
& ~O_NOFOLLOW
);
1892 return (void)fuse_reply_err(req
, errno
);
1895 pthread_mutex_lock(&lo
->mutex
);
1896 fh
= lo_add_fd_mapping(req
, fd
);
1897 pthread_mutex_unlock(&lo
->mutex
);
1900 fuse_reply_err(req
, ENOMEM
);
1905 if (lo
->cache
== CACHE_NONE
) {
1907 } else if (lo
->cache
== CACHE_ALWAYS
) {
1910 fuse_reply_open(req
, fi
);
1913 static void lo_release(fuse_req_t req
, fuse_ino_t ino
,
1914 struct fuse_file_info
*fi
)
1916 struct lo_data
*lo
= lo_data(req
);
1917 struct lo_map_elem
*elem
;
1922 pthread_mutex_lock(&lo
->mutex
);
1923 elem
= lo_map_get(&lo
->fd_map
, fi
->fh
);
1927 lo_map_remove(&lo
->fd_map
, fi
->fh
);
1929 pthread_mutex_unlock(&lo
->mutex
);
1932 fuse_reply_err(req
, 0);
1935 static void lo_flush(fuse_req_t req
, fuse_ino_t ino
, struct fuse_file_info
*fi
)
1939 struct lo_inode
*inode
;
1941 inode
= lo_inode(req
, ino
);
1943 fuse_reply_err(req
, EBADF
);
1947 /* An fd is going away. Cleanup associated posix locks */
1948 pthread_mutex_lock(&inode
->plock_mutex
);
1949 g_hash_table_remove(inode
->posix_locks
, GUINT_TO_POINTER(fi
->lock_owner
));
1950 pthread_mutex_unlock(&inode
->plock_mutex
);
1952 res
= close(dup(lo_fi_fd(req
, fi
)));
1953 lo_inode_put(lo_data(req
), &inode
);
1954 fuse_reply_err(req
, res
== -1 ? errno
: 0);
1957 static void lo_fsync(fuse_req_t req
, fuse_ino_t ino
, int datasync
,
1958 struct fuse_file_info
*fi
)
1964 fuse_log(FUSE_LOG_DEBUG
, "lo_fsync(ino=%" PRIu64
", fi=0x%p)\n", ino
,
1968 struct lo_data
*lo
= lo_data(req
);
1970 res
= asprintf(&buf
, "%i", lo_fd(req
, ino
));
1972 return (void)fuse_reply_err(req
, errno
);
1975 fd
= openat(lo
->proc_self_fd
, buf
, O_RDWR
);
1978 return (void)fuse_reply_err(req
, errno
);
1981 fd
= lo_fi_fd(req
, fi
);
1985 res
= fdatasync(fd
);
1992 fuse_reply_err(req
, res
== -1 ? errno
: 0);
1995 static void lo_read(fuse_req_t req
, fuse_ino_t ino
, size_t size
, off_t offset
,
1996 struct fuse_file_info
*fi
)
1998 struct fuse_bufvec buf
= FUSE_BUFVEC_INIT(size
);
2000 fuse_log(FUSE_LOG_DEBUG
,
2001 "lo_read(ino=%" PRIu64
", size=%zd, "
2003 ino
, size
, (unsigned long)offset
);
2005 buf
.buf
[0].flags
= FUSE_BUF_IS_FD
| FUSE_BUF_FD_SEEK
;
2006 buf
.buf
[0].fd
= lo_fi_fd(req
, fi
);
2007 buf
.buf
[0].pos
= offset
;
2009 fuse_reply_data(req
, &buf
);
2012 static void lo_write_buf(fuse_req_t req
, fuse_ino_t ino
,
2013 struct fuse_bufvec
*in_buf
, off_t off
,
2014 struct fuse_file_info
*fi
)
2018 struct fuse_bufvec out_buf
= FUSE_BUFVEC_INIT(fuse_buf_size(in_buf
));
2019 bool cap_fsetid_dropped
= false;
2021 out_buf
.buf
[0].flags
= FUSE_BUF_IS_FD
| FUSE_BUF_FD_SEEK
;
2022 out_buf
.buf
[0].fd
= lo_fi_fd(req
, fi
);
2023 out_buf
.buf
[0].pos
= off
;
2025 fuse_log(FUSE_LOG_DEBUG
,
2026 "lo_write_buf(ino=%" PRIu64
", size=%zd, off=%lu)\n", ino
,
2027 out_buf
.buf
[0].size
, (unsigned long)off
);
2030 * If kill_priv is set, drop CAP_FSETID which should lead to kernel
2031 * clearing setuid/setgid on file.
2033 if (fi
->kill_priv
) {
2034 res
= drop_effective_cap("FSETID", &cap_fsetid_dropped
);
2036 fuse_reply_err(req
, res
);
2041 res
= fuse_buf_copy(&out_buf
, in_buf
);
2043 fuse_reply_err(req
, -res
);
2045 fuse_reply_write(req
, (size_t)res
);
2048 if (cap_fsetid_dropped
) {
2049 res
= gain_effective_cap("FSETID");
2051 fuse_log(FUSE_LOG_ERR
, "Failed to gain CAP_FSETID\n");
2056 static void lo_statfs(fuse_req_t req
, fuse_ino_t ino
)
2059 struct statvfs stbuf
;
2061 res
= fstatvfs(lo_fd(req
, ino
), &stbuf
);
2063 fuse_reply_err(req
, errno
);
2065 fuse_reply_statfs(req
, &stbuf
);
2069 static void lo_fallocate(fuse_req_t req
, fuse_ino_t ino
, int mode
, off_t offset
,
2070 off_t length
, struct fuse_file_info
*fi
)
2072 int err
= EOPNOTSUPP
;
2075 #ifdef CONFIG_FALLOCATE
2076 err
= fallocate(lo_fi_fd(req
, fi
), mode
, offset
, length
);
2081 #elif defined(CONFIG_POSIX_FALLOCATE)
2083 fuse_reply_err(req
, EOPNOTSUPP
);
2087 err
= posix_fallocate(lo_fi_fd(req
, fi
), offset
, length
);
2090 fuse_reply_err(req
, err
);
2093 static void lo_flock(fuse_req_t req
, fuse_ino_t ino
, struct fuse_file_info
*fi
,
2099 res
= flock(lo_fi_fd(req
, fi
), op
);
2101 fuse_reply_err(req
, res
== -1 ? errno
: 0);
/*
 * Rule actions (low bits of XattrMapEntry flags).
 */

/*
 * Exit; process attribute unmodified if matched.
 * An empty key applies to all.
 */
#define XATTR_MAP_FLAG_OK (1 << 0)

/*
 * The attribute is unwanted;
 * EPERM on write, hidden on read.
 */
#define XATTR_MAP_FLAG_BAD (1 << 1)

/*
 * For attr that start with 'key' prepend 'prepend'.
 * 'key' may be empty to prepend for all attrs.
 * key is defined from set/remove point of view.
 * Automatically reversed on read.
 */
#define XATTR_MAP_FLAG_PREFIX (1 << 2)

/*
 * Rule scopes (high bits of XattrMapEntry flags).
 */

/* Apply rule to get/set/remove */
#define XATTR_MAP_FLAG_CLIENT (1 << 16)

/* Apply rule to list */
#define XATTR_MAP_FLAG_SERVER (1 << 17)

/* Apply rule to all */
#define XATTR_MAP_FLAG_ALL (XATTR_MAP_FLAG_CLIENT | XATTR_MAP_FLAG_SERVER)
2131 static void add_xattrmap_entry(struct lo_data
*lo
,
2132 const XattrMapEntry
*new_entry
)
2134 XattrMapEntry
*res
= g_realloc_n(lo
->xattr_map_list
,
2135 lo
->xattr_map_nentries
+ 1,
2136 sizeof(XattrMapEntry
));
2137 res
[lo
->xattr_map_nentries
++] = *new_entry
;
2139 lo
->xattr_map_list
= res
;
2142 static void free_xattrmap(struct lo_data
*lo
)
2144 XattrMapEntry
*map
= lo
->xattr_map_list
;
2151 for (i
= 0; i
< lo
->xattr_map_nentries
; i
++) {
2153 g_free(map
[i
].prepend
);
2157 lo
->xattr_map_list
= NULL
;
2158 lo
->xattr_map_nentries
= -1;
2162 * Handle the 'map' type, which is sugar for a set of commands
2163 * for the common case of prefixing a subset or everything,
2164 * and allowing anything not prefixed through.
2165 * It must be the last entry in the stream, although there
2166 * can be other entries before it.
2170 * key may be empty, in which case all entries are prefixed.
2172 static void parse_xattrmap_map(struct lo_data
*lo
,
2173 const char *rule
, char sep
)
2178 XattrMapEntry tmp_entry
;
2181 fuse_log(FUSE_LOG_ERR
,
2182 "%s: Expecting '%c' after 'map' keyword, found '%c'\n",
2183 __func__
, sep
, *rule
);
2189 /* At start of 'key' field */
2190 tmp
= strchr(rule
, sep
);
2192 fuse_log(FUSE_LOG_ERR
,
2193 "%s: Missing '%c' at end of key field in map rule\n",
2198 key
= g_strndup(rule
, tmp
- rule
);
2201 /* At start of prefix field */
2202 tmp
= strchr(rule
, sep
);
2204 fuse_log(FUSE_LOG_ERR
,
2205 "%s: Missing '%c' at end of prefix field in map rule\n",
2210 prefix
= g_strndup(rule
, tmp
- rule
);
2214 * This should be the end of the string, we don't allow
2215 * any more commands after 'map'.
2218 fuse_log(FUSE_LOG_ERR
,
2219 "%s: Expecting end of command after map, found '%c'\n",
2224 /* 1st: Prefix matches/everything */
2225 tmp_entry
.flags
= XATTR_MAP_FLAG_PREFIX
| XATTR_MAP_FLAG_ALL
;
2226 tmp_entry
.key
= g_strdup(key
);
2227 tmp_entry
.prepend
= g_strdup(prefix
);
2228 add_xattrmap_entry(lo
, &tmp_entry
);
2231 /* Prefix all case */
2233 /* 2nd: Hide any non-prefixed entries on the host */
2234 tmp_entry
.flags
= XATTR_MAP_FLAG_BAD
| XATTR_MAP_FLAG_ALL
;
2235 tmp_entry
.key
= g_strdup("");
2236 tmp_entry
.prepend
= g_strdup("");
2237 add_xattrmap_entry(lo
, &tmp_entry
);
2239 /* Prefix matching case */
2241 /* 2nd: Hide non-prefixed but matching entries on the host */
2242 tmp_entry
.flags
= XATTR_MAP_FLAG_BAD
| XATTR_MAP_FLAG_SERVER
;
2243 tmp_entry
.key
= g_strdup(""); /* Not used */
2244 tmp_entry
.prepend
= g_strdup(key
);
2245 add_xattrmap_entry(lo
, &tmp_entry
);
2247 /* 3rd: Stop the client accessing prefixed attributes directly */
2248 tmp_entry
.flags
= XATTR_MAP_FLAG_BAD
| XATTR_MAP_FLAG_CLIENT
;
2249 tmp_entry
.key
= g_strdup(prefix
);
2250 tmp_entry
.prepend
= g_strdup(""); /* Not used */
2251 add_xattrmap_entry(lo
, &tmp_entry
);
2253 /* 4th: Everything else is OK */
2254 tmp_entry
.flags
= XATTR_MAP_FLAG_OK
| XATTR_MAP_FLAG_ALL
;
2255 tmp_entry
.key
= g_strdup("");
2256 tmp_entry
.prepend
= g_strdup("");
2257 add_xattrmap_entry(lo
, &tmp_entry
);
2264 static void parse_xattrmap(struct lo_data
*lo
)
2266 const char *map
= lo
->xattrmap
;
2269 lo
->xattr_map_nentries
= 0;
2271 XattrMapEntry tmp_entry
;
2274 if (isspace(*map
)) {
2278 /* The separator is the first non-space of the rule */
2284 tmp_entry
.flags
= 0;
2285 /* Start of 'type' */
2286 if (strstart(map
, "prefix", &map
)) {
2287 tmp_entry
.flags
|= XATTR_MAP_FLAG_PREFIX
;
2288 } else if (strstart(map
, "ok", &map
)) {
2289 tmp_entry
.flags
|= XATTR_MAP_FLAG_OK
;
2290 } else if (strstart(map
, "bad", &map
)) {
2291 tmp_entry
.flags
|= XATTR_MAP_FLAG_BAD
;
2292 } else if (strstart(map
, "map", &map
)) {
2294 * map is sugar that adds a number of rules, and must be
2297 parse_xattrmap_map(lo
, map
, sep
);
2300 fuse_log(FUSE_LOG_ERR
,
2301 "%s: Unexpected type;"
2302 "Expecting 'prefix', 'ok', 'bad' or 'map' in rule %zu\n",
2303 __func__
, lo
->xattr_map_nentries
);
2307 if (*map
++ != sep
) {
2308 fuse_log(FUSE_LOG_ERR
,
2309 "%s: Missing '%c' at end of type field of rule %zu\n",
2310 __func__
, sep
, lo
->xattr_map_nentries
);
2314 /* Start of 'scope' */
2315 if (strstart(map
, "client", &map
)) {
2316 tmp_entry
.flags
|= XATTR_MAP_FLAG_CLIENT
;
2317 } else if (strstart(map
, "server", &map
)) {
2318 tmp_entry
.flags
|= XATTR_MAP_FLAG_SERVER
;
2319 } else if (strstart(map
, "all", &map
)) {
2320 tmp_entry
.flags
|= XATTR_MAP_FLAG_ALL
;
2322 fuse_log(FUSE_LOG_ERR
,
2323 "%s: Unexpected scope;"
2324 " Expecting 'client', 'server', or 'all', in rule %zu\n",
2325 __func__
, lo
->xattr_map_nentries
);
2329 if (*map
++ != sep
) {
2330 fuse_log(FUSE_LOG_ERR
,
2331 "%s: Expecting '%c' found '%c'"
2332 " after scope in rule %zu\n",
2333 __func__
, sep
, *map
, lo
->xattr_map_nentries
);
2337 /* At start of 'key' field */
2338 tmp
= strchr(map
, sep
);
2340 fuse_log(FUSE_LOG_ERR
,
2341 "%s: Missing '%c' at end of key field of rule %zu",
2342 __func__
, sep
, lo
->xattr_map_nentries
);
2345 tmp_entry
.key
= g_strndup(map
, tmp
- map
);
2348 /* At start of 'prepend' field */
2349 tmp
= strchr(map
, sep
);
2351 fuse_log(FUSE_LOG_ERR
,
2352 "%s: Missing '%c' at end of prepend field of rule %zu",
2353 __func__
, sep
, lo
->xattr_map_nentries
);
2356 tmp_entry
.prepend
= g_strndup(map
, tmp
- map
);
2359 add_xattrmap_entry(lo
, &tmp_entry
);
2360 /* End of rule - go around again for another rule */
2363 if (!lo
->xattr_map_nentries
) {
2364 fuse_log(FUSE_LOG_ERR
, "Empty xattr map\n");
2370 * For use with getxattr/setxattr/removexattr, where the client
2371 * gives us a name and we may need to choose a different one.
2372 * Allocates a buffer for the result placing it in *out_name.
2373 * If there's no change then *out_name is not set.
2374 * Returns 0 on success
2375 * Can return -EPERM to indicate we block a given attribute
2376 * (in which case out_name is not allocated)
2377 * Can return -ENOMEM to indicate out_name couldn't be allocated.
2379 static int xattr_map_client(const struct lo_data
*lo
, const char *client_name
,
2383 for (i
= 0; i
< lo
->xattr_map_nentries
; i
++) {
2384 const XattrMapEntry
*cur_entry
= lo
->xattr_map_list
+ i
;
2386 if ((cur_entry
->flags
& XATTR_MAP_FLAG_CLIENT
) &&
2387 (strstart(client_name
, cur_entry
->key
, NULL
))) {
2388 if (cur_entry
->flags
& XATTR_MAP_FLAG_BAD
) {
2391 if (cur_entry
->flags
& XATTR_MAP_FLAG_OK
) {
2392 /* Unmodified name */
2395 if (cur_entry
->flags
& XATTR_MAP_FLAG_PREFIX
) {
2396 *out_name
= g_try_malloc(strlen(client_name
) +
2397 strlen(cur_entry
->prepend
) + 1);
2401 sprintf(*out_name
, "%s%s", cur_entry
->prepend
, client_name
);
2411 * For use with listxattr where the server fs gives us a name and we may need
2412 * to sanitize this for the client.
2413 * Returns a pointer to the result in *out_name
2414 * This is always the original string or the current string with some prefix
2415 * removed; no reallocation is done.
2416 * Returns 0 on success
2417 * Can return -ENODATA to indicate the name should be dropped from the list.
2419 static int xattr_map_server(const struct lo_data
*lo
, const char *server_name
,
2420 const char **out_name
)
2425 for (i
= 0; i
< lo
->xattr_map_nentries
; i
++) {
2426 const XattrMapEntry
*cur_entry
= lo
->xattr_map_list
+ i
;
2428 if ((cur_entry
->flags
& XATTR_MAP_FLAG_SERVER
) &&
2429 (strstart(server_name
, cur_entry
->prepend
, &end
))) {
2430 if (cur_entry
->flags
& XATTR_MAP_FLAG_BAD
) {
2433 if (cur_entry
->flags
& XATTR_MAP_FLAG_OK
) {
2434 *out_name
= server_name
;
2437 if (cur_entry
->flags
& XATTR_MAP_FLAG_PREFIX
) {
2448 static void lo_getxattr(fuse_req_t req
, fuse_ino_t ino
, const char *in_name
,
2451 struct lo_data
*lo
= lo_data(req
);
2456 struct lo_inode
*inode
;
2464 ret
= xattr_map_client(lo
, in_name
, &mapped_name
);
2466 if (ret
== -EPERM
) {
2469 fuse_reply_err(req
, -ret
);
2477 inode
= lo_inode(req
, ino
);
2479 fuse_reply_err(req
, EBADF
);
2480 g_free(mapped_name
);
2485 if (!lo_data(req
)->xattr
) {
2489 fuse_log(FUSE_LOG_DEBUG
, "lo_getxattr(ino=%" PRIu64
", name=%s size=%zd)\n",
2493 value
= malloc(size
);
2499 sprintf(procname
, "%i", inode
->fd
);
2501 * It is not safe to open() non-regular/non-dir files in file server
2502 * unless O_PATH is used, so use that method for regular files/dir
2503 * only (as it seems giving less performance overhead).
2504 * Otherwise, call fchdir() to avoid open().
2506 if (S_ISREG(inode
->filetype
) || S_ISDIR(inode
->filetype
)) {
2507 fd
= openat(lo
->proc_self_fd
, procname
, O_RDONLY
);
2511 ret
= fgetxattr(fd
, name
, value
, size
);
2513 /* fchdir should not fail here */
2514 assert(fchdir(lo
->proc_self_fd
) == 0);
2515 ret
= getxattr(procname
, name
, value
, size
);
2516 assert(fchdir(lo
->root
.fd
) == 0);
2527 fuse_reply_buf(req
, value
, ret
);
2529 fuse_reply_xattr(req
, ret
);
2538 lo_inode_put(lo
, &inode
);
2544 fuse_reply_err(req
, saverr
);
2545 g_free(mapped_name
);
2549 static void lo_listxattr(fuse_req_t req
, fuse_ino_t ino
, size_t size
)
2551 struct lo_data
*lo
= lo_data(req
);
2554 struct lo_inode
*inode
;
2559 inode
= lo_inode(req
, ino
);
2561 fuse_reply_err(req
, EBADF
);
2566 if (!lo_data(req
)->xattr
) {
2570 fuse_log(FUSE_LOG_DEBUG
, "lo_listxattr(ino=%" PRIu64
", size=%zd)\n", ino
,
2574 value
= malloc(size
);
2580 sprintf(procname
, "%i", inode
->fd
);
2581 if (S_ISREG(inode
->filetype
) || S_ISDIR(inode
->filetype
)) {
2582 fd
= openat(lo
->proc_self_fd
, procname
, O_RDONLY
);
2586 ret
= flistxattr(fd
, value
, size
);
2588 /* fchdir should not fail here */
2589 assert(fchdir(lo
->proc_self_fd
) == 0);
2590 ret
= listxattr(procname
, value
, size
);
2591 assert(fchdir(lo
->root
.fd
) == 0);
2603 if (lo
->xattr_map_list
) {
2605 * Map the names back, some attributes might be dropped,
2606 * some shortened, but not increased, so we shouldn't
2609 size_t out_index
, in_index
;
2612 while (in_index
< ret
) {
2613 const char *map_out
;
2614 char *in_ptr
= value
+ in_index
;
2615 /* Length of current attribute name */
2616 size_t in_len
= strlen(value
+ in_index
) + 1;
2618 int mapret
= xattr_map_server(lo
, in_ptr
, &map_out
);
2619 if (mapret
!= -ENODATA
&& mapret
!= 0) {
2620 /* Shouldn't happen */
2625 /* Either unchanged, or truncated */
2627 if (map_out
!= in_ptr
) {
2628 /* +1 copies the NIL */
2629 out_len
= strlen(map_out
) + 1;
2635 * Move result along, may still be needed for an unchanged
2636 * entry if a previous entry was changed.
2638 memmove(value
+ out_index
, map_out
, out_len
);
2640 out_index
+= out_len
;
2649 fuse_reply_buf(req
, value
, ret
);
2652 * xattrmap only ever shortens the result,
2653 * so we don't need to do anything clever with the
2654 * allocation length here.
2656 fuse_reply_xattr(req
, ret
);
2665 lo_inode_put(lo
, &inode
);
2671 fuse_reply_err(req
, saverr
);
2675 static void lo_setxattr(fuse_req_t req
, fuse_ino_t ino
, const char *in_name
,
2676 const char *value
, size_t size
, int flags
)
2681 struct lo_data
*lo
= lo_data(req
);
2682 struct lo_inode
*inode
;
2690 ret
= xattr_map_client(lo
, in_name
, &mapped_name
);
2692 fuse_reply_err(req
, -ret
);
2700 inode
= lo_inode(req
, ino
);
2702 fuse_reply_err(req
, EBADF
);
2703 g_free(mapped_name
);
2708 if (!lo_data(req
)->xattr
) {
2712 fuse_log(FUSE_LOG_DEBUG
, "lo_setxattr(ino=%" PRIu64
2713 ", name=%s value=%s size=%zd)\n", ino
, name
, value
, size
);
2715 sprintf(procname
, "%i", inode
->fd
);
2716 if (S_ISREG(inode
->filetype
) || S_ISDIR(inode
->filetype
)) {
2717 fd
= openat(lo
->proc_self_fd
, procname
, O_RDONLY
);
2722 ret
= fsetxattr(fd
, name
, value
, size
, flags
);
2724 /* fchdir should not fail here */
2725 assert(fchdir(lo
->proc_self_fd
) == 0);
2726 ret
= setxattr(procname
, name
, value
, size
, flags
);
2727 assert(fchdir(lo
->root
.fd
) == 0);
2730 saverr
= ret
== -1 ? errno
: 0;
2737 lo_inode_put(lo
, &inode
);
2738 g_free(mapped_name
);
2739 fuse_reply_err(req
, saverr
);
2742 static void lo_removexattr(fuse_req_t req
, fuse_ino_t ino
, const char *in_name
)
2747 struct lo_data
*lo
= lo_data(req
);
2748 struct lo_inode
*inode
;
2756 ret
= xattr_map_client(lo
, in_name
, &mapped_name
);
2758 fuse_reply_err(req
, -ret
);
2766 inode
= lo_inode(req
, ino
);
2768 fuse_reply_err(req
, EBADF
);
2769 g_free(mapped_name
);
2774 if (!lo_data(req
)->xattr
) {
2778 fuse_log(FUSE_LOG_DEBUG
, "lo_removexattr(ino=%" PRIu64
", name=%s)\n", ino
,
2781 sprintf(procname
, "%i", inode
->fd
);
2782 if (S_ISREG(inode
->filetype
) || S_ISDIR(inode
->filetype
)) {
2783 fd
= openat(lo
->proc_self_fd
, procname
, O_RDONLY
);
2788 ret
= fremovexattr(fd
, name
);
2790 /* fchdir should not fail here */
2791 assert(fchdir(lo
->proc_self_fd
) == 0);
2792 ret
= removexattr(procname
, name
);
2793 assert(fchdir(lo
->root
.fd
) == 0);
2796 saverr
= ret
== -1 ? errno
: 0;
2803 lo_inode_put(lo
, &inode
);
2804 g_free(mapped_name
);
2805 fuse_reply_err(req
, saverr
);
2808 #ifdef HAVE_COPY_FILE_RANGE
2809 static void lo_copy_file_range(fuse_req_t req
, fuse_ino_t ino_in
, off_t off_in
,
2810 struct fuse_file_info
*fi_in
, fuse_ino_t ino_out
,
2811 off_t off_out
, struct fuse_file_info
*fi_out
,
2812 size_t len
, int flags
)
2817 in_fd
= lo_fi_fd(req
, fi_in
);
2818 out_fd
= lo_fi_fd(req
, fi_out
);
2820 fuse_log(FUSE_LOG_DEBUG
,
2821 "lo_copy_file_range(ino=%" PRIu64
"/fd=%d, "
2822 "off=%lu, ino=%" PRIu64
"/fd=%d, "
2823 "off=%lu, size=%zd, flags=0x%x)\n",
2824 ino_in
, in_fd
, off_in
, ino_out
, out_fd
, off_out
, len
, flags
);
2826 res
= copy_file_range(in_fd
, &off_in
, out_fd
, &off_out
, len
, flags
);
2828 fuse_reply_err(req
, errno
);
2830 fuse_reply_write(req
, res
);
2835 static void lo_lseek(fuse_req_t req
, fuse_ino_t ino
, off_t off
, int whence
,
2836 struct fuse_file_info
*fi
)
2841 res
= lseek(lo_fi_fd(req
, fi
), off
, whence
);
2843 fuse_reply_lseek(req
, res
);
2845 fuse_reply_err(req
, errno
);
2849 static void lo_destroy(void *userdata
)
2851 struct lo_data
*lo
= (struct lo_data
*)userdata
;
2853 pthread_mutex_lock(&lo
->mutex
);
2855 GHashTableIter iter
;
2856 gpointer key
, value
;
2858 g_hash_table_iter_init(&iter
, lo
->inodes
);
2859 if (!g_hash_table_iter_next(&iter
, &key
, &value
)) {
2863 struct lo_inode
*inode
= value
;
2864 unref_inode(lo
, inode
, inode
->nlookup
);
2866 pthread_mutex_unlock(&lo
->mutex
);
2869 static struct fuse_lowlevel_ops lo_oper
= {
2871 .lookup
= lo_lookup
,
2874 .symlink
= lo_symlink
,
2876 .unlink
= lo_unlink
,
2878 .rename
= lo_rename
,
2879 .forget
= lo_forget
,
2880 .forget_multi
= lo_forget_multi
,
2881 .getattr
= lo_getattr
,
2882 .setattr
= lo_setattr
,
2883 .readlink
= lo_readlink
,
2884 .opendir
= lo_opendir
,
2885 .readdir
= lo_readdir
,
2886 .readdirplus
= lo_readdirplus
,
2887 .releasedir
= lo_releasedir
,
2888 .fsyncdir
= lo_fsyncdir
,
2889 .create
= lo_create
,
2893 .release
= lo_release
,
2897 .write_buf
= lo_write_buf
,
2898 .statfs
= lo_statfs
,
2899 .fallocate
= lo_fallocate
,
2901 .getxattr
= lo_getxattr
,
2902 .listxattr
= lo_listxattr
,
2903 .setxattr
= lo_setxattr
,
2904 .removexattr
= lo_removexattr
,
2905 #ifdef HAVE_COPY_FILE_RANGE
2906 .copy_file_range
= lo_copy_file_range
,
2909 .destroy
= lo_destroy
,
2912 /* Print vhost-user.json backend program capabilities */
2913 static void print_capabilities(void)
2916 printf(" \"type\": \"fs\"\n");
2921 * Drop all Linux capabilities because the wait parent process only needs to
2922 * sit in waitpid(2) and terminate.
2924 static void setup_wait_parent_capabilities(void)
2926 capng_setpid(syscall(SYS_gettid
));
2927 capng_clear(CAPNG_SELECT_BOTH
);
2928 capng_apply(CAPNG_SELECT_BOTH
);
2932 * Move to new mount, net, and pid namespaces to isolate this process.
2934 static void setup_namespaces(struct lo_data
*lo
, struct fuse_session
*se
)
2939 * Create a new pid namespace for *child* processes. We'll have to
2940 * fork in order to enter the new pid namespace. A new mount namespace
2941 * is also needed so that we can remount /proc for the new pid
2944 * Our UNIX domain sockets have been created. Now we can move to
2945 * an empty network namespace to prevent TCP/IP and other network
2946 * activity in case this process is compromised.
2948 if (unshare(CLONE_NEWPID
| CLONE_NEWNS
| CLONE_NEWNET
) != 0) {
2949 fuse_log(FUSE_LOG_ERR
, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
2955 fuse_log(FUSE_LOG_ERR
, "fork() failed: %m\n");
2962 setup_wait_parent_capabilities();
2964 /* The parent waits for the child */
2966 waited
= waitpid(child
, &wstatus
, 0);
2967 } while (waited
< 0 && errno
== EINTR
&& !se
->exited
);
2969 /* We were terminated by a signal, see fuse_signals.c */
2974 if (WIFEXITED(wstatus
)) {
2975 exit(WEXITSTATUS(wstatus
));
2981 /* Send us SIGTERM when the parent thread terminates, see prctl(2) */
2982 prctl(PR_SET_PDEATHSIG
, SIGTERM
);
2985 * If the mounts have shared propagation then we want to opt out so our
2986 * mount changes don't affect the parent mount namespace.
2988 if (mount(NULL
, "/", NULL
, MS_REC
| MS_SLAVE
, NULL
) < 0) {
2989 fuse_log(FUSE_LOG_ERR
, "mount(/, MS_REC|MS_SLAVE): %m\n");
2993 /* The child must remount /proc to use the new pid namespace */
2994 if (mount("proc", "/proc", "proc",
2995 MS_NODEV
| MS_NOEXEC
| MS_NOSUID
| MS_RELATIME
, NULL
) < 0) {
2996 fuse_log(FUSE_LOG_ERR
, "mount(/proc): %m\n");
3001 * We only need /proc/self/fd. Prevent ".." from accessing parent
3002 * directories of /proc/self/fd by bind-mounting it over /proc. Since / was
3003 * previously remounted with MS_REC | MS_SLAVE this mount change only
3004 * affects our process.
3006 if (mount("/proc/self/fd", "/proc", NULL
, MS_BIND
, NULL
) < 0) {
3007 fuse_log(FUSE_LOG_ERR
, "mount(/proc/self/fd, MS_BIND): %m\n");
3011 /* Get the /proc (actually /proc/self/fd, see above) file descriptor */
3012 lo
->proc_self_fd
= open("/proc", O_PATH
);
3013 if (lo
->proc_self_fd
== -1) {
3014 fuse_log(FUSE_LOG_ERR
, "open(/proc, O_PATH): %m\n");
3020 * Capture the capability state, we'll need to restore this for individual
3021 * threads later; see load_capng.
3023 static void setup_capng(void)
3025 /* Note this accesses /proc so has to happen before the sandbox */
3026 if (capng_get_caps_process()) {
3027 fuse_log(FUSE_LOG_ERR
, "capng_get_caps_process\n");
3030 pthread_mutex_init(&cap
.mutex
, NULL
);
3031 pthread_mutex_lock(&cap
.mutex
);
3032 cap
.saved
= capng_save_state();
3034 fuse_log(FUSE_LOG_ERR
, "capng_save_state\n");
3037 pthread_mutex_unlock(&cap
.mutex
);
3040 static void cleanup_capng(void)
3044 pthread_mutex_destroy(&cap
.mutex
);
3049 * Make the source directory our root so symlinks cannot escape and no other
3050 * files are accessible. Assumes unshare(CLONE_NEWNS) was already called.
3052 static void setup_mounts(const char *source
)
3057 if (mount(source
, source
, NULL
, MS_BIND
| MS_REC
, NULL
) < 0) {
3058 fuse_log(FUSE_LOG_ERR
, "mount(%s, %s, MS_BIND): %m\n", source
, source
);
3062 /* This magic is based on lxc's lxc_pivot_root() */
3063 oldroot
= open("/", O_DIRECTORY
| O_RDONLY
| O_CLOEXEC
);
3065 fuse_log(FUSE_LOG_ERR
, "open(/): %m\n");
3069 newroot
= open(source
, O_DIRECTORY
| O_RDONLY
| O_CLOEXEC
);
3071 fuse_log(FUSE_LOG_ERR
, "open(%s): %m\n", source
);
3075 if (fchdir(newroot
) < 0) {
3076 fuse_log(FUSE_LOG_ERR
, "fchdir(newroot): %m\n");
3080 if (syscall(__NR_pivot_root
, ".", ".") < 0) {
3081 fuse_log(FUSE_LOG_ERR
, "pivot_root(., .): %m\n");
3085 if (fchdir(oldroot
) < 0) {
3086 fuse_log(FUSE_LOG_ERR
, "fchdir(oldroot): %m\n");
3090 if (mount("", ".", "", MS_SLAVE
| MS_REC
, NULL
) < 0) {
3091 fuse_log(FUSE_LOG_ERR
, "mount(., MS_SLAVE | MS_REC): %m\n");
3095 if (umount2(".", MNT_DETACH
) < 0) {
3096 fuse_log(FUSE_LOG_ERR
, "umount2(., MNT_DETACH): %m\n");
3100 if (fchdir(newroot
) < 0) {
3101 fuse_log(FUSE_LOG_ERR
, "fchdir(newroot): %m\n");
3110 * Only keep whitelisted capabilities that are needed for file system operation
3111 * The (possibly NULL) modcaps_in string passed in is free'd before exit.
3113 static void setup_capabilities(char *modcaps_in
)
3115 char *modcaps
= modcaps_in
;
3116 pthread_mutex_lock(&cap
.mutex
);
3117 capng_restore_state(&cap
.saved
);
3120 * Whitelist file system-related capabilities that are needed for a file
3121 * server to act like root. Drop everything else like networking and
3122 * sysadmin capabilities.
3125 * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
3126 * and we don't support that.
3127 * 2. CAP_MAC_OVERRIDE is not included because it only seems to be
3128 * used by the Smack LSM. Omit it until there is demand for it.
3130 capng_setpid(syscall(SYS_gettid
));
3131 capng_clear(CAPNG_SELECT_BOTH
);
3132 if (capng_updatev(CAPNG_ADD
, CAPNG_PERMITTED
| CAPNG_EFFECTIVE
,
3142 fuse_log(FUSE_LOG_ERR
, "%s: capng_updatev failed\n", __func__
);
3147 * The modcaps option is a colon separated list of caps,
3148 * each preceded by either + or -.
3154 char *next
= strchr(modcaps
, ':');
3160 switch (modcaps
[0]) {
3166 action
= CAPNG_DROP
;
3170 fuse_log(FUSE_LOG_ERR
,
3171 "%s: Expecting '+'/'-' in modcaps but found '%c'\n",
3172 __func__
, modcaps
[0]);
3175 cap
= capng_name_to_capability(modcaps
+ 1);
3177 fuse_log(FUSE_LOG_ERR
, "%s: Unknown capability '%s'\n", __func__
,
3181 if (capng_update(action
, CAPNG_PERMITTED
| CAPNG_EFFECTIVE
, cap
)) {
3182 fuse_log(FUSE_LOG_ERR
, "%s: capng_update failed for '%s'\n",
3191 if (capng_apply(CAPNG_SELECT_BOTH
)) {
3192 fuse_log(FUSE_LOG_ERR
, "%s: capng_apply failed\n", __func__
);
3196 cap
.saved
= capng_save_state();
3198 fuse_log(FUSE_LOG_ERR
, "%s: capng_save_state failed\n", __func__
);
3201 pthread_mutex_unlock(&cap
.mutex
);
3205 * Use chroot as a weaker sandbox for environments where the process is
3206 * launched without CAP_SYS_ADMIN.
3208 static void setup_chroot(struct lo_data
*lo
)
3210 lo
->proc_self_fd
= open("/proc/self/fd", O_PATH
);
3211 if (lo
->proc_self_fd
== -1) {
3212 fuse_log(FUSE_LOG_ERR
, "open(\"/proc/self/fd\", O_PATH): %m\n");
3217 * Make the shared directory the file system root so that FUSE_OPEN
3218 * (lo_open()) cannot escape the shared directory by opening a symlink.
3220 * The chroot(2) syscall is later disabled by seccomp and the
3221 * CAP_SYS_CHROOT capability is dropped so that tampering with the chroot
3224 * However, it's still possible to escape the chroot via lo->proc_self_fd
3225 * but that requires first gaining control of the process.
3227 if (chroot(lo
->source
) != 0) {
3228 fuse_log(FUSE_LOG_ERR
, "chroot(\"%s\"): %m\n", lo
->source
);
3232 /* Move into the chroot */
3233 if (chdir("/") != 0) {
3234 fuse_log(FUSE_LOG_ERR
, "chdir(\"/\"): %m\n");
3240 * Lock down this process to prevent access to other processes or files outside
3241 * source directory. This reduces the impact of arbitrary code execution bugs.
3243 static void setup_sandbox(struct lo_data
*lo
, struct fuse_session
*se
,
3246 if (lo
->sandbox
== SANDBOX_NAMESPACE
) {
3247 setup_namespaces(lo
, se
);
3248 setup_mounts(lo
->source
);
3253 setup_seccomp(enable_syslog
);
3254 setup_capabilities(g_strdup(lo
->modcaps
));
3257 /* Set the maximum number of open file descriptors */
3258 static void setup_nofile_rlimit(unsigned long rlimit_nofile
)
3260 struct rlimit rlim
= {
3261 .rlim_cur
= rlimit_nofile
,
3262 .rlim_max
= rlimit_nofile
,
3265 if (rlimit_nofile
== 0) {
3266 return; /* nothing to do */
3269 if (setrlimit(RLIMIT_NOFILE
, &rlim
) < 0) {
3270 /* Ignore SELinux denials */
3271 if (errno
== EPERM
) {
3275 fuse_log(FUSE_LOG_ERR
, "setrlimit(RLIMIT_NOFILE): %m\n");
3280 static void log_func(enum fuse_log_level level
, const char *fmt
, va_list ap
)
3282 g_autofree
char *localfmt
= NULL
;
3284 if (current_log_level
< level
) {
3288 if (current_log_level
== FUSE_LOG_DEBUG
) {
3290 localfmt
= g_strdup_printf("[%" PRId64
"] [ID: %08ld] %s",
3291 get_clock(), syscall(__NR_gettid
), fmt
);
3293 localfmt
= g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid
),
3300 int priority
= LOG_ERR
;
3302 case FUSE_LOG_EMERG
:
3303 priority
= LOG_EMERG
;
3305 case FUSE_LOG_ALERT
:
3306 priority
= LOG_ALERT
;
3309 priority
= LOG_CRIT
;
3314 case FUSE_LOG_WARNING
:
3315 priority
= LOG_WARNING
;
3317 case FUSE_LOG_NOTICE
:
3318 priority
= LOG_NOTICE
;
3321 priority
= LOG_INFO
;
3323 case FUSE_LOG_DEBUG
:
3324 priority
= LOG_DEBUG
;
3327 vsyslog(priority
, fmt
, ap
);
3329 vfprintf(stderr
, fmt
, ap
);
3333 static void setup_root(struct lo_data
*lo
, struct lo_inode
*root
)
3338 fd
= open("/", O_PATH
);
3340 fuse_log(FUSE_LOG_ERR
, "open(%s, O_PATH): %m\n", lo
->source
);
3344 res
= fstatat(fd
, "", &stat
, AT_EMPTY_PATH
| AT_SYMLINK_NOFOLLOW
);
3346 fuse_log(FUSE_LOG_ERR
, "fstatat(%s): %m\n", lo
->source
);
3350 root
->filetype
= S_IFDIR
;
3352 root
->key
.ino
= stat
.st_ino
;
3353 root
->key
.dev
= stat
.st_dev
;
3355 g_atomic_int_set(&root
->refcount
, 2);
3358 static guint
lo_key_hash(gconstpointer key
)
3360 const struct lo_key
*lkey
= key
;
3362 return (guint
)lkey
->ino
+ (guint
)lkey
->dev
;
3365 static gboolean
lo_key_equal(gconstpointer a
, gconstpointer b
)
3367 const struct lo_key
*la
= a
;
3368 const struct lo_key
*lb
= b
;
3370 return la
->ino
== lb
->ino
&& la
->dev
== lb
->dev
;
3373 static void fuse_lo_data_cleanup(struct lo_data
*lo
)
3376 g_hash_table_destroy(lo
->inodes
);
3378 lo_map_destroy(&lo
->fd_map
);
3379 lo_map_destroy(&lo
->dirp_map
);
3380 lo_map_destroy(&lo
->ino_map
);
3382 if (lo
->proc_self_fd
>= 0) {
3383 close(lo
->proc_self_fd
);
3386 if (lo
->root
.fd
>= 0) {
3395 int main(int argc
, char *argv
[])
3397 struct fuse_args args
= FUSE_ARGS_INIT(argc
, argv
);
3398 struct fuse_session
*se
;
3399 struct fuse_cmdline_opts opts
;
3400 struct lo_data lo
= {
3401 .sandbox
= SANDBOX_NAMESPACE
,
3405 .allow_direct_io
= 0,
3408 struct lo_map_elem
*root_elem
;
3411 /* Don't mask creation mode, kernel already did that */
3414 qemu_init_exec_dir(argv
[0]);
3416 pthread_mutex_init(&lo
.mutex
, NULL
);
3417 lo
.inodes
= g_hash_table_new(lo_key_hash
, lo_key_equal
);
3419 lo
.root
.fuse_ino
= FUSE_ROOT_ID
;
3420 lo
.cache
= CACHE_AUTO
;
3423 * Set up the ino map like this:
3424 * [0] Reserved (will not be used)
3427 lo_map_init(&lo
.ino_map
);
3428 lo_map_reserve(&lo
.ino_map
, 0)->in_use
= false;
3429 root_elem
= lo_map_reserve(&lo
.ino_map
, lo
.root
.fuse_ino
);
3430 root_elem
->inode
= &lo
.root
;
3432 lo_map_init(&lo
.dirp_map
);
3433 lo_map_init(&lo
.fd_map
);
3435 if (fuse_parse_cmdline(&args
, &opts
) != 0) {
3438 fuse_set_log_func(log_func
);
3439 use_syslog
= opts
.syslog
;
3441 openlog("virtiofsd", LOG_PID
, LOG_DAEMON
);
3444 if (opts
.show_help
) {
3445 printf("usage: %s [options]\n\n", argv
[0]);
3446 fuse_cmdline_help();
3447 printf(" -o source=PATH shared directory tree\n");
3448 fuse_lowlevel_help();
3451 } else if (opts
.show_version
) {
3452 fuse_lowlevel_version();
3455 } else if (opts
.print_capabilities
) {
3456 print_capabilities();
3461 if (fuse_opt_parse(&args
, &lo
, lo_opts
, NULL
) == -1) {
3465 if (opts
.log_level
!= 0) {
3466 current_log_level
= opts
.log_level
;
3468 /* default log level is INFO */
3469 current_log_level
= FUSE_LOG_INFO
;
3471 lo
.debug
= opts
.debug
;
3473 current_log_level
= FUSE_LOG_DEBUG
;
3479 res
= lstat(lo
.source
, &stat
);
3481 fuse_log(FUSE_LOG_ERR
, "failed to stat source (\"%s\"): %m\n",
3485 if (!S_ISDIR(stat
.st_mode
)) {
3486 fuse_log(FUSE_LOG_ERR
, "source is not a directory\n");
3490 lo
.source
= strdup("/");
3494 parse_xattrmap(&lo
);
3497 if (!lo
.timeout_set
) {
3508 lo
.timeout
= 86400.0;
3511 } else if (lo
.timeout
< 0) {
3512 fuse_log(FUSE_LOG_ERR
, "timeout is negative (%lf)\n", lo
.timeout
);
3516 se
= fuse_session_new(&args
, &lo_oper
, sizeof(lo_oper
), &lo
);
3521 if (fuse_set_signal_handlers(se
) != 0) {
3525 if (fuse_session_mount(se
) != 0) {
3529 fuse_daemonize(opts
.foreground
);
3531 setup_nofile_rlimit(opts
.rlimit_nofile
);
3533 /* Must be before sandbox since it wants /proc */
3536 setup_sandbox(&lo
, se
, opts
.syslog
);
3538 setup_root(&lo
, &lo
.root
);
3539 /* Block until ctrl+c or fusermount -u */
3540 ret
= virtio_loop(se
);
3542 fuse_session_unmount(se
);
3545 fuse_remove_signal_handlers(se
);
3547 fuse_session_destroy(se
);
3549 fuse_opt_free_args(&args
);
3551 fuse_lo_data_cleanup(&lo
);