2 * FUSE: Filesystem in Userspace
3 * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
5 * This program can be distributed under the terms of the GNU GPLv2.
6 * See the file COPYING.
11 * This file system mirrors the existing file system hierarchy of the
12 * system, starting at the root file system. This is implemented by
13 * just "passing through" all requests to the corresponding user-space
14 * libc functions. In contrast to passthrough.c and passthrough_fh.c,
15 * this implementation uses the low-level API. Its performance should
16 * be the least bad among the three, but many operations are not
17 * implemented. In particular, it is not possible to remove files (or
18 * directories) because the code necessary to defer actual removal
19 * until the file is not opened anymore would make the example much
22 * When writeback caching is enabled (-o writeback mount option), it
23 * is only possible to write to files for which the mounting user has
24 * read permissions. This is because the writeback cache requires the
25 * kernel to be able to issue read requests for all files (which the
26 * passthrough filesystem cannot satisfy if it can't read the file in
27 * the underlying filesystem).
31 * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
35 * \include passthrough_ll.c
38 #include "qemu/osdep.h"
39 #include "qemu/timer.h"
40 #include "qemu-version.h"
41 #include "qemu/help-texts.h"
42 #include "fuse_virtio.h"
44 #include "fuse_lowlevel.h"
45 #include "standard-headers/linux/fuse.h"
50 #include <sys/mount.h>
51 #include <sys/prctl.h>
52 #include <sys/resource.h>
53 #include <sys/syscall.h>
55 #include <sys/xattr.h>
59 #include "qemu/cutils.h"
60 #include "passthrough_helpers.h"
61 #include "passthrough_seccomp.h"
63 /* Keep track of inode posix locks for each owner. */
64 struct lo_inode_plock
{
66 int fd
; /* fd for OFD locks */
71 struct lo_inode
*inode
;
79 /* Maps FUSE fh or ino values to internal objects */
81 struct lo_map_elem
*elems
;
96 * Atomic reference count for this object. The nlookup field holds a
97 * reference and releases it when nlookup reaches 0.
104 * This counter keeps the inode alive during the FUSE session.
105 * Incremented when the FUSE inode number is sent in a reply
106 * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is
107 * released by a FUSE_FORGET request.
109 * Note that this value is untrusted because the client can manipulate
110 * it arbitrarily using FUSE_FORGET requests.
112 * Protected by lo->mutex.
117 pthread_mutex_t plock_mutex
;
118 GHashTable
*posix_locks
; /* protected by lo_inode->plock_mutex */
140 typedef struct xattr_map_entry
{
147 pthread_mutex_t mutex
;
155 char *xattr_security_capability
;
162 int readdirplus_clear
;
164 int announce_submounts
;
166 struct lo_inode root
;
167 GHashTable
*inodes
; /* protected by lo->mutex */
168 struct lo_map ino_map
; /* protected by lo->mutex */
169 struct lo_map dirp_map
; /* protected by lo->mutex */
170 struct lo_map fd_map
; /* protected by lo->mutex */
171 XattrMapEntry
*xattr_map_list
;
172 size_t xattr_map_nentries
;
174 /* An O_PATH file descriptor to /proc/self/fd/ */
176 /* An O_PATH file descriptor to /proc/self/task/ */
178 int user_killpriv_v2
, killpriv_v2
;
179 /* If set, virtiofsd is responsible for setting umask during creation */
181 int user_posix_acl
, posix_acl
;
182 /* Keeps track if /proc/<pid>/attr/fscreate should be used or not */
184 int user_security_label
;
187 static const struct fuse_opt lo_opts
[] = {
188 { "sandbox=namespace",
189 offsetof(struct lo_data
, sandbox
),
192 offsetof(struct lo_data
, sandbox
),
194 { "writeback", offsetof(struct lo_data
, writeback
), 1 },
195 { "no_writeback", offsetof(struct lo_data
, writeback
), 0 },
196 { "source=%s", offsetof(struct lo_data
, source
), 0 },
197 { "flock", offsetof(struct lo_data
, flock
), 1 },
198 { "no_flock", offsetof(struct lo_data
, flock
), 0 },
199 { "posix_lock", offsetof(struct lo_data
, posix_lock
), 1 },
200 { "no_posix_lock", offsetof(struct lo_data
, posix_lock
), 0 },
201 { "xattr", offsetof(struct lo_data
, xattr
), 1 },
202 { "no_xattr", offsetof(struct lo_data
, xattr
), 0 },
203 { "xattrmap=%s", offsetof(struct lo_data
, xattrmap
), 0 },
204 { "modcaps=%s", offsetof(struct lo_data
, modcaps
), 0 },
205 { "timeout=%lf", offsetof(struct lo_data
, timeout
), 0 },
206 { "timeout=", offsetof(struct lo_data
, timeout_set
), 1 },
207 { "cache=none", offsetof(struct lo_data
, cache
), CACHE_NONE
},
208 { "cache=auto", offsetof(struct lo_data
, cache
), CACHE_AUTO
},
209 { "cache=always", offsetof(struct lo_data
, cache
), CACHE_ALWAYS
},
210 { "readdirplus", offsetof(struct lo_data
, readdirplus_set
), 1 },
211 { "no_readdirplus", offsetof(struct lo_data
, readdirplus_clear
), 1 },
212 { "allow_direct_io", offsetof(struct lo_data
, allow_direct_io
), 1 },
213 { "no_allow_direct_io", offsetof(struct lo_data
, allow_direct_io
), 0 },
214 { "announce_submounts", offsetof(struct lo_data
, announce_submounts
), 1 },
215 { "killpriv_v2", offsetof(struct lo_data
, user_killpriv_v2
), 1 },
216 { "no_killpriv_v2", offsetof(struct lo_data
, user_killpriv_v2
), 0 },
217 { "posix_acl", offsetof(struct lo_data
, user_posix_acl
), 1 },
218 { "no_posix_acl", offsetof(struct lo_data
, user_posix_acl
), 0 },
219 { "security_label", offsetof(struct lo_data
, user_security_label
), 1 },
220 { "no_security_label", offsetof(struct lo_data
, user_security_label
), 0 },
223 static bool use_syslog
= false;
224 static int current_log_level
;
225 static void unref_inode_lolocked(struct lo_data
*lo
, struct lo_inode
*inode
,
229 pthread_mutex_t mutex
;
232 /* That we loaded cap-ng in the current thread from the saved */
233 static __thread
bool cap_loaded
= 0;
235 static struct lo_inode
*lo_find(struct lo_data
*lo
, struct stat
*st
,
237 static int xattr_map_client(const struct lo_data
*lo
, const char *client_name
,
240 #define FCHDIR_NOFAIL(fd) do { \
241 int fchdir_res = fchdir(fd); \
242 assert(fchdir_res == 0); \
/*
 * Return true when `name` is exactly "." or "..", false for any other
 * string (including the empty string and longer names that merely start
 * with dots).
 */
static bool is_dot_or_dotdot(const char *name)
{
    /* Short-circuiting keeps every index within the NUL-terminated string. */
    return name[0] == '.' &&
           (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'));
}
/*
 * Is `path` a single path component that is not "." or ".."?
 *
 * Returns false when `path` contains a '/' (more than one component) or
 * when it is "." or ".."; returns true otherwise. An empty string is not
 * rejected here.
 */
static bool is_safe_path_component(const char *path)
{
    if (strchr(path, '/')) {
        return false;
    }

    return !is_dot_or_dotdot(path);
}
/* Return true when `name` is the empty string (its first byte is NUL). */
static bool is_empty(const char *name)
{
    return name[0] == '\0';
}
266 static struct lo_data
*lo_data(fuse_req_t req
)
268 return (struct lo_data
*)fuse_req_userdata(req
);
272 * Tries to figure out if /proc/<pid>/attr/fscreate is usable or not. With
273 * selinux=0, read from fscreate returns -EINVAL.
275 * TODO: Link with libselinux and use is_selinux_enabled() instead down
276 * the line. It probably will be more reliable indicator.
278 static bool is_fscreate_usable(struct lo_data
*lo
)
284 sprintf(procname
, "%ld/attr/fscreate", syscall(SYS_gettid
));
285 fscreate_fd
= openat(lo
->proc_self_task
, procname
, O_RDWR
);
286 if (fscreate_fd
== -1) {
290 bytes_read
= read(fscreate_fd
, procname
, 64);
292 if (bytes_read
== -1) {
298 /* Helpers to set/reset fscreate */
299 static int open_set_proc_fscreate(struct lo_data
*lo
, const void *ctx
,
300 size_t ctxlen
, int *fd
)
303 int fscreate_fd
, err
= 0;
306 sprintf(procname
, "%ld/attr/fscreate", syscall(SYS_gettid
));
307 fscreate_fd
= openat(lo
->proc_self_task
, procname
, O_WRONLY
);
308 err
= fscreate_fd
== -1 ? errno
: 0;
313 written
= write(fscreate_fd
, ctx
, ctxlen
);
314 err
= written
== -1 ? errno
: 0;
326 static void close_reset_proc_fscreate(int fd
)
328 if ((write(fd
, NULL
, 0)) == -1) {
329 fuse_log(FUSE_LOG_WARNING
, "Failed to reset fscreate. err=%d\n", errno
);
336 * Load capng's state from our saved state if the current thread
337 * hadn't previously been loaded.
338 * returns 0 on success
340 static int load_capng(void)
343 pthread_mutex_lock(&cap
.mutex
);
344 capng_restore_state(&cap
.saved
);
346 * restore_state frees the saved copy
349 cap
.saved
= capng_save_state();
351 pthread_mutex_unlock(&cap
.mutex
);
352 fuse_log(FUSE_LOG_ERR
, "capng_save_state (thread)\n");
355 pthread_mutex_unlock(&cap
.mutex
);
358 * We want to use the loaded state for our pid,
361 capng_setpid(syscall(SYS_gettid
));
368 * Helpers for dropping and regaining effective capabilities. Returns 0
369 * on success, error otherwise
371 static int drop_effective_cap(const char *cap_name
, bool *cap_dropped
)
375 cap
= capng_name_to_capability(cap_name
);
378 fuse_log(FUSE_LOG_ERR
, "capng_name_to_capability(%s) failed:%s\n",
379 cap_name
, strerror(errno
));
385 fuse_log(FUSE_LOG_ERR
, "load_capng() failed\n");
389 /* We don't have this capability in effective set already. */
390 if (!capng_have_capability(CAPNG_EFFECTIVE
, cap
)) {
395 if (capng_update(CAPNG_DROP
, CAPNG_EFFECTIVE
, cap
)) {
397 fuse_log(FUSE_LOG_ERR
, "capng_update(DROP,) failed\n");
401 if (capng_apply(CAPNG_SELECT_CAPS
)) {
403 fuse_log(FUSE_LOG_ERR
, "drop:capng_apply() failed\n");
416 static int gain_effective_cap(const char *cap_name
)
421 cap
= capng_name_to_capability(cap_name
);
424 fuse_log(FUSE_LOG_ERR
, "capng_name_to_capability(%s) failed:%s\n",
425 cap_name
, strerror(errno
));
431 fuse_log(FUSE_LOG_ERR
, "load_capng() failed\n");
435 if (capng_update(CAPNG_ADD
, CAPNG_EFFECTIVE
, cap
)) {
437 fuse_log(FUSE_LOG_ERR
, "capng_update(ADD,) failed\n");
441 if (capng_apply(CAPNG_SELECT_CAPS
)) {
443 fuse_log(FUSE_LOG_ERR
, "gain:capng_apply() failed\n");
453 * The host kernel normally drops security.capability xattrs on
454 * any write, however if we're remapping xattr names we need to drop
455 * whatever the client's security.capability is actually stored as.
457 static int drop_security_capability(const struct lo_data
*lo
, int fd
)
459 if (!lo
->xattr_security_capability
) {
460 /* We didn't remap the name, let the host kernel do it */
463 if (!fremovexattr(fd
, lo
->xattr_security_capability
)) {
470 /* Attribute didn't exist, that's fine */
474 /* FS didn't support attribute anyway, also fine */
478 /* Hmm other error */
483 static void lo_map_init(struct lo_map
*map
)
490 static void lo_map_destroy(struct lo_map
*map
)
495 static int lo_map_grow(struct lo_map
*map
, size_t new_nelems
)
497 struct lo_map_elem
*new_elems
;
500 if (new_nelems
<= map
->nelems
) {
504 new_elems
= g_try_realloc_n(map
->elems
, new_nelems
, sizeof(map
->elems
[0]));
509 for (i
= map
->nelems
; i
< new_nelems
; i
++) {
510 new_elems
[i
].freelist
= i
+ 1;
511 new_elems
[i
].in_use
= false;
513 new_elems
[new_nelems
- 1].freelist
= -1;
515 map
->elems
= new_elems
;
516 map
->freelist
= map
->nelems
;
517 map
->nelems
= new_nelems
;
521 static struct lo_map_elem
*lo_map_alloc_elem(struct lo_map
*map
)
523 struct lo_map_elem
*elem
;
525 if (map
->freelist
== -1 && !lo_map_grow(map
, map
->nelems
+ 256)) {
529 elem
= &map
->elems
[map
->freelist
];
530 map
->freelist
= elem
->freelist
;
537 static struct lo_map_elem
*lo_map_reserve(struct lo_map
*map
, size_t key
)
541 if (!lo_map_grow(map
, key
+ 1)) {
545 for (prev
= &map
->freelist
; *prev
!= -1;
546 prev
= &map
->elems
[*prev
].freelist
) {
548 struct lo_map_elem
*elem
= &map
->elems
[key
];
550 *prev
= elem
->freelist
;
558 static struct lo_map_elem
*lo_map_get(struct lo_map
*map
, size_t key
)
560 if (key
>= map
->nelems
) {
563 if (!map
->elems
[key
].in_use
) {
566 return &map
->elems
[key
];
569 static void lo_map_remove(struct lo_map
*map
, size_t key
)
571 struct lo_map_elem
*elem
;
573 if (key
>= map
->nelems
) {
577 elem
= &map
->elems
[key
];
582 elem
->in_use
= false;
584 elem
->freelist
= map
->freelist
;
588 /* Assumes lo->mutex is held */
589 static ssize_t
lo_add_fd_mapping(struct lo_data
*lo
, int fd
)
591 struct lo_map_elem
*elem
;
593 elem
= lo_map_alloc_elem(&lo
->fd_map
);
599 return elem
- lo
->fd_map
.elems
;
602 /* Assumes lo->mutex is held */
603 static ssize_t
lo_add_dirp_mapping(fuse_req_t req
, struct lo_dirp
*dirp
)
605 struct lo_map_elem
*elem
;
607 elem
= lo_map_alloc_elem(&lo_data(req
)->dirp_map
);
613 return elem
- lo_data(req
)->dirp_map
.elems
;
616 /* Assumes lo->mutex is held */
617 static ssize_t
lo_add_inode_mapping(fuse_req_t req
, struct lo_inode
*inode
)
619 struct lo_map_elem
*elem
;
621 elem
= lo_map_alloc_elem(&lo_data(req
)->ino_map
);
627 return elem
- lo_data(req
)->ino_map
.elems
;
630 static void lo_inode_put(struct lo_data
*lo
, struct lo_inode
**inodep
)
632 struct lo_inode
*inode
= *inodep
;
640 if (g_atomic_int_dec_and_test(&inode
->refcount
)) {
646 /* Caller must release refcount using lo_inode_put() */
647 static struct lo_inode
*lo_inode(fuse_req_t req
, fuse_ino_t ino
)
649 struct lo_data
*lo
= lo_data(req
);
650 struct lo_map_elem
*elem
;
652 pthread_mutex_lock(&lo
->mutex
);
653 elem
= lo_map_get(&lo
->ino_map
, ino
);
655 g_atomic_int_inc(&elem
->inode
->refcount
);
657 pthread_mutex_unlock(&lo
->mutex
);
667 * TODO Remove this helper and force callers to hold an inode refcount until
668 * they are done with the fd. This will be done in a later patch to make
671 static int lo_fd(fuse_req_t req
, fuse_ino_t ino
)
673 struct lo_inode
*inode
= lo_inode(req
, ino
);
681 lo_inode_put(lo_data(req
), &inode
);
686 * Open a file descriptor for an inode. Returns -EBADF if the inode is not a
687 * regular file or a directory.
689 * Use this helper function instead of raw openat(2) to prevent security issues
690 * when a malicious client opens special files such as block device nodes.
691 * Symlink inodes are also rejected since symlinks must already have been
692 * traversed on the client side.
694 static int lo_inode_open(struct lo_data
*lo
, struct lo_inode
*inode
,
697 g_autofree
char *fd_str
= g_strdup_printf("%d", inode
->fd
);
700 if (!S_ISREG(inode
->filetype
) && !S_ISDIR(inode
->filetype
)) {
705 * The file is a symlink so O_NOFOLLOW must be ignored. We checked earlier
706 * that the inode is not a special file but if an external process races
707 * with us then symlinks are traversed here. It is not possible to escape
708 * the shared directory since it is mounted as "/" though.
710 fd
= openat(lo
->proc_self_fd
, fd_str
, open_flags
& ~O_NOFOLLOW
);
717 static void lo_init(void *userdata
, struct fuse_conn_info
*conn
)
719 struct lo_data
*lo
= (struct lo_data
*)userdata
;
721 if (conn
->capable
& FUSE_CAP_EXPORT_SUPPORT
) {
722 conn
->want
|= FUSE_CAP_EXPORT_SUPPORT
;
725 if (lo
->writeback
&& conn
->capable
& FUSE_CAP_WRITEBACK_CACHE
) {
726 fuse_log(FUSE_LOG_DEBUG
, "lo_init: activating writeback\n");
727 conn
->want
|= FUSE_CAP_WRITEBACK_CACHE
;
729 if (conn
->capable
& FUSE_CAP_FLOCK_LOCKS
) {
731 fuse_log(FUSE_LOG_DEBUG
, "lo_init: activating flock locks\n");
732 conn
->want
|= FUSE_CAP_FLOCK_LOCKS
;
734 fuse_log(FUSE_LOG_DEBUG
, "lo_init: disabling flock locks\n");
735 conn
->want
&= ~FUSE_CAP_FLOCK_LOCKS
;
739 if (conn
->capable
& FUSE_CAP_POSIX_LOCKS
) {
740 if (lo
->posix_lock
) {
741 fuse_log(FUSE_LOG_DEBUG
, "lo_init: activating posix locks\n");
742 conn
->want
|= FUSE_CAP_POSIX_LOCKS
;
744 fuse_log(FUSE_LOG_DEBUG
, "lo_init: disabling posix locks\n");
745 conn
->want
&= ~FUSE_CAP_POSIX_LOCKS
;
749 if ((lo
->cache
== CACHE_NONE
&& !lo
->readdirplus_set
) ||
750 lo
->readdirplus_clear
) {
751 fuse_log(FUSE_LOG_DEBUG
, "lo_init: disabling readdirplus\n");
752 conn
->want
&= ~FUSE_CAP_READDIRPLUS
;
755 if (!(conn
->capable
& FUSE_CAP_SUBMOUNTS
) && lo
->announce_submounts
) {
756 fuse_log(FUSE_LOG_WARNING
, "lo_init: Cannot announce submounts, client "
757 "does not support it\n");
758 lo
->announce_submounts
= false;
761 if (lo
->user_killpriv_v2
== 1) {
763 * User explicitly asked for this option. Enable it unconditionally.
764 * If connection does not have this capability, it should fail
767 fuse_log(FUSE_LOG_DEBUG
, "lo_init: enabling killpriv_v2\n");
768 conn
->want
|= FUSE_CAP_HANDLE_KILLPRIV_V2
;
770 } else if (lo
->user_killpriv_v2
== -1 &&
771 conn
->capable
& FUSE_CAP_HANDLE_KILLPRIV_V2
) {
773 * User did not specify a value for killpriv_v2. By default enable it
774 * if connection offers this capability
776 fuse_log(FUSE_LOG_DEBUG
, "lo_init: enabling killpriv_v2\n");
777 conn
->want
|= FUSE_CAP_HANDLE_KILLPRIV_V2
;
781 * Either user specified to disable killpriv_v2, or connection does
782 * not offer this capability. Disable killpriv_v2 in both the cases
784 fuse_log(FUSE_LOG_DEBUG
, "lo_init: disabling killpriv_v2\n");
785 conn
->want
&= ~FUSE_CAP_HANDLE_KILLPRIV_V2
;
789 if (lo
->user_posix_acl
== 1) {
791 * User explicitly asked for this option. Enable it unconditionally.
792 * If connection does not have this capability, print error message
793 * now. It will fail later in fuse_lowlevel.c
795 if (!(conn
->capable
& FUSE_CAP_POSIX_ACL
) ||
796 !(conn
->capable
& FUSE_CAP_DONT_MASK
) ||
797 !(conn
->capable
& FUSE_CAP_SETXATTR_EXT
)) {
798 fuse_log(FUSE_LOG_ERR
, "lo_init: Can not enable posix acl."
799 " kernel does not support FUSE_POSIX_ACL, FUSE_DONT_MASK"
800 " or FUSE_SETXATTR_EXT capability.\n");
802 fuse_log(FUSE_LOG_DEBUG
, "lo_init: enabling posix acl\n");
805 conn
->want
|= FUSE_CAP_POSIX_ACL
| FUSE_CAP_DONT_MASK
|
806 FUSE_CAP_SETXATTR_EXT
;
807 lo
->change_umask
= true;
808 lo
->posix_acl
= true;
810 /* User either did not specify anything or wants it disabled */
811 fuse_log(FUSE_LOG_DEBUG
, "lo_init: disabling posix_acl\n");
812 conn
->want
&= ~FUSE_CAP_POSIX_ACL
;
815 if (lo
->user_security_label
== 1) {
816 if (!(conn
->capable
& FUSE_CAP_SECURITY_CTX
)) {
817 fuse_log(FUSE_LOG_ERR
, "lo_init: Can not enable security label."
818 " kernel does not support FUSE_SECURITY_CTX capability.\n");
820 conn
->want
|= FUSE_CAP_SECURITY_CTX
;
822 fuse_log(FUSE_LOG_DEBUG
, "lo_init: disabling security label\n");
823 conn
->want
&= ~FUSE_CAP_SECURITY_CTX
;
827 static void lo_getattr(fuse_req_t req
, fuse_ino_t ino
,
828 struct fuse_file_info
*fi
)
832 struct lo_data
*lo
= lo_data(req
);
837 fstatat(lo_fd(req
, ino
), "", &buf
, AT_EMPTY_PATH
| AT_SYMLINK_NOFOLLOW
);
839 return (void)fuse_reply_err(req
, errno
);
842 fuse_reply_attr(req
, &buf
, lo
->timeout
);
845 static int lo_fi_fd(fuse_req_t req
, struct fuse_file_info
*fi
)
847 struct lo_data
*lo
= lo_data(req
);
848 struct lo_map_elem
*elem
;
850 pthread_mutex_lock(&lo
->mutex
);
851 elem
= lo_map_get(&lo
->fd_map
, fi
->fh
);
852 pthread_mutex_unlock(&lo
->mutex
);
861 static void lo_setattr(fuse_req_t req
, fuse_ino_t ino
, struct stat
*attr
,
862 int valid
, struct fuse_file_info
*fi
)
866 struct lo_data
*lo
= lo_data(req
);
867 struct lo_inode
*inode
;
872 inode
= lo_inode(req
, ino
);
874 fuse_reply_err(req
, EBADF
);
880 /* If fi->fh is invalid we'll report EBADF later */
882 fd
= lo_fi_fd(req
, fi
);
885 if (valid
& FUSE_SET_ATTR_MODE
) {
887 res
= fchmod(fd
, attr
->st_mode
);
889 sprintf(procname
, "%i", ifd
);
890 res
= fchmodat(lo
->proc_self_fd
, procname
, attr
->st_mode
, 0);
897 if (valid
& (FUSE_SET_ATTR_UID
| FUSE_SET_ATTR_GID
)) {
898 uid_t uid
= (valid
& FUSE_SET_ATTR_UID
) ? attr
->st_uid
: (uid_t
)-1;
899 gid_t gid
= (valid
& FUSE_SET_ATTR_GID
) ? attr
->st_gid
: (gid_t
)-1;
901 saverr
= drop_security_capability(lo
, ifd
);
906 res
= fchownat(ifd
, "", uid
, gid
, AT_EMPTY_PATH
| AT_SYMLINK_NOFOLLOW
);
912 if (valid
& FUSE_SET_ATTR_SIZE
) {
915 bool cap_fsetid_dropped
= false;
917 kill_suidgid
= lo
->killpriv_v2
&& (valid
& FUSE_SET_ATTR_KILL_SUIDGID
);
921 truncfd
= lo_inode_open(lo
, inode
, O_RDWR
);
928 saverr
= drop_security_capability(lo
, truncfd
);
937 res
= drop_effective_cap("FSETID", &cap_fsetid_dropped
);
947 res
= ftruncate(truncfd
, attr
->st_size
);
948 saverr
= res
== -1 ? errno
: 0;
950 if (cap_fsetid_dropped
) {
951 if (gain_effective_cap("FSETID")) {
952 fuse_log(FUSE_LOG_ERR
, "Failed to gain CAP_FSETID\n");
962 if (valid
& (FUSE_SET_ATTR_ATIME
| FUSE_SET_ATTR_MTIME
)) {
963 struct timespec tv
[2];
967 tv
[0].tv_nsec
= UTIME_OMIT
;
968 tv
[1].tv_nsec
= UTIME_OMIT
;
970 if (valid
& FUSE_SET_ATTR_ATIME_NOW
) {
971 tv
[0].tv_nsec
= UTIME_NOW
;
972 } else if (valid
& FUSE_SET_ATTR_ATIME
) {
973 tv
[0] = attr
->st_atim
;
976 if (valid
& FUSE_SET_ATTR_MTIME_NOW
) {
977 tv
[1].tv_nsec
= UTIME_NOW
;
978 } else if (valid
& FUSE_SET_ATTR_MTIME
) {
979 tv
[1] = attr
->st_mtim
;
983 res
= futimens(fd
, tv
);
985 sprintf(procname
, "%i", inode
->fd
);
986 res
= utimensat(lo
->proc_self_fd
, procname
, tv
, 0);
993 lo_inode_put(lo
, &inode
);
995 return lo_getattr(req
, ino
, fi
);
998 lo_inode_put(lo
, &inode
);
999 fuse_reply_err(req
, saverr
);
1002 static struct lo_inode
*lo_find(struct lo_data
*lo
, struct stat
*st
,
1006 struct lo_key key
= {
1012 pthread_mutex_lock(&lo
->mutex
);
1013 p
= g_hash_table_lookup(lo
->inodes
, &key
);
1015 assert(p
->nlookup
> 0);
1017 g_atomic_int_inc(&p
->refcount
);
1019 pthread_mutex_unlock(&lo
->mutex
);
1024 /* value_destroy_func for posix_locks GHashTable */
1025 static void posix_locks_value_destroy(gpointer data
)
1027 struct lo_inode_plock
*plock
= data
;
1030 * We had used open() for locks and had only one fd. So
1031 * closing this fd should release all OFD locks.
1037 static int do_statx(struct lo_data
*lo
, int dirfd
, const char *pathname
,
1038 struct stat
*statbuf
, int flags
, uint64_t *mnt_id
)
1042 #if defined(CONFIG_STATX) && defined(CONFIG_STATX_MNT_ID)
1043 if (lo
->use_statx
) {
1044 struct statx statxbuf
;
1046 res
= statx(dirfd
, pathname
, flags
, STATX_BASIC_STATS
| STATX_MNT_ID
,
1049 memset(statbuf
, 0, sizeof(*statbuf
));
1050 statbuf
->st_dev
= makedev(statxbuf
.stx_dev_major
,
1051 statxbuf
.stx_dev_minor
);
1052 statbuf
->st_ino
= statxbuf
.stx_ino
;
1053 statbuf
->st_mode
= statxbuf
.stx_mode
;
1054 statbuf
->st_nlink
= statxbuf
.stx_nlink
;
1055 statbuf
->st_uid
= statxbuf
.stx_uid
;
1056 statbuf
->st_gid
= statxbuf
.stx_gid
;
1057 statbuf
->st_rdev
= makedev(statxbuf
.stx_rdev_major
,
1058 statxbuf
.stx_rdev_minor
);
1059 statbuf
->st_size
= statxbuf
.stx_size
;
1060 statbuf
->st_blksize
= statxbuf
.stx_blksize
;
1061 statbuf
->st_blocks
= statxbuf
.stx_blocks
;
1062 statbuf
->st_atim
.tv_sec
= statxbuf
.stx_atime
.tv_sec
;
1063 statbuf
->st_atim
.tv_nsec
= statxbuf
.stx_atime
.tv_nsec
;
1064 statbuf
->st_mtim
.tv_sec
= statxbuf
.stx_mtime
.tv_sec
;
1065 statbuf
->st_mtim
.tv_nsec
= statxbuf
.stx_mtime
.tv_nsec
;
1066 statbuf
->st_ctim
.tv_sec
= statxbuf
.stx_ctime
.tv_sec
;
1067 statbuf
->st_ctim
.tv_nsec
= statxbuf
.stx_ctime
.tv_nsec
;
1069 if (statxbuf
.stx_mask
& STATX_MNT_ID
) {
1070 *mnt_id
= statxbuf
.stx_mnt_id
;
1075 } else if (errno
!= ENOSYS
) {
1078 lo
->use_statx
= false;
1082 res
= fstatat(dirfd
, pathname
, statbuf
, flags
);
1092 * Increments nlookup on the inode on success. unref_inode_lolocked() must be
1093 * called eventually to decrement nlookup again. If inodep is non-NULL, the
1094 * inode pointer is stored and the caller must call lo_inode_put().
1096 static int lo_do_lookup(fuse_req_t req
, fuse_ino_t parent
, const char *name
,
1097 struct fuse_entry_param
*e
,
1098 struct lo_inode
**inodep
)
1104 struct lo_data
*lo
= lo_data(req
);
1105 struct lo_inode
*inode
= NULL
;
1106 struct lo_inode
*dir
= lo_inode(req
, parent
);
1109 *inodep
= NULL
; /* in case there is an error */
1113 * name_to_handle_at() and open_by_handle_at() can reach here with fuse
1114 * mount point in guest, but we don't have its inode info in the
1121 memset(e
, 0, sizeof(*e
));
1122 e
->attr_timeout
= lo
->timeout
;
1123 e
->entry_timeout
= lo
->timeout
;
1125 /* Do not allow escaping root directory */
1126 if (dir
== &lo
->root
&& strcmp(name
, "..") == 0) {
1130 newfd
= openat(dir
->fd
, name
, O_PATH
| O_NOFOLLOW
);
1135 res
= do_statx(lo
, newfd
, "", &e
->attr
, AT_EMPTY_PATH
| AT_SYMLINK_NOFOLLOW
,
1141 if (S_ISDIR(e
->attr
.st_mode
) && lo
->announce_submounts
&&
1142 (e
->attr
.st_dev
!= dir
->key
.dev
|| mnt_id
!= dir
->key
.mnt_id
)) {
1143 e
->attr_flags
|= FUSE_ATTR_SUBMOUNT
;
1146 inode
= lo_find(lo
, &e
->attr
, mnt_id
);
1150 inode
= calloc(1, sizeof(struct lo_inode
));
1155 /* cache only filetype */
1156 inode
->filetype
= (e
->attr
.st_mode
& S_IFMT
);
1159 * One for the caller and one for nlookup (released in
1160 * unref_inode_lolocked())
1162 g_atomic_int_set(&inode
->refcount
, 2);
1166 inode
->key
.ino
= e
->attr
.st_ino
;
1167 inode
->key
.dev
= e
->attr
.st_dev
;
1168 inode
->key
.mnt_id
= mnt_id
;
1169 if (lo
->posix_lock
) {
1170 pthread_mutex_init(&inode
->plock_mutex
, NULL
);
1171 inode
->posix_locks
= g_hash_table_new_full(
1172 g_direct_hash
, g_direct_equal
, NULL
, posix_locks_value_destroy
);
1174 pthread_mutex_lock(&lo
->mutex
);
1175 inode
->fuse_ino
= lo_add_inode_mapping(req
, inode
);
1176 g_hash_table_insert(lo
->inodes
, &inode
->key
, inode
);
1177 pthread_mutex_unlock(&lo
->mutex
);
1179 e
->ino
= inode
->fuse_ino
;
1181 /* Transfer ownership of inode pointer to caller or drop it */
1185 lo_inode_put(lo
, &inode
);
1188 lo_inode_put(lo
, &dir
);
1190 fuse_log(FUSE_LOG_DEBUG
, " %lli/%s -> %lli\n", (unsigned long long)parent
,
1191 name
, (unsigned long long)e
->ino
);
1200 lo_inode_put(lo
, &inode
);
1201 lo_inode_put(lo
, &dir
);
1205 static void lo_lookup(fuse_req_t req
, fuse_ino_t parent
, const char *name
)
1207 struct fuse_entry_param e
;
1210 fuse_log(FUSE_LOG_DEBUG
, "lo_lookup(parent=%" PRIu64
", name=%s)\n", parent
,
1213 if (is_empty(name
)) {
1214 fuse_reply_err(req
, ENOENT
);
1219 * Don't use is_safe_path_component(), allow "." and ".." for NFS export
1222 if (strchr(name
, '/')) {
1223 fuse_reply_err(req
, EINVAL
);
1227 err
= lo_do_lookup(req
, parent
, name
, &e
, NULL
);
1229 fuse_reply_err(req
, err
);
1231 fuse_reply_entry(req
, &e
);
/*
 * On some archs, setres*id is limited to 2^16 but they
 * provide setres*id32 variants that allow 2^32.
 * Others just let setres*id do 2^32 anyway.
 *
 * Pick the 32-bit variant whenever the platform's syscall table
 * defines one, and fall back to the plain syscall number otherwise.
 */
#ifdef SYS_setresgid32
#define OURSYS_setresgid SYS_setresgid32
#else
#define OURSYS_setresgid SYS_setresgid
#endif

#ifdef SYS_setresuid32
#define OURSYS_setresuid SYS_setresuid32
#else
#define OURSYS_setresuid SYS_setresuid
#endif
1252 static void drop_supplementary_groups(void)
1256 ret
= getgroups(0, NULL
);
1258 fuse_log(FUSE_LOG_ERR
, "getgroups() failed with error=%d:%s\n",
1259 errno
, strerror(errno
));
1267 /* Drop all supplementary groups. We should not need it */
1268 ret
= setgroups(0, NULL
);
1270 fuse_log(FUSE_LOG_ERR
, "setgroups() failed with error=%d:%s\n",
1271 errno
, strerror(errno
));
1277 * Change to uid/gid of caller so that file is created with
1278 * ownership of caller.
1279 * TODO: What about selinux context?
1281 static int lo_change_cred(fuse_req_t req
, struct lo_cred
*old
,
1286 old
->euid
= geteuid();
1287 old
->egid
= getegid();
1289 res
= syscall(OURSYS_setresgid
, -1, fuse_req_ctx(req
)->gid
, -1);
1294 res
= syscall(OURSYS_setresuid
, -1, fuse_req_ctx(req
)->uid
, -1);
1296 int errno_save
= errno
;
1298 syscall(OURSYS_setresgid
, -1, old
->egid
, -1);
1303 old
->umask
= umask(req
->ctx
.umask
);
1308 /* Regain Privileges */
1309 static void lo_restore_cred(struct lo_cred
*old
, bool restore_umask
)
1313 res
= syscall(OURSYS_setresuid
, -1, old
->euid
, -1);
1315 fuse_log(FUSE_LOG_ERR
, "seteuid(%u): %m\n", old
->euid
);
1319 res
= syscall(OURSYS_setresgid
, -1, old
->egid
, -1);
1321 fuse_log(FUSE_LOG_ERR
, "setegid(%u): %m\n", old
->egid
);
1330 * A helper to change cred and drop capability. Returns 0 on success and
1333 static int lo_drop_cap_change_cred(fuse_req_t req
, struct lo_cred
*old
,
1334 bool change_umask
, const char *cap_name
,
1342 ret
= drop_effective_cap(cap_name
, &__cap_dropped
);
1347 ret
= lo_change_cred(req
, old
, change_umask
);
1349 if (__cap_dropped
) {
1350 if (gain_effective_cap(cap_name
)) {
1351 fuse_log(FUSE_LOG_ERR
, "Failed to gain CAP_%s\n", cap_name
);
1357 *cap_dropped
= __cap_dropped
;
1362 static void lo_restore_cred_gain_cap(struct lo_cred
*old
, bool restore_umask
,
1363 const char *cap_name
)
1367 lo_restore_cred(old
, restore_umask
);
1369 if (gain_effective_cap(cap_name
)) {
1370 fuse_log(FUSE_LOG_ERR
, "Failed to gain CAP_%s\n", cap_name
);
1374 static int do_mknod_symlink_secctx(fuse_req_t req
, struct lo_inode
*dir
,
1375 const char *name
, const char *secctx_name
)
1379 struct lo_data
*lo
= lo_data(req
);
1381 if (!req
->secctx
.ctxlen
) {
1385 /* Open newly created element with O_PATH */
1386 path_fd
= openat(dir
->fd
, name
, O_PATH
| O_NOFOLLOW
);
1387 err
= path_fd
== -1 ? errno
: 0;
1391 sprintf(procname
, "%i", path_fd
);
1392 FCHDIR_NOFAIL(lo
->proc_self_fd
);
1393 /* Set security context. This is not atomic w.r.t file creation */
1394 err
= setxattr(procname
, secctx_name
, req
->secctx
.ctx
, req
->secctx
.ctxlen
,
1399 FCHDIR_NOFAIL(lo
->root
.fd
);
1404 static int do_mknod_symlink(fuse_req_t req
, struct lo_inode
*dir
,
1405 const char *name
, mode_t mode
, dev_t rdev
,
1408 int err
, fscreate_fd
= -1;
1409 const char *secctx_name
= req
->secctx
.name
;
1410 struct lo_cred old
= {};
1411 struct lo_data
*lo
= lo_data(req
);
1412 char *mapped_name
= NULL
;
1413 bool secctx_enabled
= req
->secctx
.ctxlen
;
1414 bool do_fscreate
= false;
1416 if (secctx_enabled
&& lo
->xattrmap
) {
1417 err
= xattr_map_client(lo
, req
->secctx
.name
, &mapped_name
);
1421 secctx_name
= mapped_name
;
1425 * If security xattr has not been remapped and selinux is enabled on
1426 * host, set fscreate and no need to do a setxattr() after file creation
1428 if (secctx_enabled
&& !mapped_name
&& lo
->use_fscreate
) {
1430 err
= open_set_proc_fscreate(lo
, req
->secctx
.ctx
, req
->secctx
.ctxlen
,
1437 err
= lo_change_cred(req
, &old
, lo
->change_umask
&& !S_ISLNK(mode
));
1442 err
= mknod_wrapper(dir
->fd
, name
, link
, mode
, rdev
);
1443 err
= err
== -1 ? errno
: 0;
1444 lo_restore_cred(&old
, lo
->change_umask
&& !S_ISLNK(mode
));
1450 err
= do_mknod_symlink_secctx(req
, dir
, name
, secctx_name
);
1452 unlinkat(dir
->fd
, name
, S_ISDIR(mode
) ? AT_REMOVEDIR
: 0);
1456 if (fscreate_fd
!= -1) {
1457 close_reset_proc_fscreate(fscreate_fd
);
1459 g_free(mapped_name
);
1463 static void lo_mknod_symlink(fuse_req_t req
, fuse_ino_t parent
,
1464 const char *name
, mode_t mode
, dev_t rdev
,
1468 struct lo_data
*lo
= lo_data(req
);
1469 struct lo_inode
*dir
;
1470 struct fuse_entry_param e
;
1472 if (is_empty(name
)) {
1473 fuse_reply_err(req
, ENOENT
);
1477 if (!is_safe_path_component(name
)) {
1478 fuse_reply_err(req
, EINVAL
);
1482 dir
= lo_inode(req
, parent
);
1484 fuse_reply_err(req
, EBADF
);
1488 saverr
= do_mknod_symlink(req
, dir
, name
, mode
, rdev
, link
);
1493 saverr
= lo_do_lookup(req
, parent
, name
, &e
, NULL
);
1498 fuse_log(FUSE_LOG_DEBUG
, " %lli/%s -> %lli\n", (unsigned long long)parent
,
1499 name
, (unsigned long long)e
.ino
);
1501 fuse_reply_entry(req
, &e
);
1502 lo_inode_put(lo
, &dir
);
1506 lo_inode_put(lo
, &dir
);
1507 fuse_reply_err(req
, saverr
);
1510 static void lo_mknod(fuse_req_t req
, fuse_ino_t parent
, const char *name
,
1511 mode_t mode
, dev_t rdev
)
1513 lo_mknod_symlink(req
, parent
, name
, mode
, rdev
, NULL
);
1516 static void lo_mkdir(fuse_req_t req
, fuse_ino_t parent
, const char *name
,
1519 lo_mknod_symlink(req
, parent
, name
, S_IFDIR
| mode
, 0, NULL
);
1522 static void lo_symlink(fuse_req_t req
, const char *link
, fuse_ino_t parent
,
1525 lo_mknod_symlink(req
, parent
, name
, S_IFLNK
, 0, link
);
1528 static void lo_link(fuse_req_t req
, fuse_ino_t ino
, fuse_ino_t parent
,
1532 struct lo_data
*lo
= lo_data(req
);
1533 struct lo_inode
*parent_inode
;
1534 struct lo_inode
*inode
;
1535 struct fuse_entry_param e
;
1539 if (is_empty(name
)) {
1540 fuse_reply_err(req
, ENOENT
);
1544 if (!is_safe_path_component(name
)) {
1545 fuse_reply_err(req
, EINVAL
);
1549 parent_inode
= lo_inode(req
, parent
);
1550 inode
= lo_inode(req
, ino
);
1551 if (!parent_inode
|| !inode
) {
1556 memset(&e
, 0, sizeof(struct fuse_entry_param
));
1557 e
.attr_timeout
= lo
->timeout
;
1558 e
.entry_timeout
= lo
->timeout
;
1560 sprintf(procname
, "%i", inode
->fd
);
1561 res
= linkat(lo
->proc_self_fd
, procname
, parent_inode
->fd
, name
,
1567 res
= fstatat(inode
->fd
, "", &e
.attr
, AT_EMPTY_PATH
| AT_SYMLINK_NOFOLLOW
);
1572 pthread_mutex_lock(&lo
->mutex
);
1574 pthread_mutex_unlock(&lo
->mutex
);
1575 e
.ino
= inode
->fuse_ino
;
1577 fuse_log(FUSE_LOG_DEBUG
, " %lli/%s -> %lli\n", (unsigned long long)parent
,
1578 name
, (unsigned long long)e
.ino
);
1580 fuse_reply_entry(req
, &e
);
1581 lo_inode_put(lo
, &parent_inode
);
1582 lo_inode_put(lo
, &inode
);
1587 lo_inode_put(lo
, &parent_inode
);
1588 lo_inode_put(lo
, &inode
);
1589 fuse_reply_err(req
, saverr
);
1592 /* Increments nlookup and caller must release refcount using lo_inode_put() */
1593 static struct lo_inode
*lookup_name(fuse_req_t req
, fuse_ino_t parent
,
1599 struct lo_data
*lo
= lo_data(req
);
1600 struct lo_inode
*dir
= lo_inode(req
, parent
);
1606 res
= do_statx(lo
, dir
->fd
, name
, &attr
, AT_SYMLINK_NOFOLLOW
, &mnt_id
);
1607 lo_inode_put(lo
, &dir
);
1612 return lo_find(lo
, &attr
, mnt_id
);
1615 static void lo_rmdir(fuse_req_t req
, fuse_ino_t parent
, const char *name
)
1618 struct lo_inode
*inode
;
1619 struct lo_data
*lo
= lo_data(req
);
1621 if (is_empty(name
)) {
1622 fuse_reply_err(req
, ENOENT
);
1626 if (!is_safe_path_component(name
)) {
1627 fuse_reply_err(req
, EINVAL
);
1631 inode
= lookup_name(req
, parent
, name
);
1633 fuse_reply_err(req
, EIO
);
1637 res
= unlinkat(lo_fd(req
, parent
), name
, AT_REMOVEDIR
);
1639 fuse_reply_err(req
, res
== -1 ? errno
: 0);
1640 unref_inode_lolocked(lo
, inode
, 1);
1641 lo_inode_put(lo
, &inode
);
1644 static void lo_rename(fuse_req_t req
, fuse_ino_t parent
, const char *name
,
1645 fuse_ino_t newparent
, const char *newname
,
1649 struct lo_inode
*parent_inode
;
1650 struct lo_inode
*newparent_inode
;
1651 struct lo_inode
*oldinode
= NULL
;
1652 struct lo_inode
*newinode
= NULL
;
1653 struct lo_data
*lo
= lo_data(req
);
1655 if (is_empty(name
) || is_empty(newname
)) {
1656 fuse_reply_err(req
, ENOENT
);
1660 if (!is_safe_path_component(name
) || !is_safe_path_component(newname
)) {
1661 fuse_reply_err(req
, EINVAL
);
1665 parent_inode
= lo_inode(req
, parent
);
1666 newparent_inode
= lo_inode(req
, newparent
);
1667 if (!parent_inode
|| !newparent_inode
) {
1668 fuse_reply_err(req
, EBADF
);
1672 oldinode
= lookup_name(req
, parent
, name
);
1673 newinode
= lookup_name(req
, newparent
, newname
);
1676 fuse_reply_err(req
, EIO
);
1681 #ifndef SYS_renameat2
1682 fuse_reply_err(req
, EINVAL
);
1684 res
= syscall(SYS_renameat2
, parent_inode
->fd
, name
,
1685 newparent_inode
->fd
, newname
, flags
);
1686 if (res
== -1 && errno
== ENOSYS
) {
1687 fuse_reply_err(req
, EINVAL
);
1689 fuse_reply_err(req
, res
== -1 ? errno
: 0);
1695 res
= renameat(parent_inode
->fd
, name
, newparent_inode
->fd
, newname
);
1697 fuse_reply_err(req
, res
== -1 ? errno
: 0);
1699 unref_inode_lolocked(lo
, oldinode
, 1);
1700 unref_inode_lolocked(lo
, newinode
, 1);
1701 lo_inode_put(lo
, &oldinode
);
1702 lo_inode_put(lo
, &newinode
);
1703 lo_inode_put(lo
, &parent_inode
);
1704 lo_inode_put(lo
, &newparent_inode
);
1707 static void lo_unlink(fuse_req_t req
, fuse_ino_t parent
, const char *name
)
1710 struct lo_inode
*inode
;
1711 struct lo_data
*lo
= lo_data(req
);
1713 if (is_empty(name
)) {
1714 fuse_reply_err(req
, ENOENT
);
1718 if (!is_safe_path_component(name
)) {
1719 fuse_reply_err(req
, EINVAL
);
1723 inode
= lookup_name(req
, parent
, name
);
1725 fuse_reply_err(req
, EIO
);
1729 res
= unlinkat(lo_fd(req
, parent
), name
, 0);
1731 fuse_reply_err(req
, res
== -1 ? errno
: 0);
1732 unref_inode_lolocked(lo
, inode
, 1);
1733 lo_inode_put(lo
, &inode
);
1736 /* To be called with lo->mutex held */
1737 static void unref_inode(struct lo_data
*lo
, struct lo_inode
*inode
, uint64_t n
)
1743 assert(inode
->nlookup
>= n
);
1744 inode
->nlookup
-= n
;
1745 if (!inode
->nlookup
) {
1746 lo_map_remove(&lo
->ino_map
, inode
->fuse_ino
);
1747 g_hash_table_remove(lo
->inodes
, &inode
->key
);
1748 if (lo
->posix_lock
) {
1749 if (g_hash_table_size(inode
->posix_locks
)) {
1750 fuse_log(FUSE_LOG_WARNING
, "Hash table is not empty\n");
1752 g_hash_table_destroy(inode
->posix_locks
);
1753 pthread_mutex_destroy(&inode
->plock_mutex
);
1755 /* Drop our refcount from lo_do_lookup() */
1756 lo_inode_put(lo
, &inode
);
1760 static void unref_inode_lolocked(struct lo_data
*lo
, struct lo_inode
*inode
,
1767 pthread_mutex_lock(&lo
->mutex
);
1768 unref_inode(lo
, inode
, n
);
1769 pthread_mutex_unlock(&lo
->mutex
);
1772 static void lo_forget_one(fuse_req_t req
, fuse_ino_t ino
, uint64_t nlookup
)
1774 struct lo_data
*lo
= lo_data(req
);
1775 struct lo_inode
*inode
;
1777 inode
= lo_inode(req
, ino
);
1782 fuse_log(FUSE_LOG_DEBUG
, " forget %lli %lli -%lli\n",
1783 (unsigned long long)ino
, (unsigned long long)inode
->nlookup
,
1784 (unsigned long long)nlookup
);
1786 unref_inode_lolocked(lo
, inode
, nlookup
);
1787 lo_inode_put(lo
, &inode
);
1790 static void lo_forget(fuse_req_t req
, fuse_ino_t ino
, uint64_t nlookup
)
1792 lo_forget_one(req
, ino
, nlookup
);
1793 fuse_reply_none(req
);
1796 static void lo_forget_multi(fuse_req_t req
, size_t count
,
1797 struct fuse_forget_data
*forgets
)
1801 for (i
= 0; i
< count
; i
++) {
1802 lo_forget_one(req
, forgets
[i
].ino
, forgets
[i
].nlookup
);
1804 fuse_reply_none(req
);
1807 static void lo_readlink(fuse_req_t req
, fuse_ino_t ino
)
1809 char buf
[PATH_MAX
+ 1];
1812 res
= readlinkat(lo_fd(req
, ino
), "", buf
, sizeof(buf
));
1814 return (void)fuse_reply_err(req
, errno
);
1817 if (res
== sizeof(buf
)) {
1818 return (void)fuse_reply_err(req
, ENAMETOOLONG
);
1823 fuse_reply_readlink(req
, buf
);
1829 struct dirent
*entry
;
1833 static void lo_dirp_put(struct lo_dirp
**dp
)
1835 struct lo_dirp
*d
= *dp
;
1842 if (g_atomic_int_dec_and_test(&d
->refcount
)) {
1848 /* Call lo_dirp_put() on the return value when no longer needed */
1849 static struct lo_dirp
*lo_dirp(fuse_req_t req
, struct fuse_file_info
*fi
)
1851 struct lo_data
*lo
= lo_data(req
);
1852 struct lo_map_elem
*elem
;
1854 pthread_mutex_lock(&lo
->mutex
);
1855 elem
= lo_map_get(&lo
->dirp_map
, fi
->fh
);
1857 g_atomic_int_inc(&elem
->dirp
->refcount
);
1859 pthread_mutex_unlock(&lo
->mutex
);
1867 static void lo_opendir(fuse_req_t req
, fuse_ino_t ino
,
1868 struct fuse_file_info
*fi
)
1871 struct lo_data
*lo
= lo_data(req
);
1876 d
= calloc(1, sizeof(struct lo_dirp
));
1881 fd
= openat(lo_fd(req
, ino
), ".", O_RDONLY
);
1886 d
->dp
= fdopendir(fd
);
1887 if (d
->dp
== NULL
) {
1894 g_atomic_int_set(&d
->refcount
, 1); /* paired with lo_releasedir() */
1895 pthread_mutex_lock(&lo
->mutex
);
1896 fh
= lo_add_dirp_mapping(req
, d
);
1897 pthread_mutex_unlock(&lo
->mutex
);
1903 if (lo
->cache
== CACHE_ALWAYS
) {
1904 fi
->cache_readdir
= 1;
1906 fuse_reply_open(req
, fi
);
1915 } else if (fd
!= -1) {
1920 fuse_reply_err(req
, error
);
1923 static void lo_do_readdir(fuse_req_t req
, fuse_ino_t ino
, size_t size
,
1924 off_t offset
, struct fuse_file_info
*fi
, int plus
)
1926 struct lo_data
*lo
= lo_data(req
);
1927 struct lo_dirp
*d
= NULL
;
1928 struct lo_inode
*dinode
;
1929 g_autofree
char *buf
= NULL
;
1934 dinode
= lo_inode(req
, ino
);
1939 d
= lo_dirp(req
, fi
);
1945 buf
= g_try_malloc0(size
);
1951 if (offset
!= d
->offset
) {
1952 seekdir(d
->dp
, offset
);
1963 d
->entry
= readdir(d
->dp
);
1965 if (errno
) { /* Error */
1968 } else { /* End of stream */
1973 nextoff
= d
->entry
->d_off
;
1974 name
= d
->entry
->d_name
;
1976 fuse_ino_t entry_ino
= 0;
1977 struct fuse_entry_param e
= (struct fuse_entry_param
){
1978 .attr
.st_ino
= d
->entry
->d_ino
,
1979 .attr
.st_mode
= d
->entry
->d_type
<< 12,
1982 /* Hide root's parent directory */
1983 if (dinode
== &lo
->root
&& strcmp(name
, "..") == 0) {
1984 e
.attr
.st_ino
= lo
->root
.key
.ino
;
1985 e
.attr
.st_mode
= DT_DIR
<< 12;
1989 if (!is_dot_or_dotdot(name
)) {
1990 err
= lo_do_lookup(req
, ino
, name
, &e
, NULL
);
1997 entsize
= fuse_add_direntry_plus(req
, p
, rem
, name
, &e
, nextoff
);
1999 entsize
= fuse_add_direntry(req
, p
, rem
, name
, &e
.attr
, nextoff
);
2001 if (entsize
> rem
) {
2002 if (entry_ino
!= 0) {
2003 lo_forget_one(req
, entry_ino
, 1);
2012 d
->offset
= nextoff
;
2018 lo_inode_put(lo
, &dinode
);
2021 * If there's an error, we can only signal it if we haven't stored
2022 * any entries yet - otherwise we'd end up with wrong lookup
2023 * counts for the entries that are already in the buffer. So we
2024 * return what we've collected until that point.
2026 if (err
&& rem
== size
) {
2027 fuse_reply_err(req
, err
);
2029 fuse_reply_buf(req
, buf
, size
- rem
);
2033 static void lo_readdir(fuse_req_t req
, fuse_ino_t ino
, size_t size
,
2034 off_t offset
, struct fuse_file_info
*fi
)
2036 lo_do_readdir(req
, ino
, size
, offset
, fi
, 0);
2039 static void lo_readdirplus(fuse_req_t req
, fuse_ino_t ino
, size_t size
,
2040 off_t offset
, struct fuse_file_info
*fi
)
2042 lo_do_readdir(req
, ino
, size
, offset
, fi
, 1);
2045 static void lo_releasedir(fuse_req_t req
, fuse_ino_t ino
,
2046 struct fuse_file_info
*fi
)
2048 struct lo_data
*lo
= lo_data(req
);
2049 struct lo_map_elem
*elem
;
2054 pthread_mutex_lock(&lo
->mutex
);
2055 elem
= lo_map_get(&lo
->dirp_map
, fi
->fh
);
2057 pthread_mutex_unlock(&lo
->mutex
);
2058 fuse_reply_err(req
, EBADF
);
2063 lo_map_remove(&lo
->dirp_map
, fi
->fh
);
2064 pthread_mutex_unlock(&lo
->mutex
);
2066 lo_dirp_put(&d
); /* paired with lo_opendir() */
2068 fuse_reply_err(req
, 0);
2071 static void update_open_flags(int writeback
, int allow_direct_io
,
2072 struct fuse_file_info
*fi
)
2075 * With writeback cache, kernel may send read requests even
2076 * when userspace opened write-only
2078 if (writeback
&& (fi
->flags
& O_ACCMODE
) == O_WRONLY
) {
2079 fi
->flags
&= ~O_ACCMODE
;
2080 fi
->flags
|= O_RDWR
;
2084 * With writeback cache, O_APPEND is handled by the kernel.
2085 * This breaks atomicity (since the file may change in the
2086 * underlying filesystem, so that the kernel's idea of the
2087 * end of the file isn't accurate anymore). In this example,
2088 * we just accept that. A more rigorous filesystem may want
2089 * to return an error here
2091 if (writeback
&& (fi
->flags
& O_APPEND
)) {
2092 fi
->flags
&= ~O_APPEND
;
2096 * O_DIRECT in guest should not necessarily mean bypassing page
2097 * cache on host as well. Therefore, we discard it by default
2098 * ('-o no_allow_direct_io'). If somebody needs that behavior,
2099 * the '-o allow_direct_io' option should be set.
2101 if (!allow_direct_io
) {
2102 fi
->flags
&= ~O_DIRECT
;
2107 * Open a regular file, set up an fd mapping, and fill out the struct
2108 * fuse_file_info for it. If existing_fd is not negative, use that fd instead
2109 * opening a new one. Takes ownership of existing_fd.
2111 * Returns 0 on success or a positive errno.
2113 static int lo_do_open(struct lo_data
*lo
, struct lo_inode
*inode
,
2114 int existing_fd
, struct fuse_file_info
*fi
)
2117 int fd
= existing_fd
;
2119 bool cap_fsetid_dropped
= false;
2120 bool kill_suidgid
= lo
->killpriv_v2
&& fi
->kill_priv
;
2122 update_open_flags(lo
->writeback
, lo
->allow_direct_io
, fi
);
2126 err
= drop_effective_cap("FSETID", &cap_fsetid_dropped
);
2132 fd
= lo_inode_open(lo
, inode
, fi
->flags
);
2134 if (cap_fsetid_dropped
) {
2135 if (gain_effective_cap("FSETID")) {
2136 fuse_log(FUSE_LOG_ERR
, "Failed to gain CAP_FSETID\n");
2142 if (fi
->flags
& (O_TRUNC
)) {
2143 int err
= drop_security_capability(lo
, fd
);
2151 pthread_mutex_lock(&lo
->mutex
);
2152 fh
= lo_add_fd_mapping(lo
, fd
);
2153 pthread_mutex_unlock(&lo
->mutex
);
2160 if (lo
->cache
== CACHE_NONE
) {
2162 } else if (lo
->cache
== CACHE_ALWAYS
) {
2168 static int do_create_nosecctx(fuse_req_t req
, struct lo_inode
*parent_inode
,
2169 const char *name
, mode_t mode
,
2170 struct fuse_file_info
*fi
, int *open_fd
,
2174 struct lo_cred old
= {};
2175 struct lo_data
*lo
= lo_data(req
);
2179 flags
= fi
->flags
| O_TMPFILE
;
2181 * Don't use O_EXCL as we want to link file later. Also reset O_CREAT
2182 * otherwise openat() returns -EINVAL.
2184 flags
&= ~(O_CREAT
| O_EXCL
);
2186 /* O_TMPFILE needs either O_RDWR or O_WRONLY */
2187 if ((flags
& O_ACCMODE
) == O_RDONLY
) {
2191 flags
= fi
->flags
| O_CREAT
| O_EXCL
;
2194 err
= lo_change_cred(req
, &old
, lo
->change_umask
);
2199 /* Try to create a new file but don't open existing files */
2200 fd
= openat(parent_inode
->fd
, name
, flags
, mode
);
2201 err
= fd
== -1 ? errno
: 0;
2202 lo_restore_cred(&old
, lo
->change_umask
);
2209 static int do_create_secctx_fscreate(fuse_req_t req
,
2210 struct lo_inode
*parent_inode
,
2211 const char *name
, mode_t mode
,
2212 struct fuse_file_info
*fi
, int *open_fd
)
2214 int err
= 0, fd
= -1, fscreate_fd
= -1;
2215 struct lo_data
*lo
= lo_data(req
);
2217 err
= open_set_proc_fscreate(lo
, req
->secctx
.ctx
, req
->secctx
.ctxlen
,
2223 err
= do_create_nosecctx(req
, parent_inode
, name
, mode
, fi
, &fd
, false);
2225 close_reset_proc_fscreate(fscreate_fd
);
2232 static int do_create_secctx_tmpfile(fuse_req_t req
,
2233 struct lo_inode
*parent_inode
,
2234 const char *name
, mode_t mode
,
2235 struct fuse_file_info
*fi
,
2236 const char *secctx_name
, int *open_fd
)
2239 struct lo_data
*lo
= lo_data(req
);
2242 err
= do_create_nosecctx(req
, parent_inode
, ".", mode
, fi
, &fd
, true);
2247 err
= fsetxattr(fd
, secctx_name
, req
->secctx
.ctx
, req
->secctx
.ctxlen
, 0);
2253 /* Security context set on file. Link it in place */
2254 sprintf(procname
, "%d", fd
);
2255 FCHDIR_NOFAIL(lo
->proc_self_fd
);
2256 err
= linkat(AT_FDCWD
, procname
, parent_inode
->fd
, name
,
2258 err
= err
== -1 ? errno
: 0;
2259 FCHDIR_NOFAIL(lo
->root
.fd
);
2264 } else if (fd
!= -1) {
2270 static int do_create_secctx_noatomic(fuse_req_t req
,
2271 struct lo_inode
*parent_inode
,
2272 const char *name
, mode_t mode
,
2273 struct fuse_file_info
*fi
,
2274 const char *secctx_name
, int *open_fd
)
2276 int err
= 0, fd
= -1;
2278 err
= do_create_nosecctx(req
, parent_inode
, name
, mode
, fi
, &fd
, false);
2283 /* Set security context. This is not atomic w.r.t file creation */
2284 err
= fsetxattr(fd
, secctx_name
, req
->secctx
.ctx
, req
->secctx
.ctxlen
, 0);
2285 err
= err
== -1 ? errno
: 0;
2292 unlinkat(parent_inode
->fd
, name
, 0);
2298 static int do_lo_create(fuse_req_t req
, struct lo_inode
*parent_inode
,
2299 const char *name
, mode_t mode
,
2300 struct fuse_file_info
*fi
, int *open_fd
)
2302 struct lo_data
*lo
= lo_data(req
);
2303 char *mapped_name
= NULL
;
2305 const char *ctxname
= req
->secctx
.name
;
2306 bool secctx_enabled
= req
->secctx
.ctxlen
;
2308 if (secctx_enabled
&& lo
->xattrmap
) {
2309 err
= xattr_map_client(lo
, req
->secctx
.name
, &mapped_name
);
2314 ctxname
= mapped_name
;
2317 if (secctx_enabled
) {
2319 * If security.selinux has not been remapped and selinux is enabled,
2320 * use fscreate to set context before file creation. If not, use
2321 * tmpfile method for regular files. Otherwise fallback to
2322 * non-atomic method of file creation and xattr settting.
2324 if (!mapped_name
&& lo
->use_fscreate
) {
2325 err
= do_create_secctx_fscreate(req
, parent_inode
, name
, mode
, fi
,
2328 } else if (S_ISREG(mode
)) {
2329 err
= do_create_secctx_tmpfile(req
, parent_inode
, name
, mode
, fi
,
2332 * If filesystem does not support O_TMPFILE, fallback to non-atomic
2335 if (!err
|| err
!= EOPNOTSUPP
) {
2340 err
= do_create_secctx_noatomic(req
, parent_inode
, name
, mode
, fi
,
2343 err
= do_create_nosecctx(req
, parent_inode
, name
, mode
, fi
, open_fd
,
2348 g_free(mapped_name
);
2352 static void lo_create(fuse_req_t req
, fuse_ino_t parent
, const char *name
,
2353 mode_t mode
, struct fuse_file_info
*fi
)
2356 struct lo_data
*lo
= lo_data(req
);
2357 struct lo_inode
*parent_inode
;
2358 struct lo_inode
*inode
= NULL
;
2359 struct fuse_entry_param e
;
2362 fuse_log(FUSE_LOG_DEBUG
, "lo_create(parent=%" PRIu64
", name=%s)"
2363 " kill_priv=%d\n", parent
, name
, fi
->kill_priv
);
2365 if (!is_safe_path_component(name
)) {
2366 fuse_reply_err(req
, EINVAL
);
2370 parent_inode
= lo_inode(req
, parent
);
2371 if (!parent_inode
) {
2372 fuse_reply_err(req
, EBADF
);
2376 update_open_flags(lo
->writeback
, lo
->allow_direct_io
, fi
);
2378 err
= do_lo_create(req
, parent_inode
, name
, mode
, fi
, &fd
);
2380 /* Ignore the error if file exists and O_EXCL was not given */
2381 if (err
&& (err
!= EEXIST
|| (fi
->flags
& O_EXCL
))) {
2385 err
= lo_do_lookup(req
, parent
, name
, &e
, &inode
);
2390 err
= lo_do_open(lo
, inode
, fd
, fi
);
2391 fd
= -1; /* lo_do_open() takes ownership of fd */
2393 /* Undo lo_do_lookup() nlookup ref */
2394 unref_inode_lolocked(lo
, inode
, 1);
2398 lo_inode_put(lo
, &inode
);
2399 lo_inode_put(lo
, &parent_inode
);
2406 fuse_reply_err(req
, err
);
2408 fuse_reply_create(req
, &e
, fi
);
2412 /* Should be called with inode->plock_mutex held */
2413 static struct lo_inode_plock
*lookup_create_plock_ctx(struct lo_data
*lo
,
2414 struct lo_inode
*inode
,
2415 uint64_t lock_owner
,
2416 pid_t pid
, int *err
)
2418 struct lo_inode_plock
*plock
;
2422 g_hash_table_lookup(inode
->posix_locks
, GUINT_TO_POINTER(lock_owner
));
2428 plock
= malloc(sizeof(struct lo_inode_plock
));
2434 /* Open another instance of file which can be used for ofd locks. */
2435 /* TODO: What if file is not writable? */
2436 fd
= lo_inode_open(lo
, inode
, O_RDWR
);
2443 plock
->lock_owner
= lock_owner
;
2445 g_hash_table_insert(inode
->posix_locks
, GUINT_TO_POINTER(plock
->lock_owner
),
2450 static void lo_getlk(fuse_req_t req
, fuse_ino_t ino
, struct fuse_file_info
*fi
,
2453 struct lo_data
*lo
= lo_data(req
);
2454 struct lo_inode
*inode
;
2455 struct lo_inode_plock
*plock
;
2456 int ret
, saverr
= 0;
2458 fuse_log(FUSE_LOG_DEBUG
,
2459 "lo_getlk(ino=%" PRIu64
", flags=%d)"
2460 " owner=0x%" PRIx64
", l_type=%d l_start=0x%" PRIx64
2461 " l_len=0x%" PRIx64
"\n",
2462 ino
, fi
->flags
, fi
->lock_owner
, lock
->l_type
,
2463 (uint64_t)lock
->l_start
, (uint64_t)lock
->l_len
);
2465 if (!lo
->posix_lock
) {
2466 fuse_reply_err(req
, ENOSYS
);
2470 inode
= lo_inode(req
, ino
);
2472 fuse_reply_err(req
, EBADF
);
2476 pthread_mutex_lock(&inode
->plock_mutex
);
2478 lookup_create_plock_ctx(lo
, inode
, fi
->lock_owner
, lock
->l_pid
, &ret
);
2484 ret
= fcntl(plock
->fd
, F_OFD_GETLK
, lock
);
2490 pthread_mutex_unlock(&inode
->plock_mutex
);
2491 lo_inode_put(lo
, &inode
);
2494 fuse_reply_err(req
, saverr
);
2496 fuse_reply_lock(req
, lock
);
2500 static void lo_setlk(fuse_req_t req
, fuse_ino_t ino
, struct fuse_file_info
*fi
,
2501 struct flock
*lock
, int sleep
)
2503 struct lo_data
*lo
= lo_data(req
);
2504 struct lo_inode
*inode
;
2505 struct lo_inode_plock
*plock
;
2506 int ret
, saverr
= 0;
2508 fuse_log(FUSE_LOG_DEBUG
,
2509 "lo_setlk(ino=%" PRIu64
", flags=%d)"
2510 " cmd=%d pid=%d owner=0x%" PRIx64
" sleep=%d l_whence=%d"
2511 " l_start=0x%" PRIx64
" l_len=0x%" PRIx64
"\n",
2512 ino
, fi
->flags
, lock
->l_type
, lock
->l_pid
, fi
->lock_owner
, sleep
,
2513 lock
->l_whence
, (uint64_t)lock
->l_start
, (uint64_t)lock
->l_len
);
2515 if (!lo
->posix_lock
) {
2516 fuse_reply_err(req
, ENOSYS
);
2521 fuse_reply_err(req
, EOPNOTSUPP
);
2525 inode
= lo_inode(req
, ino
);
2527 fuse_reply_err(req
, EBADF
);
2531 pthread_mutex_lock(&inode
->plock_mutex
);
2533 lookup_create_plock_ctx(lo
, inode
, fi
->lock_owner
, lock
->l_pid
, &ret
);
2540 /* TODO: Is it alright to modify flock? */
2542 ret
= fcntl(plock
->fd
, F_OFD_SETLK
, lock
);
2548 pthread_mutex_unlock(&inode
->plock_mutex
);
2549 lo_inode_put(lo
, &inode
);
2551 fuse_reply_err(req
, saverr
);
2554 static void lo_fsyncdir(fuse_req_t req
, fuse_ino_t ino
, int datasync
,
2555 struct fuse_file_info
*fi
)
2563 d
= lo_dirp(req
, fi
);
2565 fuse_reply_err(req
, EBADF
);
2571 res
= fdatasync(fd
);
2578 fuse_reply_err(req
, res
== -1 ? errno
: 0);
2581 static void lo_open(fuse_req_t req
, fuse_ino_t ino
, struct fuse_file_info
*fi
)
2583 struct lo_data
*lo
= lo_data(req
);
2584 struct lo_inode
*inode
= lo_inode(req
, ino
);
2587 fuse_log(FUSE_LOG_DEBUG
, "lo_open(ino=%" PRIu64
", flags=%d, kill_priv=%d)"
2588 "\n", ino
, fi
->flags
, fi
->kill_priv
);
2591 fuse_reply_err(req
, EBADF
);
2595 err
= lo_do_open(lo
, inode
, -1, fi
);
2596 lo_inode_put(lo
, &inode
);
2598 fuse_reply_err(req
, err
);
2600 fuse_reply_open(req
, fi
);
2604 static void lo_release(fuse_req_t req
, fuse_ino_t ino
,
2605 struct fuse_file_info
*fi
)
2607 struct lo_data
*lo
= lo_data(req
);
2608 struct lo_map_elem
*elem
;
2613 pthread_mutex_lock(&lo
->mutex
);
2614 elem
= lo_map_get(&lo
->fd_map
, fi
->fh
);
2618 lo_map_remove(&lo
->fd_map
, fi
->fh
);
2620 pthread_mutex_unlock(&lo
->mutex
);
2623 fuse_reply_err(req
, 0);
2626 static void lo_flush(fuse_req_t req
, fuse_ino_t ino
, struct fuse_file_info
*fi
)
2630 struct lo_inode
*inode
;
2631 struct lo_data
*lo
= lo_data(req
);
2633 inode
= lo_inode(req
, ino
);
2635 fuse_reply_err(req
, EBADF
);
2639 if (!S_ISREG(inode
->filetype
)) {
2640 lo_inode_put(lo
, &inode
);
2641 fuse_reply_err(req
, EBADF
);
2645 /* An fd is going away. Cleanup associated posix locks */
2646 if (lo
->posix_lock
) {
2647 pthread_mutex_lock(&inode
->plock_mutex
);
2648 g_hash_table_remove(inode
->posix_locks
,
2649 GUINT_TO_POINTER(fi
->lock_owner
));
2650 pthread_mutex_unlock(&inode
->plock_mutex
);
2652 res
= close(dup(lo_fi_fd(req
, fi
)));
2653 lo_inode_put(lo
, &inode
);
2654 fuse_reply_err(req
, res
== -1 ? errno
: 0);
2657 static void lo_fsync(fuse_req_t req
, fuse_ino_t ino
, int datasync
,
2658 struct fuse_file_info
*fi
)
2660 struct lo_inode
*inode
= lo_inode(req
, ino
);
2661 struct lo_data
*lo
= lo_data(req
);
2665 fuse_log(FUSE_LOG_DEBUG
, "lo_fsync(ino=%" PRIu64
", fi=0x%p)\n", ino
,
2669 fuse_reply_err(req
, EBADF
);
2674 fd
= lo_inode_open(lo
, inode
, O_RDWR
);
2680 fd
= lo_fi_fd(req
, fi
);
2684 res
= fdatasync(fd
) == -1 ? errno
: 0;
2686 res
= fsync(fd
) == -1 ? errno
: 0;
2692 lo_inode_put(lo
, &inode
);
2693 fuse_reply_err(req
, res
);
2696 static void lo_read(fuse_req_t req
, fuse_ino_t ino
, size_t size
, off_t offset
,
2697 struct fuse_file_info
*fi
)
2699 struct fuse_bufvec buf
= FUSE_BUFVEC_INIT(size
);
2701 fuse_log(FUSE_LOG_DEBUG
,
2702 "lo_read(ino=%" PRIu64
", size=%zd, "
2704 ino
, size
, (unsigned long)offset
);
2706 buf
.buf
[0].flags
= FUSE_BUF_IS_FD
| FUSE_BUF_FD_SEEK
;
2707 buf
.buf
[0].fd
= lo_fi_fd(req
, fi
);
2708 buf
.buf
[0].pos
= offset
;
2710 fuse_reply_data(req
, &buf
);
2713 static void lo_write_buf(fuse_req_t req
, fuse_ino_t ino
,
2714 struct fuse_bufvec
*in_buf
, off_t off
,
2715 struct fuse_file_info
*fi
)
2719 struct fuse_bufvec out_buf
= FUSE_BUFVEC_INIT(fuse_buf_size(in_buf
));
2720 bool cap_fsetid_dropped
= false;
2722 out_buf
.buf
[0].flags
= FUSE_BUF_IS_FD
| FUSE_BUF_FD_SEEK
;
2723 out_buf
.buf
[0].fd
= lo_fi_fd(req
, fi
);
2724 out_buf
.buf
[0].pos
= off
;
2726 fuse_log(FUSE_LOG_DEBUG
,
2727 "lo_write_buf(ino=%" PRIu64
", size=%zd, off=%lu kill_priv=%d)\n",
2728 ino
, out_buf
.buf
[0].size
, (unsigned long)off
, fi
->kill_priv
);
2730 res
= drop_security_capability(lo_data(req
), out_buf
.buf
[0].fd
);
2732 fuse_reply_err(req
, res
);
2737 * If kill_priv is set, drop CAP_FSETID which should lead to kernel
2738 * clearing setuid/setgid on file. Note, for WRITE, we need to do
2739 * this even if killpriv_v2 is not enabled. fuse direct write path
2742 if (fi
->kill_priv
) {
2743 res
= drop_effective_cap("FSETID", &cap_fsetid_dropped
);
2745 fuse_reply_err(req
, res
);
2750 res
= fuse_buf_copy(&out_buf
, in_buf
);
2752 fuse_reply_err(req
, -res
);
2754 fuse_reply_write(req
, (size_t)res
);
2757 if (cap_fsetid_dropped
) {
2758 res
= gain_effective_cap("FSETID");
2760 fuse_log(FUSE_LOG_ERR
, "Failed to gain CAP_FSETID\n");
2765 static void lo_statfs(fuse_req_t req
, fuse_ino_t ino
)
2768 struct statvfs stbuf
;
2770 res
= fstatvfs(lo_fd(req
, ino
), &stbuf
);
2772 fuse_reply_err(req
, errno
);
2774 fuse_reply_statfs(req
, &stbuf
);
2778 static void lo_fallocate(fuse_req_t req
, fuse_ino_t ino
, int mode
, off_t offset
,
2779 off_t length
, struct fuse_file_info
*fi
)
2781 int err
= EOPNOTSUPP
;
2784 #ifdef CONFIG_FALLOCATE
2785 err
= fallocate(lo_fi_fd(req
, fi
), mode
, offset
, length
);
2790 #elif defined(CONFIG_POSIX_FALLOCATE)
2792 fuse_reply_err(req
, EOPNOTSUPP
);
2796 err
= posix_fallocate(lo_fi_fd(req
, fi
), offset
, length
);
2799 fuse_reply_err(req
, err
);
2802 static void lo_flock(fuse_req_t req
, fuse_ino_t ino
, struct fuse_file_info
*fi
,
2808 if (!(op
& LOCK_NB
)) {
2810 * Blocking flock can deadlock as there is only one thread
2811 * serving the queue.
2813 fuse_reply_err(req
, EOPNOTSUPP
);
2817 res
= flock(lo_fi_fd(req
, fi
), op
);
2819 fuse_reply_err(req
, res
== -1 ? errno
: 0);
2824 * Exit; process attribute unmodified if matched.
2825 * An empty key applies to all.
2827 #define XATTR_MAP_FLAG_OK (1 << 0)
2829 * The attribute is unwanted;
2830 * EPERM on write, hidden on read.
2832 #define XATTR_MAP_FLAG_BAD (1 << 1)
2834 * For attr that start with 'key' prepend 'prepend'
2835 * 'key' may be empty to prepend for all attrs
2836 * key is defined from set/remove point of view.
2837 * Automatically reversed on read
2839 #define XATTR_MAP_FLAG_PREFIX (1 << 2)
2841 * The attribute is unsupported;
2842 * ENOTSUP on write, hidden on read.
2844 #define XATTR_MAP_FLAG_UNSUPPORTED (1 << 3)
2847 /* Apply rule to get/set/remove */
2848 #define XATTR_MAP_FLAG_CLIENT (1 << 16)
2849 /* Apply rule to list */
2850 #define XATTR_MAP_FLAG_SERVER (1 << 17)
2851 /* Apply rule to all */
2852 #define XATTR_MAP_FLAG_ALL (XATTR_MAP_FLAG_SERVER | XATTR_MAP_FLAG_CLIENT)
2854 static void add_xattrmap_entry(struct lo_data
*lo
,
2855 const XattrMapEntry
*new_entry
)
2857 XattrMapEntry
*res
= g_realloc_n(lo
->xattr_map_list
,
2858 lo
->xattr_map_nentries
+ 1,
2859 sizeof(XattrMapEntry
));
2860 res
[lo
->xattr_map_nentries
++] = *new_entry
;
2862 lo
->xattr_map_list
= res
;
2865 static void free_xattrmap(struct lo_data
*lo
)
2867 XattrMapEntry
*map
= lo
->xattr_map_list
;
2874 for (i
= 0; i
< lo
->xattr_map_nentries
; i
++) {
2876 g_free(map
[i
].prepend
);
2880 lo
->xattr_map_list
= NULL
;
2881 lo
->xattr_map_nentries
= -1;
2885 * Handle the 'map' type, which is sugar for a set of commands
2886 * for the common case of prefixing a subset or everything,
2887 * and allowing anything not prefixed through.
2888 * It must be the last entry in the stream, although there
2889 * can be other entries before it.
2893 * key maybe empty in which case all entries are prefixed.
2895 static void parse_xattrmap_map(struct lo_data
*lo
,
2896 const char *rule
, char sep
)
2901 XattrMapEntry tmp_entry
;
2904 fuse_log(FUSE_LOG_ERR
,
2905 "%s: Expecting '%c' after 'map' keyword, found '%c'\n",
2906 __func__
, sep
, *rule
);
2912 /* At start of 'key' field */
2913 tmp
= strchr(rule
, sep
);
2915 fuse_log(FUSE_LOG_ERR
,
2916 "%s: Missing '%c' at end of key field in map rule\n",
2921 key
= g_strndup(rule
, tmp
- rule
);
2924 /* At start of prefix field */
2925 tmp
= strchr(rule
, sep
);
2927 fuse_log(FUSE_LOG_ERR
,
2928 "%s: Missing '%c' at end of prefix field in map rule\n",
2933 prefix
= g_strndup(rule
, tmp
- rule
);
2937 * This should be the end of the string, we don't allow
2938 * any more commands after 'map'.
2941 fuse_log(FUSE_LOG_ERR
,
2942 "%s: Expecting end of command after map, found '%c'\n",
2947 /* 1st: Prefix matches/everything */
2948 tmp_entry
.flags
= XATTR_MAP_FLAG_PREFIX
| XATTR_MAP_FLAG_ALL
;
2949 tmp_entry
.key
= g_strdup(key
);
2950 tmp_entry
.prepend
= g_strdup(prefix
);
2951 add_xattrmap_entry(lo
, &tmp_entry
);
2954 /* Prefix all case */
2956 /* 2nd: Hide any non-prefixed entries on the host */
2957 tmp_entry
.flags
= XATTR_MAP_FLAG_BAD
| XATTR_MAP_FLAG_ALL
;
2958 tmp_entry
.key
= g_strdup("");
2959 tmp_entry
.prepend
= g_strdup("");
2960 add_xattrmap_entry(lo
, &tmp_entry
);
2962 /* Prefix matching case */
2964 /* 2nd: Hide non-prefixed but matching entries on the host */
2965 tmp_entry
.flags
= XATTR_MAP_FLAG_BAD
| XATTR_MAP_FLAG_SERVER
;
2966 tmp_entry
.key
= g_strdup(""); /* Not used */
2967 tmp_entry
.prepend
= g_strdup(key
);
2968 add_xattrmap_entry(lo
, &tmp_entry
);
2970 /* 3rd: Stop the client accessing prefixed attributes directly */
2971 tmp_entry
.flags
= XATTR_MAP_FLAG_BAD
| XATTR_MAP_FLAG_CLIENT
;
2972 tmp_entry
.key
= g_strdup(prefix
);
2973 tmp_entry
.prepend
= g_strdup(""); /* Not used */
2974 add_xattrmap_entry(lo
, &tmp_entry
);
2976 /* 4th: Everything else is OK */
2977 tmp_entry
.flags
= XATTR_MAP_FLAG_OK
| XATTR_MAP_FLAG_ALL
;
2978 tmp_entry
.key
= g_strdup("");
2979 tmp_entry
.prepend
= g_strdup("");
2980 add_xattrmap_entry(lo
, &tmp_entry
);
2987 static void parse_xattrmap(struct lo_data
*lo
)
2989 const char *map
= lo
->xattrmap
;
2993 lo
->xattr_map_nentries
= 0;
2995 XattrMapEntry tmp_entry
;
2998 if (isspace(*map
)) {
3002 /* The separator is the first non-space of the rule */
3008 tmp_entry
.flags
= 0;
3009 /* Start of 'type' */
3010 if (strstart(map
, "prefix", &map
)) {
3011 tmp_entry
.flags
|= XATTR_MAP_FLAG_PREFIX
;
3012 } else if (strstart(map
, "ok", &map
)) {
3013 tmp_entry
.flags
|= XATTR_MAP_FLAG_OK
;
3014 } else if (strstart(map
, "bad", &map
)) {
3015 tmp_entry
.flags
|= XATTR_MAP_FLAG_BAD
;
3016 } else if (strstart(map
, "unsupported", &map
)) {
3017 tmp_entry
.flags
|= XATTR_MAP_FLAG_UNSUPPORTED
;
3018 } else if (strstart(map
, "map", &map
)) {
3020 * map is sugar that adds a number of rules, and must be
3023 parse_xattrmap_map(lo
, map
, sep
);
3026 fuse_log(FUSE_LOG_ERR
,
3027 "%s: Unexpected type;"
3028 "Expecting 'prefix', 'ok', 'bad', 'unsupported' or 'map'"
3029 " in rule %zu\n", __func__
, lo
->xattr_map_nentries
);
3033 if (*map
++ != sep
) {
3034 fuse_log(FUSE_LOG_ERR
,
3035 "%s: Missing '%c' at end of type field of rule %zu\n",
3036 __func__
, sep
, lo
->xattr_map_nentries
);
3040 /* Start of 'scope' */
3041 if (strstart(map
, "client", &map
)) {
3042 tmp_entry
.flags
|= XATTR_MAP_FLAG_CLIENT
;
3043 } else if (strstart(map
, "server", &map
)) {
3044 tmp_entry
.flags
|= XATTR_MAP_FLAG_SERVER
;
3045 } else if (strstart(map
, "all", &map
)) {
3046 tmp_entry
.flags
|= XATTR_MAP_FLAG_ALL
;
3048 fuse_log(FUSE_LOG_ERR
,
3049 "%s: Unexpected scope;"
3050 " Expecting 'client', 'server', or 'all', in rule %zu\n",
3051 __func__
, lo
->xattr_map_nentries
);
3055 if (*map
++ != sep
) {
3056 fuse_log(FUSE_LOG_ERR
,
3057 "%s: Expecting '%c' found '%c'"
3058 " after scope in rule %zu\n",
3059 __func__
, sep
, *map
, lo
->xattr_map_nentries
);
3063 /* At start of 'key' field */
3064 tmp
= strchr(map
, sep
);
3066 fuse_log(FUSE_LOG_ERR
,
3067 "%s: Missing '%c' at end of key field of rule %zu",
3068 __func__
, sep
, lo
->xattr_map_nentries
);
3071 tmp_entry
.key
= g_strndup(map
, tmp
- map
);
3074 /* At start of 'prepend' field */
3075 tmp
= strchr(map
, sep
);
3077 fuse_log(FUSE_LOG_ERR
,
3078 "%s: Missing '%c' at end of prepend field of rule %zu",
3079 __func__
, sep
, lo
->xattr_map_nentries
);
3082 tmp_entry
.prepend
= g_strndup(map
, tmp
- map
);
3085 add_xattrmap_entry(lo
, &tmp_entry
);
3086 /* End of rule - go around again for another rule */
3089 if (!lo
->xattr_map_nentries
) {
3090 fuse_log(FUSE_LOG_ERR
, "Empty xattr map\n");
3094 ret
= xattr_map_client(lo
, "security.capability",
3095 &lo
->xattr_security_capability
);
3097 fuse_log(FUSE_LOG_ERR
, "Failed to map security.capability: %s\n",
3101 if (!lo
->xattr_security_capability
||
3102 !strcmp(lo
->xattr_security_capability
, "security.capability")) {
3103 /* 1-1 mapping, don't need to do anything */
3104 free(lo
->xattr_security_capability
);
3105 lo
->xattr_security_capability
= NULL
;
3110 * For use with getxattr/setxattr/removexattr, where the client
3111 * gives us a name and we may need to choose a different one.
3112 * Allocates a buffer for the result placing it in *out_name.
3113 * If there's no change then *out_name is not set.
3114 * Returns 0 on success
3115 * Can return -EPERM to indicate we block a given attribute
3116 * (in which case out_name is not allocated)
3117 * Can return -ENOMEM to indicate out_name couldn't be allocated.
3119 static int xattr_map_client(const struct lo_data
*lo
, const char *client_name
,
3123 for (i
= 0; i
< lo
->xattr_map_nentries
; i
++) {
3124 const XattrMapEntry
*cur_entry
= lo
->xattr_map_list
+ i
;
3126 if ((cur_entry
->flags
& XATTR_MAP_FLAG_CLIENT
) &&
3127 (strstart(client_name
, cur_entry
->key
, NULL
))) {
3128 if (cur_entry
->flags
& XATTR_MAP_FLAG_BAD
) {
3131 if (cur_entry
->flags
& XATTR_MAP_FLAG_UNSUPPORTED
) {
3134 if (cur_entry
->flags
& XATTR_MAP_FLAG_OK
) {
3135 /* Unmodified name */
3138 if (cur_entry
->flags
& XATTR_MAP_FLAG_PREFIX
) {
3139 *out_name
= g_try_malloc(strlen(client_name
) +
3140 strlen(cur_entry
->prepend
) + 1);
3144 sprintf(*out_name
, "%s%s", cur_entry
->prepend
, client_name
);
3154 * For use with listxattr where the server fs gives us a name and we may need
3155 * to sanitize this for the client.
3156 * Returns a pointer to the result in *out_name
3157 * This is always the original string or the current string with some prefix
3158 * removed; no reallocation is done.
3159 * Returns 0 on success
3160 * Can return -ENODATA to indicate the name should be dropped from the list.
3162 static int xattr_map_server(const struct lo_data
*lo
, const char *server_name
,
3163 const char **out_name
)
3168 for (i
= 0; i
< lo
->xattr_map_nentries
; i
++) {
3169 const XattrMapEntry
*cur_entry
= lo
->xattr_map_list
+ i
;
3171 if ((cur_entry
->flags
& XATTR_MAP_FLAG_SERVER
) &&
3172 (strstart(server_name
, cur_entry
->prepend
, &end
))) {
3173 if (cur_entry
->flags
& XATTR_MAP_FLAG_BAD
||
3174 cur_entry
->flags
& XATTR_MAP_FLAG_UNSUPPORTED
) {
3177 if (cur_entry
->flags
& XATTR_MAP_FLAG_OK
) {
3178 *out_name
= server_name
;
3181 if (cur_entry
->flags
& XATTR_MAP_FLAG_PREFIX
) {
3192 static bool block_xattr(struct lo_data
*lo
, const char *name
)
3195 * If user explicitly enabled posix_acl or did not provide any option,
3196 * do not block acl. Otherwise block system.posix_acl_access and
3197 * system.posix_acl_default xattrs.
3199 if (lo
->user_posix_acl
) {
3202 if (!strcmp(name
, "system.posix_acl_access") ||
3203 !strcmp(name
, "system.posix_acl_default"))
3210 * Returns number of bytes in xattr_list after filtering on success. This
3211 * could be zero as well if nothing is left after filtering.
3213 * Returns negative error code on failure.
3214 * xattr_list is modified in place.
3216 static int remove_blocked_xattrs(struct lo_data
*lo
, char *xattr_list
,
3219 size_t out_index
, in_index
;
3222 * As of now we only filter out acl xattrs. If acls are enabled or
3223 * they have not been explicitly disabled, there is nothing to
3226 if (lo
->user_posix_acl
) {
3232 while (in_index
< in_size
) {
3233 char *in_ptr
= xattr_list
+ in_index
;
3235 /* Length of current attribute name */
3236 size_t in_len
= strlen(xattr_list
+ in_index
) + 1;
3238 if (!block_xattr(lo
, in_ptr
)) {
3239 if (in_index
!= out_index
) {
3240 memmove(xattr_list
+ out_index
, xattr_list
+ in_index
, in_len
);
3242 out_index
+= in_len
;
3249 static void lo_getxattr(fuse_req_t req
, fuse_ino_t ino
, const char *in_name
,
3252 struct lo_data
*lo
= lo_data(req
);
3253 g_autofree
char *value
= NULL
;
3257 struct lo_inode
*inode
;
3262 if (block_xattr(lo
, in_name
)) {
3263 fuse_reply_err(req
, EOPNOTSUPP
);
3270 ret
= xattr_map_client(lo
, in_name
, &mapped_name
);
3272 if (ret
== -EPERM
) {
3275 fuse_reply_err(req
, -ret
);
3283 inode
= lo_inode(req
, ino
);
3285 fuse_reply_err(req
, EBADF
);
3286 g_free(mapped_name
);
3291 if (!lo_data(req
)->xattr
) {
3295 fuse_log(FUSE_LOG_DEBUG
, "lo_getxattr(ino=%" PRIu64
", name=%s size=%zd)\n",
3299 value
= g_try_malloc(size
);
3305 sprintf(procname
, "%i", inode
->fd
);
3307 * It is not safe to open() non-regular/non-dir files in file server
3308 * unless O_PATH is used, so use that method for regular files/dir
3309 * only (as it seems giving less performance overhead).
3310 * Otherwise, call fchdir() to avoid open().
3312 if (S_ISREG(inode
->filetype
) || S_ISDIR(inode
->filetype
)) {
3313 fd
= openat(lo
->proc_self_fd
, procname
, O_RDONLY
);
3317 ret
= fgetxattr(fd
, name
, value
, size
);
3318 saverr
= ret
== -1 ? errno
: 0;
3320 /* fchdir should not fail here */
3321 FCHDIR_NOFAIL(lo
->proc_self_fd
);
3322 ret
= getxattr(procname
, name
, value
, size
);
3323 saverr
= ret
== -1 ? errno
: 0;
3324 FCHDIR_NOFAIL(lo
->root
.fd
);
3335 fuse_reply_buf(req
, value
, ret
);
3337 fuse_reply_xattr(req
, ret
);
3344 lo_inode_put(lo
, &inode
);
3350 fuse_reply_err(req
, saverr
);
3351 g_free(mapped_name
);
3355 static void lo_listxattr(fuse_req_t req
, fuse_ino_t ino
, size_t size
)
3357 struct lo_data
*lo
= lo_data(req
);
3358 g_autofree
char *value
= NULL
;
3360 struct lo_inode
*inode
;
3365 inode
= lo_inode(req
, ino
);
3367 fuse_reply_err(req
, EBADF
);
3372 if (!lo_data(req
)->xattr
) {
3376 fuse_log(FUSE_LOG_DEBUG
, "lo_listxattr(ino=%" PRIu64
", size=%zd)\n", ino
,
3380 value
= g_try_malloc(size
);
3386 sprintf(procname
, "%i", inode
->fd
);
3387 if (S_ISREG(inode
->filetype
) || S_ISDIR(inode
->filetype
)) {
3388 fd
= openat(lo
->proc_self_fd
, procname
, O_RDONLY
);
3392 ret
= flistxattr(fd
, value
, size
);
3393 saverr
= ret
== -1 ? errno
: 0;
3395 /* fchdir should not fail here */
3396 FCHDIR_NOFAIL(lo
->proc_self_fd
);
3397 ret
= listxattr(procname
, value
, size
);
3398 saverr
= ret
== -1 ? errno
: 0;
3399 FCHDIR_NOFAIL(lo
->root
.fd
);
3411 if (lo
->xattr_map_list
) {
3413 * Map the names back, some attributes might be dropped,
3414 * some shortened, but not increased, so we shouldn't
3417 size_t out_index
, in_index
;
3420 while (in_index
< ret
) {
3421 const char *map_out
;
3422 char *in_ptr
= value
+ in_index
;
3423 /* Length of current attribute name */
3424 size_t in_len
= strlen(value
+ in_index
) + 1;
3426 int mapret
= xattr_map_server(lo
, in_ptr
, &map_out
);
3427 if (mapret
!= -ENODATA
&& mapret
!= 0) {
3428 /* Shouldn't happen */
3433 /* Either unchanged, or truncated */
3435 if (map_out
!= in_ptr
) {
3436 /* +1 copies the NIL */
3437 out_len
= strlen(map_out
) + 1;
3443 * Move result along, may still be needed for an unchanged
3444 * entry if a previous entry was changed.
3446 memmove(value
+ out_index
, map_out
, out_len
);
3448 out_index
+= out_len
;
3458 ret
= remove_blocked_xattrs(lo
, value
, ret
);
3463 fuse_reply_buf(req
, value
, ret
);
3466 * xattrmap only ever shortens the result,
3467 * so we don't need to do anything clever with the
3468 * allocation length here.
3470 fuse_reply_xattr(req
, ret
);
3477 lo_inode_put(lo
, &inode
);
3483 fuse_reply_err(req
, saverr
);
3487 static void lo_setxattr(fuse_req_t req
, fuse_ino_t ino
, const char *in_name
,
3488 const char *value
, size_t size
, int flags
,
3489 uint32_t extra_flags
)
3494 struct lo_data
*lo
= lo_data(req
);
3495 struct lo_inode
*inode
;
3499 bool switched_creds
= false;
3500 bool cap_fsetid_dropped
= false;
3501 struct lo_cred old
= {};
3503 if (block_xattr(lo
, in_name
)) {
3504 fuse_reply_err(req
, EOPNOTSUPP
);
3511 ret
= xattr_map_client(lo
, in_name
, &mapped_name
);
3513 fuse_reply_err(req
, -ret
);
3521 inode
= lo_inode(req
, ino
);
3523 fuse_reply_err(req
, EBADF
);
3524 g_free(mapped_name
);
3529 if (!lo_data(req
)->xattr
) {
3533 fuse_log(FUSE_LOG_DEBUG
, "lo_setxattr(ino=%" PRIu64
3534 ", name=%s value=%s size=%zd)\n", ino
, name
, value
, size
);
3536 sprintf(procname
, "%i", inode
->fd
);
3538 * If we are setting posix access acl and if SGID needs to be
3539 * cleared, then switch to caller's gid and drop CAP_FSETID
3540 * and that should make sure host kernel clears SGID.
3542 * This probably will not work when we support idmapped mounts.
3543 * In that case we will need to find a non-root gid and switch
3544 * to it. (Instead of gid in request). Fix it when we support
3547 if (lo
->posix_acl
&& !strcmp(name
, "system.posix_acl_access")
3548 && (extra_flags
& FUSE_SETXATTR_ACL_KILL_SGID
)) {
3549 ret
= lo_drop_cap_change_cred(req
, &old
, false, "FSETID",
3550 &cap_fsetid_dropped
);
3555 switched_creds
= true;
3557 if (S_ISREG(inode
->filetype
) || S_ISDIR(inode
->filetype
)) {
3558 fd
= openat(lo
->proc_self_fd
, procname
, O_RDONLY
);
3563 ret
= fsetxattr(fd
, name
, value
, size
, flags
);
3564 saverr
= ret
== -1 ? errno
: 0;
3566 /* fchdir should not fail here */
3567 FCHDIR_NOFAIL(lo
->proc_self_fd
);
3568 ret
= setxattr(procname
, name
, value
, size
, flags
);
3569 saverr
= ret
== -1 ? errno
: 0;
3570 FCHDIR_NOFAIL(lo
->root
.fd
);
3572 if (switched_creds
) {
3573 if (cap_fsetid_dropped
)
3574 lo_restore_cred_gain_cap(&old
, false, "FSETID");
3576 lo_restore_cred(&old
, false);
3584 lo_inode_put(lo
, &inode
);
3585 g_free(mapped_name
);
3586 fuse_reply_err(req
, saverr
);
3589 static void lo_removexattr(fuse_req_t req
, fuse_ino_t ino
, const char *in_name
)
3594 struct lo_data
*lo
= lo_data(req
);
3595 struct lo_inode
*inode
;
3600 if (block_xattr(lo
, in_name
)) {
3601 fuse_reply_err(req
, EOPNOTSUPP
);
3608 ret
= xattr_map_client(lo
, in_name
, &mapped_name
);
3610 fuse_reply_err(req
, -ret
);
3618 inode
= lo_inode(req
, ino
);
3620 fuse_reply_err(req
, EBADF
);
3621 g_free(mapped_name
);
3626 if (!lo_data(req
)->xattr
) {
3630 fuse_log(FUSE_LOG_DEBUG
, "lo_removexattr(ino=%" PRIu64
", name=%s)\n", ino
,
3633 sprintf(procname
, "%i", inode
->fd
);
3634 if (S_ISREG(inode
->filetype
) || S_ISDIR(inode
->filetype
)) {
3635 fd
= openat(lo
->proc_self_fd
, procname
, O_RDONLY
);
3640 ret
= fremovexattr(fd
, name
);
3641 saverr
= ret
== -1 ? errno
: 0;
3643 /* fchdir should not fail here */
3644 FCHDIR_NOFAIL(lo
->proc_self_fd
);
3645 ret
= removexattr(procname
, name
);
3646 saverr
= ret
== -1 ? errno
: 0;
3647 FCHDIR_NOFAIL(lo
->root
.fd
);
3655 lo_inode_put(lo
, &inode
);
3656 g_free(mapped_name
);
3657 fuse_reply_err(req
, saverr
);
#ifdef HAVE_COPY_FILE_RANGE
/*
 * FUSE copy_file_range handler: resolve both file handles to descriptors
 * with lo_fi_fd() and forward the request to copy_file_range(2).
 *
 * Replies with the value returned by copy_file_range() via
 * fuse_reply_write(), or with errno via fuse_reply_err() when the call
 * returns a negative value.
 */
static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
                               struct fuse_file_info *fi_in, fuse_ino_t ino_out,
                               off_t off_out, struct fuse_file_info *fi_out,
                               size_t len, int flags)
{
    int src_fd = lo_fi_fd(req, fi_in);
    int dst_fd = lo_fi_fd(req, fi_out);
    ssize_t copied;

    fuse_log(FUSE_LOG_DEBUG,
             "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, "
             "off=%ju, ino=%" PRIu64 "/fd=%d, "
             "off=%ju, size=%zd, flags=0x%x)\n",
             ino_in, src_fd, (intmax_t)off_in,
             ino_out, dst_fd, (intmax_t)off_out, len, flags);

    copied = copy_file_range(src_fd, &off_in, dst_fd, &off_out, len, flags);
    if (copied < 0) {
        fuse_reply_err(req, errno);
        return;
    }

    fuse_reply_write(req, copied);
}
#endif
3688 static void lo_lseek(fuse_req_t req
, fuse_ino_t ino
, off_t off
, int whence
,
3689 struct fuse_file_info
*fi
)
3694 res
= lseek(lo_fi_fd(req
, fi
), off
, whence
);
3696 fuse_reply_lseek(req
, res
);
3698 fuse_reply_err(req
, errno
);
3702 static int lo_do_syncfs(struct lo_data
*lo
, struct lo_inode
*inode
)
3706 fuse_log(FUSE_LOG_DEBUG
, "lo_do_syncfs(ino=%" PRIu64
")\n",
3709 fd
= lo_inode_open(lo
, inode
, O_RDONLY
);
3714 if (syncfs(fd
) < 0) {
3722 static void lo_syncfs(fuse_req_t req
, fuse_ino_t ino
)
3724 struct lo_data
*lo
= lo_data(req
);
3725 struct lo_inode
*inode
= lo_inode(req
, ino
);
3729 fuse_reply_err(req
, EBADF
);
3733 err
= lo_do_syncfs(lo
, inode
);
3734 lo_inode_put(lo
, &inode
);
3737 * If submounts aren't announced, the client only sends a request to
3738 * sync the root inode. TODO: Track submounts internally and iterate
3739 * over them as well.
3742 fuse_reply_err(req
, err
);
3745 static void lo_destroy(void *userdata
)
3747 struct lo_data
*lo
= (struct lo_data
*)userdata
;
3749 pthread_mutex_lock(&lo
->mutex
);
3751 GHashTableIter iter
;
3752 gpointer key
, value
;
3754 g_hash_table_iter_init(&iter
, lo
->inodes
);
3755 if (!g_hash_table_iter_next(&iter
, &key
, &value
)) {
3759 struct lo_inode
*inode
= value
;
3760 unref_inode(lo
, inode
, inode
->nlookup
);
3762 pthread_mutex_unlock(&lo
->mutex
);
3765 static struct fuse_lowlevel_ops lo_oper
= {
3767 .lookup
= lo_lookup
,
3770 .symlink
= lo_symlink
,
3772 .unlink
= lo_unlink
,
3774 .rename
= lo_rename
,
3775 .forget
= lo_forget
,
3776 .forget_multi
= lo_forget_multi
,
3777 .getattr
= lo_getattr
,
3778 .setattr
= lo_setattr
,
3779 .readlink
= lo_readlink
,
3780 .opendir
= lo_opendir
,
3781 .readdir
= lo_readdir
,
3782 .readdirplus
= lo_readdirplus
,
3783 .releasedir
= lo_releasedir
,
3784 .fsyncdir
= lo_fsyncdir
,
3785 .create
= lo_create
,
3789 .release
= lo_release
,
3793 .write_buf
= lo_write_buf
,
3794 .statfs
= lo_statfs
,
3795 .fallocate
= lo_fallocate
,
3797 .getxattr
= lo_getxattr
,
3798 .listxattr
= lo_listxattr
,
3799 .setxattr
= lo_setxattr
,
3800 .removexattr
= lo_removexattr
,
3801 #ifdef HAVE_COPY_FILE_RANGE
3802 .copy_file_range
= lo_copy_file_range
,
3805 .syncfs
= lo_syncfs
,
3806 .destroy
= lo_destroy
,
/*
 * Print vhost-user.json backend program capabilities: a JSON object on
 * stdout whose only member is "type": "fs".
 */
static void print_capabilities(void)
{
    printf("{\n");
    printf(" \"type\": \"fs\"\n");
    printf("}\n");
}
3818 * Drop all Linux capabilities because the wait parent process only needs to
3819 * sit in waitpid(2) and terminate.
3821 static void setup_wait_parent_capabilities(void)
3823 capng_setpid(syscall(SYS_gettid
));
3824 capng_clear(CAPNG_SELECT_BOTH
);
3825 capng_apply(CAPNG_SELECT_BOTH
);
3829 * Move to a new mount, net, and pid namespaces to isolate this process.
3831 static void setup_namespaces(struct lo_data
*lo
, struct fuse_session
*se
)
3836 * Create a new pid namespace for *child* processes. We'll have to
3837 * fork in order to enter the new pid namespace. A new mount namespace
3838 * is also needed so that we can remount /proc for the new pid
3841 * Our UNIX domain sockets have been created. Now we can move to
3842 * an empty network namespace to prevent TCP/IP and other network
3843 * activity in case this process is compromised.
3845 if (unshare(CLONE_NEWPID
| CLONE_NEWNS
| CLONE_NEWNET
) != 0) {
3846 fuse_log(FUSE_LOG_ERR
, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
3852 fuse_log(FUSE_LOG_ERR
, "fork() failed: %m\n");
3859 setup_wait_parent_capabilities();
3861 /* The parent waits for the child */
3863 waited
= waitpid(child
, &wstatus
, 0);
3864 } while (waited
< 0 && errno
== EINTR
&& !se
->exited
);
3866 /* We were terminated by a signal, see fuse_signals.c */
3871 if (WIFEXITED(wstatus
)) {
3872 exit(WEXITSTATUS(wstatus
));
3878 /* Send us SIGTERM when the parent thread terminates, see prctl(2) */
3879 prctl(PR_SET_PDEATHSIG
, SIGTERM
);
3882 * If the mounts have shared propagation then we want to opt out so our
3883 * mount changes don't affect the parent mount namespace.
3885 if (mount(NULL
, "/", NULL
, MS_REC
| MS_SLAVE
, NULL
) < 0) {
3886 fuse_log(FUSE_LOG_ERR
, "mount(/, MS_REC|MS_SLAVE): %m\n");
3890 /* The child must remount /proc to use the new pid namespace */
3891 if (mount("proc", "/proc", "proc",
3892 MS_NODEV
| MS_NOEXEC
| MS_NOSUID
| MS_RELATIME
, NULL
) < 0) {
3893 fuse_log(FUSE_LOG_ERR
, "mount(/proc): %m\n");
3897 /* Get the /proc/self/task descriptor */
3898 lo
->proc_self_task
= open("/proc/self/task/", O_PATH
);
3899 if (lo
->proc_self_task
== -1) {
3900 fuse_log(FUSE_LOG_ERR
, "open(/proc/self/task, O_PATH): %m\n");
3904 lo
->use_fscreate
= is_fscreate_usable(lo
);
3907 * We only need /proc/self/fd. Prevent ".." from accessing parent
3908 * directories of /proc/self/fd by bind-mounting it over /proc. Since / was
3909 * previously remounted with MS_REC | MS_SLAVE this mount change only
3910 * affects our process.
3912 if (mount("/proc/self/fd", "/proc", NULL
, MS_BIND
, NULL
) < 0) {
3913 fuse_log(FUSE_LOG_ERR
, "mount(/proc/self/fd, MS_BIND): %m\n");
3917 /* Get the /proc (actually /proc/self/fd, see above) file descriptor */
3918 lo
->proc_self_fd
= open("/proc", O_PATH
);
3919 if (lo
->proc_self_fd
== -1) {
3920 fuse_log(FUSE_LOG_ERR
, "open(/proc, O_PATH): %m\n");
3926 * Capture the capability state, we'll need to restore this for individual
3927 * threads later; see load_capng.
3929 static void setup_capng(void)
3931 /* Note this accesses /proc so has to happen before the sandbox */
3932 if (capng_get_caps_process()) {
3933 fuse_log(FUSE_LOG_ERR
, "capng_get_caps_process\n");
3936 pthread_mutex_init(&cap
.mutex
, NULL
);
3937 pthread_mutex_lock(&cap
.mutex
);
3938 cap
.saved
= capng_save_state();
3940 fuse_log(FUSE_LOG_ERR
, "capng_save_state\n");
3943 pthread_mutex_unlock(&cap
.mutex
);
3946 static void cleanup_capng(void)
3950 pthread_mutex_destroy(&cap
.mutex
);
3955 * Make the source directory our root so symlinks cannot escape and no other
3956 * files are accessible. Assumes unshare(CLONE_NEWNS) was already called.
3958 static void setup_mounts(const char *source
)
3963 if (mount(source
, source
, NULL
, MS_BIND
| MS_REC
, NULL
) < 0) {
3964 fuse_log(FUSE_LOG_ERR
, "mount(%s, %s, MS_BIND): %m\n", source
, source
);
3968 /* This magic is based on lxc's lxc_pivot_root() */
3969 oldroot
= open("/", O_DIRECTORY
| O_RDONLY
| O_CLOEXEC
);
3971 fuse_log(FUSE_LOG_ERR
, "open(/): %m\n");
3975 newroot
= open(source
, O_DIRECTORY
| O_RDONLY
| O_CLOEXEC
);
3977 fuse_log(FUSE_LOG_ERR
, "open(%s): %m\n", source
);
3981 if (fchdir(newroot
) < 0) {
3982 fuse_log(FUSE_LOG_ERR
, "fchdir(newroot): %m\n");
3986 if (syscall(__NR_pivot_root
, ".", ".") < 0) {
3987 fuse_log(FUSE_LOG_ERR
, "pivot_root(., .): %m\n");
3991 if (fchdir(oldroot
) < 0) {
3992 fuse_log(FUSE_LOG_ERR
, "fchdir(oldroot): %m\n");
3996 if (mount("", ".", "", MS_SLAVE
| MS_REC
, NULL
) < 0) {
3997 fuse_log(FUSE_LOG_ERR
, "mount(., MS_SLAVE | MS_REC): %m\n");
4001 if (umount2(".", MNT_DETACH
) < 0) {
4002 fuse_log(FUSE_LOG_ERR
, "umount2(., MNT_DETACH): %m\n");
4006 if (fchdir(newroot
) < 0) {
4007 fuse_log(FUSE_LOG_ERR
, "fchdir(newroot): %m\n");
4016 * Only keep capabilities in allowlist that are needed for file system operation
4017 * The (possibly NULL) modcaps_in string passed in is free'd before exit.
4019 static void setup_capabilities(char *modcaps_in
)
4021 char *modcaps
= modcaps_in
;
4022 pthread_mutex_lock(&cap
.mutex
);
4023 capng_restore_state(&cap
.saved
);
4026 * Add to allowlist file system-related capabilities that are needed for a
4027 * file server to act like root. Drop everything else like networking and
4028 * sysadmin capabilities.
4031 * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
4032 * and we don't support that.
4033 * 2. CAP_MAC_OVERRIDE is not included because it only seems to be
4034 * used by the Smack LSM. Omit it until there is demand for it.
4036 capng_setpid(syscall(SYS_gettid
));
4037 capng_clear(CAPNG_SELECT_BOTH
);
4038 if (capng_updatev(CAPNG_ADD
, CAPNG_PERMITTED
| CAPNG_EFFECTIVE
,
4048 fuse_log(FUSE_LOG_ERR
, "%s: capng_updatev failed\n", __func__
);
4053 * The modcaps option is a colon separated list of caps,
4054 * each preceded by either + or -.
4060 char *next
= strchr(modcaps
, ':');
4066 switch (modcaps
[0]) {
4072 action
= CAPNG_DROP
;
4076 fuse_log(FUSE_LOG_ERR
,
4077 "%s: Expecting '+'/'-' in modcaps but found '%c'\n",
4078 __func__
, modcaps
[0]);
4081 cap
= capng_name_to_capability(modcaps
+ 1);
4083 fuse_log(FUSE_LOG_ERR
, "%s: Unknown capability '%s'\n", __func__
,
4087 if (capng_update(action
, CAPNG_PERMITTED
| CAPNG_EFFECTIVE
, cap
)) {
4088 fuse_log(FUSE_LOG_ERR
, "%s: capng_update failed for '%s'\n",
4097 if (capng_apply(CAPNG_SELECT_BOTH
)) {
4098 fuse_log(FUSE_LOG_ERR
, "%s: capng_apply failed\n", __func__
);
4102 cap
.saved
= capng_save_state();
4104 fuse_log(FUSE_LOG_ERR
, "%s: capng_save_state failed\n", __func__
);
4107 pthread_mutex_unlock(&cap
.mutex
);
4111 * Use chroot as a weaker sandbox for environments where the process is
4112 * launched without CAP_SYS_ADMIN.
4114 static void setup_chroot(struct lo_data
*lo
)
4116 lo
->proc_self_fd
= open("/proc/self/fd", O_PATH
);
4117 if (lo
->proc_self_fd
== -1) {
4118 fuse_log(FUSE_LOG_ERR
, "open(\"/proc/self/fd\", O_PATH): %m\n");
4122 lo
->proc_self_task
= open("/proc/self/task", O_PATH
);
4123 if (lo
->proc_self_fd
== -1) {
4124 fuse_log(FUSE_LOG_ERR
, "open(\"/proc/self/task\", O_PATH): %m\n");
4128 lo
->use_fscreate
= is_fscreate_usable(lo
);
4131 * Make the shared directory the file system root so that FUSE_OPEN
4132 * (lo_open()) cannot escape the shared directory by opening a symlink.
4134 * The chroot(2) syscall is later disabled by seccomp and the
4135 * CAP_SYS_CHROOT capability is dropped so that tampering with the chroot
4138 * However, it's still possible to escape the chroot via lo->proc_self_fd
4139 * but that requires first gaining control of the process.
4141 if (chroot(lo
->source
) != 0) {
4142 fuse_log(FUSE_LOG_ERR
, "chroot(\"%s\"): %m\n", lo
->source
);
4146 /* Move into the chroot */
4147 if (chdir("/") != 0) {
4148 fuse_log(FUSE_LOG_ERR
, "chdir(\"/\"): %m\n");
4154 * Lock down this process to prevent access to other processes or files outside
4155 * source directory. This reduces the impact of arbitrary code execution bugs.
4157 static void setup_sandbox(struct lo_data
*lo
, struct fuse_session
*se
,
4160 if (lo
->sandbox
== SANDBOX_NAMESPACE
) {
4161 setup_namespaces(lo
, se
);
4162 setup_mounts(lo
->source
);
4167 setup_seccomp(enable_syslog
);
4168 setup_capabilities(g_strdup(lo
->modcaps
));
4171 /* Set the maximum number of open file descriptors */
4172 static void setup_nofile_rlimit(unsigned long rlimit_nofile
)
4174 struct rlimit rlim
= {
4175 .rlim_cur
= rlimit_nofile
,
4176 .rlim_max
= rlimit_nofile
,
4179 if (rlimit_nofile
== 0) {
4180 return; /* nothing to do */
4183 if (setrlimit(RLIMIT_NOFILE
, &rlim
) < 0) {
4184 /* Ignore SELinux denials */
4185 if (errno
== EPERM
) {
4189 fuse_log(FUSE_LOG_ERR
, "setrlimit(RLIMIT_NOFILE): %m\n");
4194 static void log_func(enum fuse_log_level level
, const char *fmt
, va_list ap
)
4196 g_autofree
char *localfmt
= NULL
;
4198 if (current_log_level
< level
) {
4202 if (current_log_level
== FUSE_LOG_DEBUG
) {
4204 /* no timestamp needed */
4205 localfmt
= g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid
),
4208 g_autoptr(GDateTime
) now
= g_date_time_new_now_utc();
4209 g_autofree
char *nowstr
= g_date_time_format(now
, "%Y-%m-%d %H:%M:%S.%f%z");
4210 localfmt
= g_strdup_printf("[%s] [ID: %08ld] %s",
4211 nowstr
, syscall(__NR_gettid
), fmt
);
4217 int priority
= LOG_ERR
;
4219 case FUSE_LOG_EMERG
:
4220 priority
= LOG_EMERG
;
4222 case FUSE_LOG_ALERT
:
4223 priority
= LOG_ALERT
;
4226 priority
= LOG_CRIT
;
4231 case FUSE_LOG_WARNING
:
4232 priority
= LOG_WARNING
;
4234 case FUSE_LOG_NOTICE
:
4235 priority
= LOG_NOTICE
;
4238 priority
= LOG_INFO
;
4240 case FUSE_LOG_DEBUG
:
4241 priority
= LOG_DEBUG
;
4244 vsyslog(priority
, fmt
, ap
);
4246 vfprintf(stderr
, fmt
, ap
);
4250 static void setup_root(struct lo_data
*lo
, struct lo_inode
*root
)
4256 fd
= open("/", O_PATH
);
4258 fuse_log(FUSE_LOG_ERR
, "open(%s, O_PATH): %m\n", lo
->source
);
4262 res
= do_statx(lo
, fd
, "", &stat
, AT_EMPTY_PATH
| AT_SYMLINK_NOFOLLOW
,
4265 fuse_log(FUSE_LOG_ERR
, "fstatat(%s): %m\n", lo
->source
);
4269 root
->filetype
= S_IFDIR
;
4271 root
->key
.ino
= stat
.st_ino
;
4272 root
->key
.dev
= stat
.st_dev
;
4273 root
->key
.mnt_id
= mnt_id
;
4275 g_atomic_int_set(&root
->refcount
, 2);
4276 if (lo
->posix_lock
) {
4277 pthread_mutex_init(&root
->plock_mutex
, NULL
);
4278 root
->posix_locks
= g_hash_table_new_full(
4279 g_direct_hash
, g_direct_equal
, NULL
, posix_locks_value_destroy
);
4283 static guint
lo_key_hash(gconstpointer key
)
4285 const struct lo_key
*lkey
= key
;
4287 return (guint
)lkey
->ino
+ (guint
)lkey
->dev
+ (guint
)lkey
->mnt_id
;
4290 static gboolean
lo_key_equal(gconstpointer a
, gconstpointer b
)
4292 const struct lo_key
*la
= a
;
4293 const struct lo_key
*lb
= b
;
4295 return la
->ino
== lb
->ino
&& la
->dev
== lb
->dev
&& la
->mnt_id
== lb
->mnt_id
;
4298 static void fuse_lo_data_cleanup(struct lo_data
*lo
)
4301 g_hash_table_destroy(lo
->inodes
);
4304 if (lo
->root
.posix_locks
) {
4305 g_hash_table_destroy(lo
->root
.posix_locks
);
4307 lo_map_destroy(&lo
->fd_map
);
4308 lo_map_destroy(&lo
->dirp_map
);
4309 lo_map_destroy(&lo
->ino_map
);
4311 if (lo
->proc_self_fd
>= 0) {
4312 close(lo
->proc_self_fd
);
4315 if (lo
->proc_self_task
>= 0) {
4316 close(lo
->proc_self_task
);
4319 if (lo
->root
.fd
>= 0) {
4325 free(lo
->xattr_security_capability
);
4329 static void qemu_version(void)
4331 printf("virtiofsd version " QEMU_FULL_VERSION
"\n" QEMU_COPYRIGHT
"\n");
4334 int main(int argc
, char *argv
[])
4336 struct fuse_args args
= FUSE_ARGS_INIT(argc
, argv
);
4337 struct fuse_session
*se
;
4338 struct fuse_cmdline_opts opts
;
4339 struct lo_data lo
= {
4340 .sandbox
= SANDBOX_NAMESPACE
,
4344 .allow_direct_io
= 0,
4346 .proc_self_task
= -1,
4347 .user_killpriv_v2
= -1,
4348 .user_posix_acl
= -1,
4349 .user_security_label
= -1,
4351 struct lo_map_elem
*root_elem
;
4352 struct lo_map_elem
*reserve_elem
;
4355 /* Initialize time conversion information for localtime_r(). */
4358 /* Don't mask creation mode, kernel already did that */
4361 qemu_init_exec_dir(argv
[0]);
4363 drop_supplementary_groups();
4365 pthread_mutex_init(&lo
.mutex
, NULL
);
4366 lo
.inodes
= g_hash_table_new(lo_key_hash
, lo_key_equal
);
4368 lo
.root
.fuse_ino
= FUSE_ROOT_ID
;
4369 lo
.cache
= CACHE_AUTO
;
4372 * Set up the ino map like this:
4373 * [0] Reserved (will not be used)
4376 lo_map_init(&lo
.ino_map
);
4377 reserve_elem
= lo_map_reserve(&lo
.ino_map
, 0);
4378 if (!reserve_elem
) {
4379 fuse_log(FUSE_LOG_ERR
, "failed to alloc reserve_elem.\n");
4382 reserve_elem
->in_use
= false;
4383 root_elem
= lo_map_reserve(&lo
.ino_map
, lo
.root
.fuse_ino
);
4385 fuse_log(FUSE_LOG_ERR
, "failed to alloc root_elem.\n");
4388 root_elem
->inode
= &lo
.root
;
4390 lo_map_init(&lo
.dirp_map
);
4391 lo_map_init(&lo
.fd_map
);
4393 if (fuse_parse_cmdline(&args
, &opts
) != 0) {
4396 fuse_set_log_func(log_func
);
4397 use_syslog
= opts
.syslog
;
4399 openlog("virtiofsd", LOG_PID
, LOG_DAEMON
);
4402 if (opts
.show_help
) {
4403 printf("usage: %s [options]\n\n", argv
[0]);
4404 fuse_cmdline_help();
4405 printf(" -o source=PATH shared directory tree\n");
4406 fuse_lowlevel_help();
4409 } else if (opts
.show_version
) {
4411 fuse_lowlevel_version();
4414 } else if (opts
.print_capabilities
) {
4415 print_capabilities();
4420 if (fuse_opt_parse(&args
, &lo
, lo_opts
, NULL
) == -1) {
4424 if (opts
.log_level
!= 0) {
4425 current_log_level
= opts
.log_level
;
4427 /* default log level is INFO */
4428 current_log_level
= FUSE_LOG_INFO
;
4430 lo
.debug
= opts
.debug
;
4432 current_log_level
= FUSE_LOG_DEBUG
;
4438 res
= lstat(lo
.source
, &stat
);
4440 fuse_log(FUSE_LOG_ERR
, "failed to stat source (\"%s\"): %m\n",
4444 if (!S_ISDIR(stat
.st_mode
)) {
4445 fuse_log(FUSE_LOG_ERR
, "source is not a directory\n");
4449 lo
.source
= strdup("/");
4451 fuse_log(FUSE_LOG_ERR
, "failed to strdup source\n");
4458 parse_xattrmap(&lo
);
4461 if (!lo
.timeout_set
) {
4472 lo
.timeout
= 86400.0;
4475 } else if (lo
.timeout
< 0) {
4476 fuse_log(FUSE_LOG_ERR
, "timeout is negative (%lf)\n", lo
.timeout
);
4480 if (lo
.user_posix_acl
== 1 && !lo
.xattr
) {
4481 fuse_log(FUSE_LOG_ERR
, "Can't enable posix ACLs. xattrs are disabled."
4486 lo
.use_statx
= true;
4488 se
= fuse_session_new(&args
, &lo_oper
, sizeof(lo_oper
), &lo
);
4493 if (fuse_set_signal_handlers(se
) != 0) {
4497 if (fuse_session_mount(se
) != 0) {
4501 fuse_daemonize(opts
.foreground
);
4503 setup_nofile_rlimit(opts
.rlimit_nofile
);
4505 /* Must be before sandbox since it wants /proc */
4508 setup_sandbox(&lo
, se
, opts
.syslog
);
4510 setup_root(&lo
, &lo
.root
);
4511 /* Block until ctrl+c or fusermount -u */
4512 ret
= virtio_loop(se
);
4514 fuse_session_unmount(se
);
4517 fuse_remove_signal_handlers(se
);
4519 fuse_session_destroy(se
);
4521 fuse_opt_free_args(&args
);
4523 fuse_lo_data_cleanup(&lo
);