osdep: Move memalign-related functions to their own header
[qemu/rayw.git] / block / export / fuse.c
blobe80b24a86715e46384a72a50a595de995c8dae38
1 /*
2 * Present a block device as a raw image through FUSE
4 * Copyright (c) 2020 Max Reitz <mreitz@redhat.com>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; under version 2 or later of the License.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, see <http://www.gnu.org/licenses/>.
19 #define FUSE_USE_VERSION 31
21 #include "qemu/osdep.h"
22 #include "qemu/memalign.h"
23 #include "block/aio.h"
24 #include "block/block.h"
25 #include "block/export.h"
26 #include "block/fuse.h"
27 #include "block/qapi.h"
28 #include "qapi/error.h"
29 #include "qapi/qapi-commands-block.h"
30 #include "sysemu/block-backend.h"
32 #include <fuse.h>
33 #include <fuse_lowlevel.h>
35 #if defined(CONFIG_FALLOCATE_ZERO_RANGE)
36 #include <linux/falloc.h>
37 #endif
39 #ifdef __linux__
40 #include <linux/fs.h>
41 #endif
/*
 * Upper bound for the size of a single bounce buffer, so that no overly
 * long allocations are made
 */
#define FUSE_MAX_BOUNCE_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 64 * 1024 * 1024))
47 typedef struct FuseExport {
48 BlockExport common;
50 struct fuse_session *fuse_session;
51 struct fuse_buf fuse_buf;
52 bool mounted, fd_handler_set_up;
54 char *mountpoint;
55 bool writable;
56 bool growable;
57 /* Whether allow_other was used as a mount option or not */
58 bool allow_other;
60 mode_t st_mode;
61 uid_t st_uid;
62 gid_t st_gid;
63 } FuseExport;
65 static GHashTable *exports;
66 static const struct fuse_lowlevel_ops fuse_ops;
68 static void fuse_export_shutdown(BlockExport *exp);
69 static void fuse_export_delete(BlockExport *exp);
71 static void init_exports_table(void);
73 static int setup_fuse_export(FuseExport *exp, const char *mountpoint,
74 bool allow_other, Error **errp);
75 static void read_from_fuse_export(void *opaque);
77 static bool is_regular_file(const char *path, Error **errp);
80 static int fuse_export_create(BlockExport *blk_exp,
81 BlockExportOptions *blk_exp_args,
82 Error **errp)
84 FuseExport *exp = container_of(blk_exp, FuseExport, common);
85 BlockExportOptionsFuse *args = &blk_exp_args->u.fuse;
86 int ret;
88 assert(blk_exp_args->type == BLOCK_EXPORT_TYPE_FUSE);
90 /* For growable and writable exports, take the RESIZE permission */
91 if (args->growable || blk_exp_args->writable) {
92 uint64_t blk_perm, blk_shared_perm;
94 blk_get_perm(exp->common.blk, &blk_perm, &blk_shared_perm);
96 ret = blk_set_perm(exp->common.blk, blk_perm | BLK_PERM_RESIZE,
97 blk_shared_perm, errp);
98 if (ret < 0) {
99 return ret;
103 init_exports_table();
106 * It is important to do this check before calling is_regular_file() --
107 * that function will do a stat(), which we would have to handle if we
108 * already exported something on @mountpoint. But we cannot, because
109 * we are currently caught up here.
110 * (Note that ideally we would want to resolve relative paths here,
111 * but bdrv_make_absolute_filename() might do the wrong thing for
112 * paths that contain colons, and realpath() would resolve symlinks,
113 * which we do not want: The mount point is not going to be the
114 * symlink's destination, but the link itself.)
115 * So this will not catch all potential clashes, but hopefully at
116 * least the most common one of specifying exactly the same path
117 * string twice.
119 if (g_hash_table_contains(exports, args->mountpoint)) {
120 error_setg(errp, "There already is a FUSE export on '%s'",
121 args->mountpoint);
122 ret = -EEXIST;
123 goto fail;
126 if (!is_regular_file(args->mountpoint, errp)) {
127 ret = -EINVAL;
128 goto fail;
131 exp->mountpoint = g_strdup(args->mountpoint);
132 exp->writable = blk_exp_args->writable;
133 exp->growable = args->growable;
135 /* set default */
136 if (!args->has_allow_other) {
137 args->allow_other = FUSE_EXPORT_ALLOW_OTHER_AUTO;
140 exp->st_mode = S_IFREG | S_IRUSR;
141 if (exp->writable) {
142 exp->st_mode |= S_IWUSR;
144 exp->st_uid = getuid();
145 exp->st_gid = getgid();
147 if (args->allow_other == FUSE_EXPORT_ALLOW_OTHER_AUTO) {
148 /* Ignore errors on our first attempt */
149 ret = setup_fuse_export(exp, args->mountpoint, true, NULL);
150 exp->allow_other = ret == 0;
151 if (ret < 0) {
152 ret = setup_fuse_export(exp, args->mountpoint, false, errp);
154 } else {
155 exp->allow_other = args->allow_other == FUSE_EXPORT_ALLOW_OTHER_ON;
156 ret = setup_fuse_export(exp, args->mountpoint, exp->allow_other, errp);
158 if (ret < 0) {
159 goto fail;
162 return 0;
164 fail:
165 fuse_export_delete(blk_exp);
166 return ret;
170 * Allocates the global @exports hash table.
172 static void init_exports_table(void)
174 if (exports) {
175 return;
178 exports = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
182 * Create exp->fuse_session and mount it.
184 static int setup_fuse_export(FuseExport *exp, const char *mountpoint,
185 bool allow_other, Error **errp)
187 const char *fuse_argv[4];
188 char *mount_opts;
189 struct fuse_args fuse_args;
190 int ret;
193 * max_read needs to match what fuse_init() sets.
194 * max_write need not be supplied.
196 mount_opts = g_strdup_printf("max_read=%zu,default_permissions%s",
197 FUSE_MAX_BOUNCE_BYTES,
198 allow_other ? ",allow_other" : "");
200 fuse_argv[0] = ""; /* Dummy program name */
201 fuse_argv[1] = "-o";
202 fuse_argv[2] = mount_opts;
203 fuse_argv[3] = NULL;
204 fuse_args = (struct fuse_args)FUSE_ARGS_INIT(3, (char **)fuse_argv);
206 exp->fuse_session = fuse_session_new(&fuse_args, &fuse_ops,
207 sizeof(fuse_ops), exp);
208 g_free(mount_opts);
209 if (!exp->fuse_session) {
210 error_setg(errp, "Failed to set up FUSE session");
211 ret = -EIO;
212 goto fail;
215 ret = fuse_session_mount(exp->fuse_session, mountpoint);
216 if (ret < 0) {
217 error_setg(errp, "Failed to mount FUSE session to export");
218 ret = -EIO;
219 goto fail;
221 exp->mounted = true;
223 g_hash_table_insert(exports, g_strdup(mountpoint), NULL);
225 aio_set_fd_handler(exp->common.ctx,
226 fuse_session_fd(exp->fuse_session), true,
227 read_from_fuse_export, NULL, NULL, NULL, exp);
228 exp->fd_handler_set_up = true;
230 return 0;
232 fail:
233 fuse_export_shutdown(&exp->common);
234 return ret;
238 * Callback to be invoked when the FUSE session FD can be read from.
239 * (This is basically the FUSE event loop.)
241 static void read_from_fuse_export(void *opaque)
243 FuseExport *exp = opaque;
244 int ret;
246 blk_exp_ref(&exp->common);
248 do {
249 ret = fuse_session_receive_buf(exp->fuse_session, &exp->fuse_buf);
250 } while (ret == -EINTR);
251 if (ret < 0) {
252 goto out;
255 fuse_session_process_buf(exp->fuse_session, &exp->fuse_buf);
257 out:
258 blk_exp_unref(&exp->common);
261 static void fuse_export_shutdown(BlockExport *blk_exp)
263 FuseExport *exp = container_of(blk_exp, FuseExport, common);
265 if (exp->fuse_session) {
266 fuse_session_exit(exp->fuse_session);
268 if (exp->fd_handler_set_up) {
269 aio_set_fd_handler(exp->common.ctx,
270 fuse_session_fd(exp->fuse_session), true,
271 NULL, NULL, NULL, NULL, NULL);
272 exp->fd_handler_set_up = false;
276 if (exp->mountpoint) {
278 * Safe to drop now, because we will not handle any requests
279 * for this export anymore anyway.
281 g_hash_table_remove(exports, exp->mountpoint);
285 static void fuse_export_delete(BlockExport *blk_exp)
287 FuseExport *exp = container_of(blk_exp, FuseExport, common);
289 if (exp->fuse_session) {
290 if (exp->mounted) {
291 fuse_session_unmount(exp->fuse_session);
294 fuse_session_destroy(exp->fuse_session);
297 free(exp->fuse_buf.mem);
298 g_free(exp->mountpoint);
302 * Check whether @path points to a regular file. If not, put an
303 * appropriate message into *errp.
305 static bool is_regular_file(const char *path, Error **errp)
307 struct stat statbuf;
308 int ret;
310 ret = stat(path, &statbuf);
311 if (ret < 0) {
312 error_setg_errno(errp, errno, "Failed to stat '%s'", path);
313 return false;
316 if (!S_ISREG(statbuf.st_mode)) {
317 error_setg(errp, "'%s' is not a regular file", path);
318 return false;
321 return true;
325 * A chance to set change some parameters supplied to FUSE_INIT.
327 static void fuse_init(void *userdata, struct fuse_conn_info *conn)
330 * MIN_NON_ZERO() would not be wrong here, but what we set here
331 * must equal what has been passed to fuse_session_new().
332 * Therefore, as long as max_read must be passed as a mount option
333 * (which libfuse claims will be changed at some point), we have
334 * to set max_read to a fixed value here.
336 conn->max_read = FUSE_MAX_BOUNCE_BYTES;
338 conn->max_write = MIN_NON_ZERO(BDRV_REQUEST_MAX_BYTES, conn->max_write);
342 * Let clients look up files. Always return ENOENT because we only
343 * care about the mountpoint itself.
345 static void fuse_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
347 fuse_reply_err(req, ENOENT);
351 * Let clients get file attributes (i.e., stat() the file).
353 static void fuse_getattr(fuse_req_t req, fuse_ino_t inode,
354 struct fuse_file_info *fi)
356 struct stat statbuf;
357 int64_t length, allocated_blocks;
358 time_t now = time(NULL);
359 FuseExport *exp = fuse_req_userdata(req);
361 length = blk_getlength(exp->common.blk);
362 if (length < 0) {
363 fuse_reply_err(req, -length);
364 return;
367 allocated_blocks = bdrv_get_allocated_file_size(blk_bs(exp->common.blk));
368 if (allocated_blocks <= 0) {
369 allocated_blocks = DIV_ROUND_UP(length, 512);
370 } else {
371 allocated_blocks = DIV_ROUND_UP(allocated_blocks, 512);
374 statbuf = (struct stat) {
375 .st_ino = inode,
376 .st_mode = exp->st_mode,
377 .st_nlink = 1,
378 .st_uid = exp->st_uid,
379 .st_gid = exp->st_gid,
380 .st_size = length,
381 .st_blksize = blk_bs(exp->common.blk)->bl.request_alignment,
382 .st_blocks = allocated_blocks,
383 .st_atime = now,
384 .st_mtime = now,
385 .st_ctime = now,
388 fuse_reply_attr(req, &statbuf, 1.);
391 static int fuse_do_truncate(const FuseExport *exp, int64_t size,
392 bool req_zero_write, PreallocMode prealloc)
394 uint64_t blk_perm, blk_shared_perm;
395 BdrvRequestFlags truncate_flags = 0;
396 bool add_resize_perm;
397 int ret, ret_check;
399 /* Growable and writable exports have a permanent RESIZE permission */
400 add_resize_perm = !exp->growable && !exp->writable;
402 if (req_zero_write) {
403 truncate_flags |= BDRV_REQ_ZERO_WRITE;
406 if (add_resize_perm) {
408 if (!qemu_in_main_thread()) {
409 /* Changing permissions like below only works in the main thread */
410 return -EPERM;
413 blk_get_perm(exp->common.blk, &blk_perm, &blk_shared_perm);
415 ret = blk_set_perm(exp->common.blk, blk_perm | BLK_PERM_RESIZE,
416 blk_shared_perm, NULL);
417 if (ret < 0) {
418 return ret;
422 ret = blk_truncate(exp->common.blk, size, true, prealloc,
423 truncate_flags, NULL);
425 if (add_resize_perm) {
426 /* Must succeed, because we are only giving up the RESIZE permission */
427 ret_check = blk_set_perm(exp->common.blk, blk_perm,
428 blk_shared_perm, &error_abort);
429 assert(ret_check == 0);
432 return ret;
436 * Let clients set file attributes. Only resizing and changing
437 * permissions (st_mode, st_uid, st_gid) is allowed.
438 * Changing permissions is only allowed as far as it will actually
439 * permit access: Read-only exports cannot be given +w, and exports
440 * without allow_other cannot be given a different UID or GID, and
441 * they cannot be given non-owner access.
443 static void fuse_setattr(fuse_req_t req, fuse_ino_t inode, struct stat *statbuf,
444 int to_set, struct fuse_file_info *fi)
446 FuseExport *exp = fuse_req_userdata(req);
447 int supported_attrs;
448 int ret;
450 supported_attrs = FUSE_SET_ATTR_SIZE | FUSE_SET_ATTR_MODE;
451 if (exp->allow_other) {
452 supported_attrs |= FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID;
455 if (to_set & ~supported_attrs) {
456 fuse_reply_err(req, ENOTSUP);
457 return;
460 /* Do some argument checks first before committing to anything */
461 if (to_set & FUSE_SET_ATTR_MODE) {
463 * Without allow_other, non-owners can never access the export, so do
464 * not allow setting permissions for them
466 if (!exp->allow_other &&
467 (statbuf->st_mode & (S_IRWXG | S_IRWXO)) != 0)
469 fuse_reply_err(req, EPERM);
470 return;
473 /* +w for read-only exports makes no sense, disallow it */
474 if (!exp->writable &&
475 (statbuf->st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
477 fuse_reply_err(req, EROFS);
478 return;
482 if (to_set & FUSE_SET_ATTR_SIZE) {
483 if (!exp->writable) {
484 fuse_reply_err(req, EACCES);
485 return;
488 ret = fuse_do_truncate(exp, statbuf->st_size, true, PREALLOC_MODE_OFF);
489 if (ret < 0) {
490 fuse_reply_err(req, -ret);
491 return;
495 if (to_set & FUSE_SET_ATTR_MODE) {
496 /* Ignore FUSE-supplied file type, only change the mode */
497 exp->st_mode = (statbuf->st_mode & 07777) | S_IFREG;
500 if (to_set & FUSE_SET_ATTR_UID) {
501 exp->st_uid = statbuf->st_uid;
504 if (to_set & FUSE_SET_ATTR_GID) {
505 exp->st_gid = statbuf->st_gid;
508 fuse_getattr(req, inode, fi);
512 * Let clients open a file (i.e., the exported image).
514 static void fuse_open(fuse_req_t req, fuse_ino_t inode,
515 struct fuse_file_info *fi)
517 fuse_reply_open(req, fi);
521 * Handle client reads from the exported image.
523 static void fuse_read(fuse_req_t req, fuse_ino_t inode,
524 size_t size, off_t offset, struct fuse_file_info *fi)
526 FuseExport *exp = fuse_req_userdata(req);
527 int64_t length;
528 void *buf;
529 int ret;
531 /* Limited by max_read, should not happen */
532 if (size > FUSE_MAX_BOUNCE_BYTES) {
533 fuse_reply_err(req, EINVAL);
534 return;
538 * Clients will expect short reads at EOF, so we have to limit
539 * offset+size to the image length.
541 length = blk_getlength(exp->common.blk);
542 if (length < 0) {
543 fuse_reply_err(req, -length);
544 return;
547 if (offset + size > length) {
548 size = length - offset;
551 buf = qemu_try_blockalign(blk_bs(exp->common.blk), size);
552 if (!buf) {
553 fuse_reply_err(req, ENOMEM);
554 return;
557 ret = blk_pread(exp->common.blk, offset, buf, size);
558 if (ret >= 0) {
559 fuse_reply_buf(req, buf, size);
560 } else {
561 fuse_reply_err(req, -ret);
564 qemu_vfree(buf);
568 * Handle client writes to the exported image.
570 static void fuse_write(fuse_req_t req, fuse_ino_t inode, const char *buf,
571 size_t size, off_t offset, struct fuse_file_info *fi)
573 FuseExport *exp = fuse_req_userdata(req);
574 int64_t length;
575 int ret;
577 /* Limited by max_write, should not happen */
578 if (size > BDRV_REQUEST_MAX_BYTES) {
579 fuse_reply_err(req, EINVAL);
580 return;
583 if (!exp->writable) {
584 fuse_reply_err(req, EACCES);
585 return;
589 * Clients will expect short writes at EOF, so we have to limit
590 * offset+size to the image length.
592 length = blk_getlength(exp->common.blk);
593 if (length < 0) {
594 fuse_reply_err(req, -length);
595 return;
598 if (offset + size > length) {
599 if (exp->growable) {
600 ret = fuse_do_truncate(exp, offset + size, true, PREALLOC_MODE_OFF);
601 if (ret < 0) {
602 fuse_reply_err(req, -ret);
603 return;
605 } else {
606 size = length - offset;
610 ret = blk_pwrite(exp->common.blk, offset, buf, size, 0);
611 if (ret >= 0) {
612 fuse_reply_write(req, size);
613 } else {
614 fuse_reply_err(req, -ret);
619 * Let clients perform various fallocate() operations.
621 static void fuse_fallocate(fuse_req_t req, fuse_ino_t inode, int mode,
622 off_t offset, off_t length,
623 struct fuse_file_info *fi)
625 FuseExport *exp = fuse_req_userdata(req);
626 int64_t blk_len;
627 int ret;
629 if (!exp->writable) {
630 fuse_reply_err(req, EACCES);
631 return;
634 blk_len = blk_getlength(exp->common.blk);
635 if (blk_len < 0) {
636 fuse_reply_err(req, -blk_len);
637 return;
640 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
641 if (mode & FALLOC_FL_KEEP_SIZE) {
642 length = MIN(length, blk_len - offset);
644 #endif /* CONFIG_FALLOCATE_PUNCH_HOLE */
646 if (!mode) {
647 /* We can only fallocate at the EOF with a truncate */
648 if (offset < blk_len) {
649 fuse_reply_err(req, EOPNOTSUPP);
650 return;
653 if (offset > blk_len) {
654 /* No preallocation needed here */
655 ret = fuse_do_truncate(exp, offset, true, PREALLOC_MODE_OFF);
656 if (ret < 0) {
657 fuse_reply_err(req, -ret);
658 return;
662 ret = fuse_do_truncate(exp, offset + length, true,
663 PREALLOC_MODE_FALLOC);
665 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
666 else if (mode & FALLOC_FL_PUNCH_HOLE) {
667 if (!(mode & FALLOC_FL_KEEP_SIZE)) {
668 fuse_reply_err(req, EINVAL);
669 return;
672 do {
673 int size = MIN(length, BDRV_REQUEST_MAX_BYTES);
675 ret = blk_pdiscard(exp->common.blk, offset, size);
676 offset += size;
677 length -= size;
678 } while (ret == 0 && length > 0);
680 #endif /* CONFIG_FALLOCATE_PUNCH_HOLE */
681 #ifdef CONFIG_FALLOCATE_ZERO_RANGE
682 else if (mode & FALLOC_FL_ZERO_RANGE) {
683 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + length > blk_len) {
684 /* No need for zeroes, we are going to write them ourselves */
685 ret = fuse_do_truncate(exp, offset + length, false,
686 PREALLOC_MODE_OFF);
687 if (ret < 0) {
688 fuse_reply_err(req, -ret);
689 return;
693 do {
694 int size = MIN(length, BDRV_REQUEST_MAX_BYTES);
696 ret = blk_pwrite_zeroes(exp->common.blk,
697 offset, size, 0);
698 offset += size;
699 length -= size;
700 } while (ret == 0 && length > 0);
702 #endif /* CONFIG_FALLOCATE_ZERO_RANGE */
703 else {
704 ret = -EOPNOTSUPP;
707 fuse_reply_err(req, ret < 0 ? -ret : 0);
711 * Let clients fsync the exported image.
713 static void fuse_fsync(fuse_req_t req, fuse_ino_t inode, int datasync,
714 struct fuse_file_info *fi)
716 FuseExport *exp = fuse_req_userdata(req);
717 int ret;
719 ret = blk_flush(exp->common.blk);
720 fuse_reply_err(req, ret < 0 ? -ret : 0);
724 * Called before an FD to the exported image is closed. (libfuse
725 * notes this to be a way to return last-minute errors.)
727 static void fuse_flush(fuse_req_t req, fuse_ino_t inode,
728 struct fuse_file_info *fi)
730 fuse_fsync(req, inode, 1, fi);
#ifdef CONFIG_FUSE_LSEEK
/**
 * Let clients inquire allocation status.
 *
 * Only SEEK_HOLE and SEEK_DATA are supported.  Starting at @offset, the
 * block status is queried extent by extent until one of the requested
 * kind (data or hole) is found, or the end of the image is reached.
 */
static void fuse_lseek(fuse_req_t req, fuse_ino_t inode, off_t offset,
                       int whence, struct fuse_file_info *fi)
{
    FuseExport *fexp = fuse_req_userdata(req);

    if (whence != SEEK_HOLE && whence != SEEK_DATA) {
        fuse_reply_err(req, EINVAL);
        return;
    }

    for (;;) {
        int64_t extent_len;
        bool is_data;
        int status;

        status = bdrv_block_status_above(blk_bs(fexp->common.blk), NULL,
                                         offset, INT64_MAX, &extent_len,
                                         NULL, NULL);
        if (status < 0) {
            fuse_reply_err(req, -status);
            return;
        }

        if ((status & BDRV_BLOCK_EOF) && !extent_len) {
            int64_t image_len;

            /*
             * If blk_getlength() rounds (e.g. by sectors), then the
             * export length will be rounded, too.  However,
             * bdrv_block_status_above() may return EOF at unaligned
             * offsets.  We must not let this become visible and thus
             * always simulate a hole between @offset (the real EOF)
             * and @image_len (the client-visible EOF).
             */
            image_len = blk_getlength(fexp->common.blk);
            if (image_len < 0) {
                fuse_reply_err(req, -image_len);
                return;
            }

            if (whence == SEEK_DATA || offset > image_len) {
                fuse_reply_err(req, ENXIO);
            } else {
                fuse_reply_lseek(req, offset);
            }
            return;
        }

        /* Done once the extent at @offset is of the kind asked for */
        is_data = (status & BDRV_BLOCK_DATA) != 0;
        if (is_data == (whence == SEEK_DATA)) {
            fuse_reply_lseek(req, offset);
            return;
        }

        /* Safety check against infinite loops */
        if (!extent_len) {
            fuse_reply_err(req, ENXIO);
            return;
        }

        offset += extent_len;
    }
}
#endif
807 static const struct fuse_lowlevel_ops fuse_ops = {
808 .init = fuse_init,
809 .lookup = fuse_lookup,
810 .getattr = fuse_getattr,
811 .setattr = fuse_setattr,
812 .open = fuse_open,
813 .read = fuse_read,
814 .write = fuse_write,
815 .fallocate = fuse_fallocate,
816 .flush = fuse_flush,
817 .fsync = fuse_fsync,
818 #ifdef CONFIG_FUSE_LSEEK
819 .lseek = fuse_lseek,
820 #endif
823 const BlockExportDriver blk_exp_fuse = {
824 .type = BLOCK_EXPORT_TYPE_FUSE,
825 .instance_size = sizeof(FuseExport),
826 .create = fuse_export_create,
827 .delete = fuse_export_delete,
828 .request_shutdown = fuse_export_shutdown,