mptsas: remove unnecessary internal msi state flag
[qemu/ar7.git] / block / gluster.c
blob16f7778a5079c5b8903cf6dc30a4fe47a3e6e4d4
/*
 * GlusterFS backend for QEMU
 *
 * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */
10 #include "qemu/osdep.h"
11 #include <glusterfs/api/glfs.h>
12 #include "block/block_int.h"
13 #include "qapi/error.h"
14 #include "qemu/uri.h"
16 typedef struct GlusterAIOCB {
17 int64_t size;
18 int ret;
19 QEMUBH *bh;
20 Coroutine *coroutine;
21 AioContext *aio_context;
22 } GlusterAIOCB;
/* Driver state stored in BlockDriverState.opaque for an open image. */
typedef struct BDRVGlusterState {
    struct glfs *glfs;          /* handle returned by qemu_gluster_init() */
    struct glfs_fd *fd;         /* image descriptor returned by glfs_open() */
    bool supports_seek_data;    /* outcome of qemu_gluster_test_seek() */
    int debug_level;            /* log level taken from the "debug" option */
} BDRVGlusterState;
/*
 * Connection parameters extracted from a gluster URI.  All string members
 * are heap allocated and released by qemu_gluster_gconf_free().
 */
typedef struct GlusterConf {
    char *server;       /* host name, or socket path for the unix transport */
    int port;           /* port from the URI (0 when none was given) */
    char *volname;      /* gluster volume name */
    char *image;        /* image path inside the volume */
    char *transport;    /* "tcp", "unix" or "rdma" */
    int debug_level;    /* level handed to glfs_set_logging() */
} GlusterConf;
40 static void qemu_gluster_gconf_free(GlusterConf *gconf)
42 if (gconf) {
43 g_free(gconf->server);
44 g_free(gconf->volname);
45 g_free(gconf->image);
46 g_free(gconf->transport);
47 g_free(gconf);
51 static int parse_volume_options(GlusterConf *gconf, char *path)
53 char *p, *q;
55 if (!path) {
56 return -EINVAL;
59 /* volume */
60 p = q = path + strspn(path, "/");
61 p += strcspn(p, "/");
62 if (*p == '\0') {
63 return -EINVAL;
65 gconf->volname = g_strndup(q, p - q);
67 /* image */
68 p += strspn(p, "/");
69 if (*p == '\0') {
70 return -EINVAL;
72 gconf->image = g_strdup(p);
73 return 0;
77 * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...]
79 * 'gluster' is the protocol.
81 * 'transport' specifies the transport type used to connect to gluster
82 * management daemon (glusterd). Valid transport types are
83 * tcp, unix and rdma. If a transport type isn't specified, then tcp
84 * type is assumed.
86 * 'server' specifies the server where the volume file specification for
87 * the given volume resides. This can be either hostname, ipv4 address
88 * or ipv6 address. ipv6 address needs to be within square brackets [ ].
89 * If transport type is 'unix', then 'server' field should not be specified.
90 * The 'socket' field needs to be populated with the path to unix domain
91 * socket.
93 * 'port' is the port number on which glusterd is listening. This is optional
94 * and if not specified, QEMU will send 0 which will make gluster to use the
95 * default port. If the transport type is unix, then 'port' should not be
96 * specified.
98 * 'volname' is the name of the gluster volume which contains the VM image.
100 * 'image' is the path to the actual VM image that resides on gluster volume.
102 * Examples:
104 * file=gluster://1.2.3.4/testvol/a.img
105 * file=gluster+tcp://1.2.3.4/testvol/a.img
106 * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img
107 * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
108 * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img
109 * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img
110 * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
111 * file=gluster+rdma://1.2.3.4:24007/testvol/a.img
113 static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename)
115 URI *uri;
116 QueryParams *qp = NULL;
117 bool is_unix = false;
118 int ret = 0;
120 uri = uri_parse(filename);
121 if (!uri) {
122 return -EINVAL;
125 /* transport */
126 if (!uri->scheme || !strcmp(uri->scheme, "gluster")) {
127 gconf->transport = g_strdup("tcp");
128 } else if (!strcmp(uri->scheme, "gluster+tcp")) {
129 gconf->transport = g_strdup("tcp");
130 } else if (!strcmp(uri->scheme, "gluster+unix")) {
131 gconf->transport = g_strdup("unix");
132 is_unix = true;
133 } else if (!strcmp(uri->scheme, "gluster+rdma")) {
134 gconf->transport = g_strdup("rdma");
135 } else {
136 ret = -EINVAL;
137 goto out;
140 ret = parse_volume_options(gconf, uri->path);
141 if (ret < 0) {
142 goto out;
145 qp = query_params_parse(uri->query);
146 if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
147 ret = -EINVAL;
148 goto out;
151 if (is_unix) {
152 if (uri->server || uri->port) {
153 ret = -EINVAL;
154 goto out;
156 if (strcmp(qp->p[0].name, "socket")) {
157 ret = -EINVAL;
158 goto out;
160 gconf->server = g_strdup(qp->p[0].value);
161 } else {
162 gconf->server = g_strdup(uri->server ? uri->server : "localhost");
163 gconf->port = uri->port;
166 out:
167 if (qp) {
168 query_params_free(qp);
170 uri_free(uri);
171 return ret;
174 static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename,
175 Error **errp)
177 struct glfs *glfs = NULL;
178 int ret;
179 int old_errno;
181 ret = qemu_gluster_parseuri(gconf, filename);
182 if (ret < 0) {
183 error_setg(errp, "Usage: file=gluster[+transport]://[server[:port]]/"
184 "volname/image[?socket=...]");
185 errno = -ret;
186 goto out;
189 glfs = glfs_new(gconf->volname);
190 if (!glfs) {
191 goto out;
194 ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server,
195 gconf->port);
196 if (ret < 0) {
197 goto out;
200 ret = glfs_set_logging(glfs, "-", gconf->debug_level);
201 if (ret < 0) {
202 goto out;
205 ret = glfs_init(glfs);
206 if (ret) {
207 error_setg_errno(errp, errno,
208 "Gluster connection failed for server=%s port=%d "
209 "volume=%s image=%s transport=%s", gconf->server,
210 gconf->port, gconf->volname, gconf->image,
211 gconf->transport);
213 /* glfs_init sometimes doesn't set errno although docs suggest that */
214 if (errno == 0)
215 errno = EINVAL;
217 goto out;
219 return glfs;
221 out:
222 if (glfs) {
223 old_errno = errno;
224 glfs_fini(glfs);
225 errno = old_errno;
227 return NULL;
230 static void qemu_gluster_complete_aio(void *opaque)
232 GlusterAIOCB *acb = (GlusterAIOCB *)opaque;
234 qemu_bh_delete(acb->bh);
235 acb->bh = NULL;
236 qemu_coroutine_enter(acb->coroutine, NULL);
240 * AIO callback routine called from GlusterFS thread.
242 static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
244 GlusterAIOCB *acb = (GlusterAIOCB *)arg;
246 if (!ret || ret == acb->size) {
247 acb->ret = 0; /* Success */
248 } else if (ret < 0) {
249 acb->ret = -errno; /* Read/Write failed */
250 } else {
251 acb->ret = -EIO; /* Partial read/write - fail it */
254 acb->bh = aio_bh_new(acb->aio_context, qemu_gluster_complete_aio, acb);
255 qemu_bh_schedule(acb->bh);
/* Option names understood by this driver */
#define GLUSTER_OPT_FILENAME        "filename"
#define GLUSTER_OPT_DEBUG           "debug"

/* Log level used when "debug" is absent, and the highest accepted level */
#define GLUSTER_DEBUG_DEFAULT       4
#define GLUSTER_DEBUG_MAX           9
263 /* TODO Convert to fine grained options */
264 static QemuOptsList runtime_opts = {
265 .name = "gluster",
266 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
267 .desc = {
269 .name = GLUSTER_OPT_FILENAME,
270 .type = QEMU_OPT_STRING,
271 .help = "URL to the gluster image",
274 .name = GLUSTER_OPT_DEBUG,
275 .type = QEMU_OPT_NUMBER,
276 .help = "Gluster log level, valid range is 0-9",
278 { /* end of list */ }
282 static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
284 assert(open_flags != NULL);
286 *open_flags |= O_BINARY;
288 if (bdrv_flags & BDRV_O_RDWR) {
289 *open_flags |= O_RDWR;
290 } else {
291 *open_flags |= O_RDONLY;
294 if ((bdrv_flags & BDRV_O_NOCACHE)) {
295 *open_flags |= O_DIRECT;
300 * Do SEEK_DATA/HOLE to detect if it is functional. Older broken versions of
301 * gfapi incorrectly return the current offset when SEEK_DATA/HOLE is used.
302 * - Corrected versions return -1 and set errno to EINVAL.
303 * - Versions that support SEEK_DATA/HOLE correctly, will return -1 and set
304 * errno to ENXIO when SEEK_DATA is called with a position of EOF.
306 static bool qemu_gluster_test_seek(struct glfs_fd *fd)
308 off_t ret, eof;
310 eof = glfs_lseek(fd, 0, SEEK_END);
311 if (eof < 0) {
312 /* this should never occur */
313 return false;
316 /* this should always fail with ENXIO if SEEK_DATA is supported */
317 ret = glfs_lseek(fd, eof, SEEK_DATA);
318 return (ret < 0) && (errno == ENXIO);
321 static int qemu_gluster_open(BlockDriverState *bs, QDict *options,
322 int bdrv_flags, Error **errp)
324 BDRVGlusterState *s = bs->opaque;
325 int open_flags = 0;
326 int ret = 0;
327 GlusterConf *gconf = g_new0(GlusterConf, 1);
328 QemuOpts *opts;
329 Error *local_err = NULL;
330 const char *filename;
332 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
333 qemu_opts_absorb_qdict(opts, options, &local_err);
334 if (local_err) {
335 error_propagate(errp, local_err);
336 ret = -EINVAL;
337 goto out;
340 filename = qemu_opt_get(opts, GLUSTER_OPT_FILENAME);
342 s->debug_level = qemu_opt_get_number(opts, GLUSTER_OPT_DEBUG,
343 GLUSTER_DEBUG_DEFAULT);
344 if (s->debug_level < 0) {
345 s->debug_level = 0;
346 } else if (s->debug_level > GLUSTER_DEBUG_MAX) {
347 s->debug_level = GLUSTER_DEBUG_MAX;
350 gconf->debug_level = s->debug_level;
351 s->glfs = qemu_gluster_init(gconf, filename, errp);
352 if (!s->glfs) {
353 ret = -errno;
354 goto out;
357 #ifdef CONFIG_GLUSTERFS_XLATOR_OPT
358 /* Without this, if fsync fails for a recoverable reason (for instance,
359 * ENOSPC), gluster will dump its cache, preventing retries. This means
360 * almost certain data loss. Not all gluster versions support the
361 * 'resync-failed-syncs-after-fsync' key value, but there is no way to
362 * discover during runtime if it is supported (this api returns success for
363 * unknown key/value pairs) */
364 ret = glfs_set_xlator_option(s->glfs, "*-write-behind",
365 "resync-failed-syncs-after-fsync",
366 "on");
367 if (ret < 0) {
368 error_setg_errno(errp, errno, "Unable to set xlator key/value pair");
369 ret = -errno;
370 goto out;
372 #endif
374 qemu_gluster_parse_flags(bdrv_flags, &open_flags);
376 s->fd = glfs_open(s->glfs, gconf->image, open_flags);
377 if (!s->fd) {
378 ret = -errno;
381 s->supports_seek_data = qemu_gluster_test_seek(s->fd);
383 out:
384 qemu_opts_del(opts);
385 qemu_gluster_gconf_free(gconf);
386 if (!ret) {
387 return ret;
389 if (s->fd) {
390 glfs_close(s->fd);
392 if (s->glfs) {
393 glfs_fini(s->glfs);
395 return ret;
/*
 * Connection and descriptor opened by qemu_gluster_reopen_prepare(), held
 * in BDRVReopenState.opaque until the reopen is committed or aborted.
 */
typedef struct BDRVGlusterReopenState {
    struct glfs *glfs;      /* new connection handle */
    struct glfs_fd *fd;     /* new image descriptor */
} BDRVGlusterReopenState;
404 static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
405 BlockReopenQueue *queue, Error **errp)
407 int ret = 0;
408 BDRVGlusterState *s;
409 BDRVGlusterReopenState *reop_s;
410 GlusterConf *gconf = NULL;
411 int open_flags = 0;
413 assert(state != NULL);
414 assert(state->bs != NULL);
416 s = state->bs->opaque;
418 state->opaque = g_new0(BDRVGlusterReopenState, 1);
419 reop_s = state->opaque;
421 qemu_gluster_parse_flags(state->flags, &open_flags);
423 gconf = g_new0(GlusterConf, 1);
425 gconf->debug_level = s->debug_level;
426 reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp);
427 if (reop_s->glfs == NULL) {
428 ret = -errno;
429 goto exit;
432 #ifdef CONFIG_GLUSTERFS_XLATOR_OPT
433 ret = glfs_set_xlator_option(reop_s->glfs, "*-write-behind",
434 "resync-failed-syncs-after-fsync", "on");
435 if (ret < 0) {
436 error_setg_errno(errp, errno, "Unable to set xlator key/value pair");
437 ret = -errno;
438 goto exit;
440 #endif
442 reop_s->fd = glfs_open(reop_s->glfs, gconf->image, open_flags);
443 if (reop_s->fd == NULL) {
444 /* reops->glfs will be cleaned up in _abort */
445 ret = -errno;
446 goto exit;
449 exit:
450 /* state->opaque will be freed in either the _abort or _commit */
451 qemu_gluster_gconf_free(gconf);
452 return ret;
455 static void qemu_gluster_reopen_commit(BDRVReopenState *state)
457 BDRVGlusterReopenState *reop_s = state->opaque;
458 BDRVGlusterState *s = state->bs->opaque;
461 /* close the old */
462 if (s->fd) {
463 glfs_close(s->fd);
465 if (s->glfs) {
466 glfs_fini(s->glfs);
469 /* use the newly opened image / connection */
470 s->fd = reop_s->fd;
471 s->glfs = reop_s->glfs;
473 g_free(state->opaque);
474 state->opaque = NULL;
476 return;
480 static void qemu_gluster_reopen_abort(BDRVReopenState *state)
482 BDRVGlusterReopenState *reop_s = state->opaque;
484 if (reop_s == NULL) {
485 return;
488 if (reop_s->fd) {
489 glfs_close(reop_s->fd);
492 if (reop_s->glfs) {
493 glfs_fini(reop_s->glfs);
496 g_free(state->opaque);
497 state->opaque = NULL;
499 return;
#ifdef CONFIG_GLUSTERFS_ZEROFILL
/*
 * Zero @size bytes starting at @offset with glfs_zerofill_async() and wait
 * for completion.  Returns 0 on success or a negative errno.
 */
static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
        int64_t offset, int size, BdrvRequestFlags flags)
{
    BDRVGlusterState *s = bs->opaque;
    GlusterAIOCB acb;
    int ret;

    acb.size = size;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    ret = glfs_zerofill_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
    if (ret < 0) {
        return -errno;
    }

    /* resumed by qemu_gluster_complete_aio() once the callback has run */
    qemu_coroutine_yield();
    return acb.ret;
}

/* The zerofill API is available in this build. */
static inline bool gluster_supports_zerofill(void)
{
    return true;
}

/* Synchronously zero @size bytes at @offset; forwards glfs_zerofill(). */
static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
        int64_t size)
{
    return glfs_zerofill(fd, offset, size);
}

#else
/* The zerofill API is not available in this build. */
static inline bool gluster_supports_zerofill(void)
{
    return false;
}

/* Stub used without zerofill support: does nothing and reports success. */
static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
        int64_t size)
{
    return 0;
}
#endif
548 static int qemu_gluster_create(const char *filename,
549 QemuOpts *opts, Error **errp)
551 struct glfs *glfs;
552 struct glfs_fd *fd;
553 int ret = 0;
554 int prealloc = 0;
555 int64_t total_size = 0;
556 char *tmp = NULL;
557 GlusterConf *gconf = g_new0(GlusterConf, 1);
559 gconf->debug_level = qemu_opt_get_number_del(opts, GLUSTER_OPT_DEBUG,
560 GLUSTER_DEBUG_DEFAULT);
561 if (gconf->debug_level < 0) {
562 gconf->debug_level = 0;
563 } else if (gconf->debug_level > GLUSTER_DEBUG_MAX) {
564 gconf->debug_level = GLUSTER_DEBUG_MAX;
567 glfs = qemu_gluster_init(gconf, filename, errp);
568 if (!glfs) {
569 ret = -errno;
570 goto out;
573 total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
574 BDRV_SECTOR_SIZE);
576 tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
577 if (!tmp || !strcmp(tmp, "off")) {
578 prealloc = 0;
579 } else if (!strcmp(tmp, "full") &&
580 gluster_supports_zerofill()) {
581 prealloc = 1;
582 } else {
583 error_setg(errp, "Invalid preallocation mode: '%s'"
584 " or GlusterFS doesn't support zerofill API",
585 tmp);
586 ret = -EINVAL;
587 goto out;
590 fd = glfs_creat(glfs, gconf->image,
591 O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
592 if (!fd) {
593 ret = -errno;
594 } else {
595 if (!glfs_ftruncate(fd, total_size)) {
596 if (prealloc && qemu_gluster_zerofill(fd, 0, total_size)) {
597 ret = -errno;
599 } else {
600 ret = -errno;
603 if (glfs_close(fd) != 0) {
604 ret = -errno;
607 out:
608 g_free(tmp);
609 qemu_gluster_gconf_free(gconf);
610 if (glfs) {
611 glfs_fini(glfs);
613 return ret;
616 static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
617 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write)
619 int ret;
620 GlusterAIOCB acb;
621 BDRVGlusterState *s = bs->opaque;
622 size_t size = nb_sectors * BDRV_SECTOR_SIZE;
623 off_t offset = sector_num * BDRV_SECTOR_SIZE;
625 acb.size = size;
626 acb.ret = 0;
627 acb.coroutine = qemu_coroutine_self();
628 acb.aio_context = bdrv_get_aio_context(bs);
630 if (write) {
631 ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
632 gluster_finish_aiocb, &acb);
633 } else {
634 ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
635 gluster_finish_aiocb, &acb);
638 if (ret < 0) {
639 return -errno;
642 qemu_coroutine_yield();
643 return acb.ret;
646 static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
648 int ret;
649 BDRVGlusterState *s = bs->opaque;
651 ret = glfs_ftruncate(s->fd, offset);
652 if (ret < 0) {
653 return -errno;
656 return 0;
659 static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs,
660 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
662 return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0);
665 static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
666 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
668 return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
671 static void qemu_gluster_close(BlockDriverState *bs)
673 BDRVGlusterState *s = bs->opaque;
675 if (s->fd) {
676 glfs_close(s->fd);
677 s->fd = NULL;
679 glfs_fini(s->glfs);
682 static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
684 int ret;
685 GlusterAIOCB acb;
686 BDRVGlusterState *s = bs->opaque;
688 acb.size = 0;
689 acb.ret = 0;
690 acb.coroutine = qemu_coroutine_self();
691 acb.aio_context = bdrv_get_aio_context(bs);
693 ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb);
694 if (ret < 0) {
695 ret = -errno;
696 goto error;
699 qemu_coroutine_yield();
700 if (acb.ret < 0) {
701 ret = acb.ret;
702 goto error;
705 return acb.ret;
707 error:
708 /* Some versions of Gluster (3.5.6 -> 3.5.8?) will not retain its cache
709 * after a fsync failure, so we have no way of allowing the guest to safely
710 * continue. Gluster versions prior to 3.5.6 don't retain the cache
711 * either, but will invalidate the fd on error, so this is again our only
712 * option.
714 * The 'resync-failed-syncs-after-fsync' xlator option for the
715 * write-behind cache will cause later gluster versions to retain its
716 * cache after error, so long as the fd remains open. However, we
717 * currently have no way of knowing if this option is supported.
719 * TODO: Once gluster provides a way for us to determine if the option
720 * is supported, bypass the closure and setting drv to NULL. */
721 qemu_gluster_close(bs);
722 bs->drv = NULL;
723 return ret;
#ifdef CONFIG_GLUSTERFS_DISCARD
/*
 * Discard @nb_sectors sectors starting at @sector_num with
 * glfs_discard_async() and wait for completion.  Returns 0 on success or a
 * negative errno.
 */
static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BDRVGlusterState *s = bs->opaque;
    size_t nbytes = nb_sectors * BDRV_SECTOR_SIZE;
    off_t pos = sector_num * BDRV_SECTOR_SIZE;
    GlusterAIOCB acb;
    int rc;

    /* the callback treats a result of 0 as success, so no size is expected */
    acb.size = 0;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    rc = glfs_discard_async(s->fd, pos, nbytes, gluster_finish_aiocb, &acb);
    if (rc < 0) {
        return -errno;
    }

    qemu_coroutine_yield();
    return acb.ret;
}
#endif
751 static int64_t qemu_gluster_getlength(BlockDriverState *bs)
753 BDRVGlusterState *s = bs->opaque;
754 int64_t ret;
756 ret = glfs_lseek(s->fd, 0, SEEK_END);
757 if (ret < 0) {
758 return -errno;
759 } else {
760 return ret;
764 static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs)
766 BDRVGlusterState *s = bs->opaque;
767 struct stat st;
768 int ret;
770 ret = glfs_fstat(s->fd, &st);
771 if (ret < 0) {
772 return -errno;
773 } else {
774 return st.st_blocks * 512;
778 static int qemu_gluster_has_zero_init(BlockDriverState *bs)
780 /* GlusterFS volume could be backed by a block device */
781 return 0;
785 * Find allocation range in @bs around offset @start.
786 * May change underlying file descriptor's file offset.
787 * If @start is not in a hole, store @start in @data, and the
788 * beginning of the next hole in @hole, and return 0.
789 * If @start is in a non-trailing hole, store @start in @hole and the
790 * beginning of the next non-hole in @data, and return 0.
791 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
792 * If we can't find out, return a negative errno other than -ENXIO.
794 * (Shamefully copied from raw-posix.c, only miniscule adaptions.)
796 static int find_allocation(BlockDriverState *bs, off_t start,
797 off_t *data, off_t *hole)
799 BDRVGlusterState *s = bs->opaque;
800 off_t offs;
802 if (!s->supports_seek_data) {
803 return -ENOTSUP;
807 * SEEK_DATA cases:
808 * D1. offs == start: start is in data
809 * D2. offs > start: start is in a hole, next data at offs
810 * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
811 * or start is beyond EOF
812 * If the latter happens, the file has been truncated behind
813 * our back since we opened it. All bets are off then.
814 * Treating like a trailing hole is simplest.
815 * D4. offs < 0, errno != ENXIO: we learned nothing
817 offs = glfs_lseek(s->fd, start, SEEK_DATA);
818 if (offs < 0) {
819 return -errno; /* D3 or D4 */
821 assert(offs >= start);
823 if (offs > start) {
824 /* D2: in hole, next data at offs */
825 *hole = start;
826 *data = offs;
827 return 0;
830 /* D1: in data, end not yet known */
833 * SEEK_HOLE cases:
834 * H1. offs == start: start is in a hole
835 * If this happens here, a hole has been dug behind our back
836 * since the previous lseek().
837 * H2. offs > start: either start is in data, next hole at offs,
838 * or start is in trailing hole, EOF at offs
839 * Linux treats trailing holes like any other hole: offs ==
840 * start. Solaris seeks to EOF instead: offs > start (blech).
841 * If that happens here, a hole has been dug behind our back
842 * since the previous lseek().
843 * H3. offs < 0, errno = ENXIO: start is beyond EOF
844 * If this happens, the file has been truncated behind our
845 * back since we opened it. Treat it like a trailing hole.
846 * H4. offs < 0, errno != ENXIO: we learned nothing
847 * Pretend we know nothing at all, i.e. "forget" about D1.
849 offs = glfs_lseek(s->fd, start, SEEK_HOLE);
850 if (offs < 0) {
851 return -errno; /* D1 and (H3 or H4) */
853 assert(offs >= start);
855 if (offs > start) {
857 * D1 and H2: either in data, next hole at offs, or it was in
858 * data but is now in a trailing hole. In the latter case,
859 * all bets are off. Treating it as if it there was data all
860 * the way to EOF is safe, so simply do that.
862 *data = start;
863 *hole = offs;
864 return 0;
867 /* D1 and H1 */
868 return -EBUSY;
872 * Returns the allocation status of the specified sectors.
874 * If 'sector_num' is beyond the end of the disk image the return value is 0
875 * and 'pnum' is set to 0.
877 * 'pnum' is set to the number of sectors (including and immediately following
878 * the specified sector) that are known to be in the same
879 * allocated/unallocated state.
881 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
882 * beyond the end of the disk image it will be clamped.
884 * (Based on raw_co_get_block_status() from raw-posix.c.)
886 static int64_t coroutine_fn qemu_gluster_co_get_block_status(
887 BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
888 BlockDriverState **file)
890 BDRVGlusterState *s = bs->opaque;
891 off_t start, data = 0, hole = 0;
892 int64_t total_size;
893 int ret = -EINVAL;
895 if (!s->fd) {
896 return ret;
899 start = sector_num * BDRV_SECTOR_SIZE;
900 total_size = bdrv_getlength(bs);
901 if (total_size < 0) {
902 return total_size;
903 } else if (start >= total_size) {
904 *pnum = 0;
905 return 0;
906 } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
907 nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
910 ret = find_allocation(bs, start, &data, &hole);
911 if (ret == -ENXIO) {
912 /* Trailing hole */
913 *pnum = nb_sectors;
914 ret = BDRV_BLOCK_ZERO;
915 } else if (ret < 0) {
916 /* No info available, so pretend there are no holes */
917 *pnum = nb_sectors;
918 ret = BDRV_BLOCK_DATA;
919 } else if (data == start) {
920 /* On a data extent, compute sectors to the end of the extent,
921 * possibly including a partial sector at EOF. */
922 *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
923 ret = BDRV_BLOCK_DATA;
924 } else {
925 /* On a hole, compute sectors to the beginning of the next extent. */
926 assert(hole == start);
927 *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
928 ret = BDRV_BLOCK_ZERO;
931 *file = bs;
933 return ret | BDRV_BLOCK_OFFSET_VALID | start;
937 static QemuOptsList qemu_gluster_create_opts = {
938 .name = "qemu-gluster-create-opts",
939 .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head),
940 .desc = {
942 .name = BLOCK_OPT_SIZE,
943 .type = QEMU_OPT_SIZE,
944 .help = "Virtual disk size"
947 .name = BLOCK_OPT_PREALLOC,
948 .type = QEMU_OPT_STRING,
949 .help = "Preallocation mode (allowed values: off, full)"
952 .name = GLUSTER_OPT_DEBUG,
953 .type = QEMU_OPT_NUMBER,
954 .help = "Gluster log level, valid range is 0-9",
956 { /* end of list */ }
960 static BlockDriver bdrv_gluster = {
961 .format_name = "gluster",
962 .protocol_name = "gluster",
963 .instance_size = sizeof(BDRVGlusterState),
964 .bdrv_needs_filename = true,
965 .bdrv_file_open = qemu_gluster_open,
966 .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
967 .bdrv_reopen_commit = qemu_gluster_reopen_commit,
968 .bdrv_reopen_abort = qemu_gluster_reopen_abort,
969 .bdrv_close = qemu_gluster_close,
970 .bdrv_create = qemu_gluster_create,
971 .bdrv_getlength = qemu_gluster_getlength,
972 .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
973 .bdrv_truncate = qemu_gluster_truncate,
974 .bdrv_co_readv = qemu_gluster_co_readv,
975 .bdrv_co_writev = qemu_gluster_co_writev,
976 .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
977 .bdrv_has_zero_init = qemu_gluster_has_zero_init,
978 #ifdef CONFIG_GLUSTERFS_DISCARD
979 .bdrv_co_discard = qemu_gluster_co_discard,
980 #endif
981 #ifdef CONFIG_GLUSTERFS_ZEROFILL
982 .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes,
983 #endif
984 .bdrv_co_get_block_status = qemu_gluster_co_get_block_status,
985 .create_opts = &qemu_gluster_create_opts,
988 static BlockDriver bdrv_gluster_tcp = {
989 .format_name = "gluster",
990 .protocol_name = "gluster+tcp",
991 .instance_size = sizeof(BDRVGlusterState),
992 .bdrv_needs_filename = true,
993 .bdrv_file_open = qemu_gluster_open,
994 .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
995 .bdrv_reopen_commit = qemu_gluster_reopen_commit,
996 .bdrv_reopen_abort = qemu_gluster_reopen_abort,
997 .bdrv_close = qemu_gluster_close,
998 .bdrv_create = qemu_gluster_create,
999 .bdrv_getlength = qemu_gluster_getlength,
1000 .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
1001 .bdrv_truncate = qemu_gluster_truncate,
1002 .bdrv_co_readv = qemu_gluster_co_readv,
1003 .bdrv_co_writev = qemu_gluster_co_writev,
1004 .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
1005 .bdrv_has_zero_init = qemu_gluster_has_zero_init,
1006 #ifdef CONFIG_GLUSTERFS_DISCARD
1007 .bdrv_co_discard = qemu_gluster_co_discard,
1008 #endif
1009 #ifdef CONFIG_GLUSTERFS_ZEROFILL
1010 .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes,
1011 #endif
1012 .bdrv_co_get_block_status = qemu_gluster_co_get_block_status,
1013 .create_opts = &qemu_gluster_create_opts,
1016 static BlockDriver bdrv_gluster_unix = {
1017 .format_name = "gluster",
1018 .protocol_name = "gluster+unix",
1019 .instance_size = sizeof(BDRVGlusterState),
1020 .bdrv_needs_filename = true,
1021 .bdrv_file_open = qemu_gluster_open,
1022 .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
1023 .bdrv_reopen_commit = qemu_gluster_reopen_commit,
1024 .bdrv_reopen_abort = qemu_gluster_reopen_abort,
1025 .bdrv_close = qemu_gluster_close,
1026 .bdrv_create = qemu_gluster_create,
1027 .bdrv_getlength = qemu_gluster_getlength,
1028 .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
1029 .bdrv_truncate = qemu_gluster_truncate,
1030 .bdrv_co_readv = qemu_gluster_co_readv,
1031 .bdrv_co_writev = qemu_gluster_co_writev,
1032 .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
1033 .bdrv_has_zero_init = qemu_gluster_has_zero_init,
1034 #ifdef CONFIG_GLUSTERFS_DISCARD
1035 .bdrv_co_discard = qemu_gluster_co_discard,
1036 #endif
1037 #ifdef CONFIG_GLUSTERFS_ZEROFILL
1038 .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes,
1039 #endif
1040 .bdrv_co_get_block_status = qemu_gluster_co_get_block_status,
1041 .create_opts = &qemu_gluster_create_opts,
1044 static BlockDriver bdrv_gluster_rdma = {
1045 .format_name = "gluster",
1046 .protocol_name = "gluster+rdma",
1047 .instance_size = sizeof(BDRVGlusterState),
1048 .bdrv_needs_filename = true,
1049 .bdrv_file_open = qemu_gluster_open,
1050 .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
1051 .bdrv_reopen_commit = qemu_gluster_reopen_commit,
1052 .bdrv_reopen_abort = qemu_gluster_reopen_abort,
1053 .bdrv_close = qemu_gluster_close,
1054 .bdrv_create = qemu_gluster_create,
1055 .bdrv_getlength = qemu_gluster_getlength,
1056 .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
1057 .bdrv_truncate = qemu_gluster_truncate,
1058 .bdrv_co_readv = qemu_gluster_co_readv,
1059 .bdrv_co_writev = qemu_gluster_co_writev,
1060 .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
1061 .bdrv_has_zero_init = qemu_gluster_has_zero_init,
1062 #ifdef CONFIG_GLUSTERFS_DISCARD
1063 .bdrv_co_discard = qemu_gluster_co_discard,
1064 #endif
1065 #ifdef CONFIG_GLUSTERFS_ZEROFILL
1066 .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes,
1067 #endif
1068 .bdrv_co_get_block_status = qemu_gluster_co_get_block_status,
1069 .create_opts = &qemu_gluster_create_opts,
1072 static void bdrv_gluster_init(void)
1074 bdrv_register(&bdrv_gluster_rdma);
1075 bdrv_register(&bdrv_gluster_unix);
1076 bdrv_register(&bdrv_gluster_tcp);
1077 bdrv_register(&bdrv_gluster);
1080 block_init(bdrv_gluster_init);