rbd: Pass BlockdevOptionsRbd to qemu_rbd_connect()
[qemu/ar7.git] / block / rbd.c
blob999fea105f2f5410ce3c6e4d41dc2f6ed933f5b8
/*
 * QEMU Block driver for RADOS (Ceph)
 *
 * Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>,
 *                         Josh Durgin <josh.durgin@dreamhost.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */
14 #include "qemu/osdep.h"
16 #include <rbd/librbd.h>
17 #include "qapi/error.h"
18 #include "qemu/error-report.h"
19 #include "qemu/option.h"
20 #include "block/block_int.h"
21 #include "crypto/secret.h"
22 #include "qemu/cutils.h"
23 #include "qapi/qmp/qstring.h"
24 #include "qapi/qmp/qdict.h"
25 #include "qapi/qmp/qjson.h"
26 #include "qapi/qmp/qlist.h"
27 #include "qapi/qobject-input-visitor.h"
28 #include "qapi/qapi-visit-block-core.h"
/*
 * When specifying the image filename use:
 *
 * rbd:poolname/devicename[@snapshotname][:option1=value1[:option2=value2...]]
 *
 * poolname must be the name of an existing rados pool.
 *
 * devicename is the name of the rbd image.
 *
 * Each option given is used to configure rados, and may be any valid
 * Ceph option, "id", or "conf".
 *
 * The "id" option indicates what user we should authenticate as to
 * the Ceph cluster.  If it is excluded we will use the Ceph default
 * (normally 'admin').
 *
 * The "conf" option specifies a Ceph configuration file to read.  If
 * it is not specified, we will read from the default Ceph locations
 * (e.g., /etc/ceph/ceph.conf).  To avoid reading _any_ configuration
 * file, specify conf=/dev/null.
 *
 * Configuration values containing :, @, or = can be escaped with a
 * leading "\".
 */
55 /* rbd_aio_discard added in 0.1.2 */
56 #if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 2)
57 #define LIBRBD_SUPPORTS_DISCARD
58 #else
59 #undef LIBRBD_SUPPORTS_DISCARD
60 #endif
62 #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
64 #define RBD_MAX_SNAPS 100
66 /* The LIBRBD_SUPPORTS_IOVEC is defined in librbd.h */
67 #ifdef LIBRBD_SUPPORTS_IOVEC
68 #define LIBRBD_USE_IOVEC 1
69 #else
70 #define LIBRBD_USE_IOVEC 0
71 #endif
/* Kind of asynchronous request submitted through rbd_start_aio(). */
typedef enum {
    RBD_AIO_READ,       /* read into the request's I/O vector */
    RBD_AIO_WRITE,      /* write from the request's I/O vector */
    RBD_AIO_DISCARD,    /* discard a byte range */
    RBD_AIO_FLUSH       /* flush the image */
} RBDAIOCmd;
80 typedef struct RBDAIOCB {
81 BlockAIOCB common;
82 int64_t ret;
83 QEMUIOVector *qiov;
84 char *bounce;
85 RBDAIOCmd cmd;
86 int error;
87 struct BDRVRBDState *s;
88 } RBDAIOCB;
90 typedef struct RADOSCB {
91 RBDAIOCB *acb;
92 struct BDRVRBDState *s;
93 int64_t size;
94 char *buf;
95 int64_t ret;
96 } RADOSCB;
98 typedef struct BDRVRBDState {
99 rados_t cluster;
100 rados_ioctx_t io_ctx;
101 rbd_image_t image;
102 char *image_name;
103 char *snap;
104 } BDRVRBDState;
/*
 * Split off the leading token of @src, in place.
 *
 * The token ends at the first @delim that is not preceded by a backslash.
 * That delimiter is overwritten with NUL and *@p is set to the character
 * following it.  When no unescaped delimiter exists, @src is left untouched
 * and *@p is set to NULL.  Escape sequences are kept as-is in the token; use
 * qemu_rbd_unescape() to remove them.
 *
 * Returns @src.
 */
static char *qemu_rbd_next_tok(char *src, char delim, char **p)
{
    char *cur = src;

    *p = NULL;

    while (*cur != '\0' && *cur != delim) {
        /* A backslash shields the next character from being a delimiter. */
        if (*cur == '\\' && cur[1] != '\0') {
            cur++;
        }
        cur++;
    }

    if (*cur == delim) {
        *p = cur + 1;
        *cur = '\0';
    }
    return src;
}
/*
 * Remove backslash escapes from @src in place.
 *
 * Each backslash that is followed by another character is dropped and the
 * following character is kept literally.  A backslash that is the last
 * character of the string is kept.
 */
static void qemu_rbd_unescape(char *src)
{
    char *dst = src;

    while (*src != '\0') {
        if (src[0] == '\\' && src[1] != '\0') {
            src++;
        }
        *dst++ = *src++;
    }
    *dst = '\0';
}
140 static void qemu_rbd_parse_filename(const char *filename, QDict *options,
141 Error **errp)
143 const char *start;
144 char *p, *buf;
145 QList *keypairs = NULL;
146 char *found_str;
148 if (!strstart(filename, "rbd:", &start)) {
149 error_setg(errp, "File name must start with 'rbd:'");
150 return;
153 buf = g_strdup(start);
154 p = buf;
156 found_str = qemu_rbd_next_tok(p, '/', &p);
157 if (!p) {
158 error_setg(errp, "Pool name is required");
159 goto done;
161 qemu_rbd_unescape(found_str);
162 qdict_put_str(options, "pool", found_str);
164 if (strchr(p, '@')) {
165 found_str = qemu_rbd_next_tok(p, '@', &p);
166 qemu_rbd_unescape(found_str);
167 qdict_put_str(options, "image", found_str);
169 found_str = qemu_rbd_next_tok(p, ':', &p);
170 qemu_rbd_unescape(found_str);
171 qdict_put_str(options, "snapshot", found_str);
172 } else {
173 found_str = qemu_rbd_next_tok(p, ':', &p);
174 qemu_rbd_unescape(found_str);
175 qdict_put_str(options, "image", found_str);
177 if (!p) {
178 goto done;
181 /* The following are essentially all key/value pairs, and we treat
182 * 'id' and 'conf' a bit special. Key/value pairs may be in any order. */
183 while (p) {
184 char *name, *value;
185 name = qemu_rbd_next_tok(p, '=', &p);
186 if (!p) {
187 error_setg(errp, "conf option %s has no value", name);
188 break;
191 qemu_rbd_unescape(name);
193 value = qemu_rbd_next_tok(p, ':', &p);
194 qemu_rbd_unescape(value);
196 if (!strcmp(name, "conf")) {
197 qdict_put_str(options, "conf", value);
198 } else if (!strcmp(name, "id")) {
199 qdict_put_str(options, "user", value);
200 } else {
202 * We pass these internally to qemu_rbd_set_keypairs(), so
203 * we can get away with the simpler list of [ "key1",
204 * "value1", "key2", "value2" ] rather than a raw dict
205 * { "key1": "value1", "key2": "value2" } where we can't
206 * guarantee order, or even a more correct but complex
207 * [ { "key1": "value1" }, { "key2": "value2" } ]
209 if (!keypairs) {
210 keypairs = qlist_new();
212 qlist_append_str(keypairs, name);
213 qlist_append_str(keypairs, value);
217 if (keypairs) {
218 qdict_put(options, "=keyvalue-pairs",
219 qobject_to_json(QOBJECT(keypairs)));
222 done:
223 g_free(buf);
224 QDECREF(keypairs);
225 return;
229 static int qemu_rbd_set_auth(rados_t cluster, const char *secretid,
230 Error **errp)
232 if (secretid == 0) {
233 return 0;
236 gchar *secret = qcrypto_secret_lookup_as_base64(secretid,
237 errp);
238 if (!secret) {
239 return -1;
242 rados_conf_set(cluster, "key", secret);
243 g_free(secret);
245 return 0;
248 static int qemu_rbd_set_keypairs(rados_t cluster, const char *keypairs_json,
249 Error **errp)
251 QList *keypairs;
252 QString *name;
253 QString *value;
254 const char *key;
255 size_t remaining;
256 int ret = 0;
258 if (!keypairs_json) {
259 return ret;
261 keypairs = qobject_to_qlist(qobject_from_json(keypairs_json,
262 &error_abort));
263 remaining = qlist_size(keypairs) / 2;
264 assert(remaining);
266 while (remaining--) {
267 name = qobject_to_qstring(qlist_pop(keypairs));
268 value = qobject_to_qstring(qlist_pop(keypairs));
269 assert(name && value);
270 key = qstring_get_str(name);
272 ret = rados_conf_set(cluster, key, qstring_get_str(value));
273 QDECREF(value);
274 if (ret < 0) {
275 error_setg_errno(errp, -ret, "invalid conf option %s", key);
276 QDECREF(name);
277 ret = -EINVAL;
278 break;
280 QDECREF(name);
283 QDECREF(keypairs);
284 return ret;
287 static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
289 if (LIBRBD_USE_IOVEC) {
290 RBDAIOCB *acb = rcb->acb;
291 iov_memset(acb->qiov->iov, acb->qiov->niov, offs, 0,
292 acb->qiov->size - offs);
293 } else {
294 memset(rcb->buf + offs, 0, rcb->size - offs);
298 static QemuOptsList runtime_opts = {
299 .name = "rbd",
300 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
301 .desc = {
303 .name = "pool",
304 .type = QEMU_OPT_STRING,
305 .help = "Rados pool name",
308 .name = "image",
309 .type = QEMU_OPT_STRING,
310 .help = "Image name in the pool",
313 .name = "conf",
314 .type = QEMU_OPT_STRING,
315 .help = "Rados config file location",
318 .name = "snapshot",
319 .type = QEMU_OPT_STRING,
320 .help = "Ceph snapshot name",
323 /* maps to 'id' in rados_create() */
324 .name = "user",
325 .type = QEMU_OPT_STRING,
326 .help = "Rados id name",
329 * server.* extracted manually, see qemu_rbd_mon_host()
331 { /* end of list */ }
335 static int coroutine_fn qemu_rbd_co_create_opts(const char *filename,
336 QemuOpts *opts,
337 Error **errp)
339 Error *local_err = NULL;
340 int64_t bytes = 0;
341 int64_t objsize;
342 int obj_order = 0;
343 const char *pool, *image_name, *conf, *user, *keypairs;
344 const char *secretid;
345 rados_t cluster;
346 rados_ioctx_t io_ctx;
347 QDict *options = NULL;
348 int ret = 0;
350 secretid = qemu_opt_get(opts, "password-secret");
352 /* Read out options */
353 bytes = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
354 BDRV_SECTOR_SIZE);
355 objsize = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, 0);
356 if (objsize) {
357 if ((objsize - 1) & objsize) { /* not a power of 2? */
358 error_setg(errp, "obj size needs to be power of 2");
359 ret = -EINVAL;
360 goto exit;
362 if (objsize < 4096) {
363 error_setg(errp, "obj size too small");
364 ret = -EINVAL;
365 goto exit;
367 obj_order = ctz32(objsize);
370 options = qdict_new();
371 qemu_rbd_parse_filename(filename, options, &local_err);
372 if (local_err) {
373 ret = -EINVAL;
374 error_propagate(errp, local_err);
375 goto exit;
379 * Caution: while qdict_get_try_str() is fine, getting non-string
380 * types would require more care. When @options come from -blockdev
381 * or blockdev_add, its members are typed according to the QAPI
382 * schema, but when they come from -drive, they're all QString.
384 pool = qdict_get_try_str(options, "pool");
385 conf = qdict_get_try_str(options, "conf");
386 user = qdict_get_try_str(options, "user");
387 image_name = qdict_get_try_str(options, "image");
388 keypairs = qdict_get_try_str(options, "=keyvalue-pairs");
390 ret = rados_create(&cluster, user);
391 if (ret < 0) {
392 error_setg_errno(errp, -ret, "error initializing");
393 goto exit;
396 /* try default location when conf=NULL, but ignore failure */
397 ret = rados_conf_read_file(cluster, conf);
398 if (conf && ret < 0) {
399 error_setg_errno(errp, -ret, "error reading conf file %s", conf);
400 ret = -EIO;
401 goto shutdown;
404 ret = qemu_rbd_set_keypairs(cluster, keypairs, errp);
405 if (ret < 0) {
406 ret = -EIO;
407 goto shutdown;
410 if (qemu_rbd_set_auth(cluster, secretid, errp) < 0) {
411 ret = -EIO;
412 goto shutdown;
415 ret = rados_connect(cluster);
416 if (ret < 0) {
417 error_setg_errno(errp, -ret, "error connecting");
418 goto shutdown;
421 ret = rados_ioctx_create(cluster, pool, &io_ctx);
422 if (ret < 0) {
423 error_setg_errno(errp, -ret, "error opening pool %s", pool);
424 goto shutdown;
427 ret = rbd_create(io_ctx, image_name, bytes, &obj_order);
428 if (ret < 0) {
429 error_setg_errno(errp, -ret, "error rbd create");
432 rados_ioctx_destroy(io_ctx);
434 shutdown:
435 rados_shutdown(cluster);
437 exit:
438 QDECREF(options);
439 return ret;
443 * This aio completion is being called from rbd_finish_bh() and runs in qemu
444 * BH context.
446 static void qemu_rbd_complete_aio(RADOSCB *rcb)
448 RBDAIOCB *acb = rcb->acb;
449 int64_t r;
451 r = rcb->ret;
453 if (acb->cmd != RBD_AIO_READ) {
454 if (r < 0) {
455 acb->ret = r;
456 acb->error = 1;
457 } else if (!acb->error) {
458 acb->ret = rcb->size;
460 } else {
461 if (r < 0) {
462 qemu_rbd_memset(rcb, 0);
463 acb->ret = r;
464 acb->error = 1;
465 } else if (r < rcb->size) {
466 qemu_rbd_memset(rcb, r);
467 if (!acb->error) {
468 acb->ret = rcb->size;
470 } else if (!acb->error) {
471 acb->ret = r;
475 g_free(rcb);
477 if (!LIBRBD_USE_IOVEC) {
478 if (acb->cmd == RBD_AIO_READ) {
479 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
481 qemu_vfree(acb->bounce);
484 acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
486 qemu_aio_unref(acb);
489 static char *qemu_rbd_mon_host(BlockdevOptionsRbd *opts, Error **errp)
491 const char **vals;
492 const char *host, *port;
493 char *rados_str;
494 InetSocketAddressBaseList *p;
495 int i, cnt;
497 if (!opts->has_server) {
498 return NULL;
501 for (cnt = 0, p = opts->server; p; p = p->next) {
502 cnt++;
505 vals = g_new(const char *, cnt + 1);
507 for (i = 0, p = opts->server; p; p = p->next, i++) {
508 host = p->value->host;
509 port = p->value->port;
511 if (strchr(host, ':')) {
512 vals[i] = g_strdup_printf("[%s]:%s", host, port);
513 } else {
514 vals[i] = g_strdup_printf("%s:%s", host, port);
517 vals[i] = NULL;
519 rados_str = i ? g_strjoinv(";", (char **)vals) : NULL;
520 g_strfreev((char **)vals);
521 return rados_str;
524 static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
525 char **s_snap, char **s_image_name,
526 BlockdevOptionsRbd *opts, bool cache,
527 const char *keypairs, const char *secretid,
528 Error **errp)
530 char *mon_host = NULL;
531 Error *local_err = NULL;
532 int r;
534 mon_host = qemu_rbd_mon_host(opts, &local_err);
535 if (local_err) {
536 error_propagate(errp, local_err);
537 r = -EINVAL;
538 goto failed_opts;
541 r = rados_create(cluster, opts->user);
542 if (r < 0) {
543 error_setg_errno(errp, -r, "error initializing");
544 goto failed_opts;
547 *s_snap = g_strdup(opts->snapshot);
548 *s_image_name = g_strdup(opts->image);
550 /* try default location when conf=NULL, but ignore failure */
551 r = rados_conf_read_file(*cluster, opts->conf);
552 if (opts->has_conf && r < 0) {
553 error_setg_errno(errp, -r, "error reading conf file %s", opts->conf);
554 goto failed_shutdown;
557 r = qemu_rbd_set_keypairs(*cluster, keypairs, errp);
558 if (r < 0) {
559 goto failed_shutdown;
562 if (mon_host) {
563 r = rados_conf_set(*cluster, "mon_host", mon_host);
564 if (r < 0) {
565 goto failed_shutdown;
569 if (qemu_rbd_set_auth(*cluster, secretid, errp) < 0) {
570 r = -EIO;
571 goto failed_shutdown;
575 * Fallback to more conservative semantics if setting cache
576 * options fails. Ignore errors from setting rbd_cache because the
577 * only possible error is that the option does not exist, and
578 * librbd defaults to no caching. If write through caching cannot
579 * be set up, fall back to no caching.
581 if (cache) {
582 rados_conf_set(*cluster, "rbd_cache", "true");
583 } else {
584 rados_conf_set(*cluster, "rbd_cache", "false");
587 r = rados_connect(*cluster);
588 if (r < 0) {
589 error_setg_errno(errp, -r, "error connecting");
590 goto failed_shutdown;
593 r = rados_ioctx_create(*cluster, opts->pool, io_ctx);
594 if (r < 0) {
595 error_setg_errno(errp, -r, "error opening pool %s", opts->pool);
596 goto failed_shutdown;
599 return 0;
601 failed_shutdown:
602 rados_shutdown(*cluster);
603 g_free(*s_snap);
604 g_free(*s_image_name);
605 failed_opts:
606 g_free(mon_host);
607 return r;
610 static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
611 Error **errp)
613 BDRVRBDState *s = bs->opaque;
614 BlockdevOptionsRbd *opts = NULL;
615 Visitor *v;
616 QObject *crumpled = NULL;
617 Error *local_err = NULL;
618 const char *filename;
619 char *keypairs, *secretid;
620 int r;
622 /* If we are given a filename, parse the filename, with precedence given to
623 * filename encoded options */
624 filename = qdict_get_try_str(options, "filename");
625 if (filename) {
626 warn_report("'filename' option specified. "
627 "This is an unsupported option, and may be deprecated "
628 "in the future");
629 qemu_rbd_parse_filename(filename, options, &local_err);
630 qdict_del(options, "filename");
631 if (local_err) {
632 error_propagate(errp, local_err);
633 return -EINVAL;
637 keypairs = g_strdup(qdict_get_try_str(options, "=keyvalue-pairs"));
638 if (keypairs) {
639 qdict_del(options, "=keyvalue-pairs");
642 secretid = g_strdup(qdict_get_try_str(options, "password-secret"));
643 if (secretid) {
644 qdict_del(options, "password-secret");
647 /* Convert the remaining options into a QAPI object */
648 crumpled = qdict_crumple(options, errp);
649 if (crumpled == NULL) {
650 r = -EINVAL;
651 goto out;
654 v = qobject_input_visitor_new_keyval(crumpled);
655 visit_type_BlockdevOptionsRbd(v, NULL, &opts, &local_err);
656 visit_free(v);
657 qobject_decref(crumpled);
659 if (local_err) {
660 error_propagate(errp, local_err);
661 r = -EINVAL;
662 goto out;
665 r = qemu_rbd_connect(&s->cluster, &s->io_ctx, &s->snap, &s->image_name,
666 opts, !(flags & BDRV_O_NOCACHE), keypairs, secretid,
667 errp);
668 if (r < 0) {
669 goto out;
672 /* rbd_open is always r/w */
673 r = rbd_open(s->io_ctx, s->image_name, &s->image, s->snap);
674 if (r < 0) {
675 error_setg_errno(errp, -r, "error reading header from %s",
676 s->image_name);
677 goto failed_open;
680 /* If we are using an rbd snapshot, we must be r/o, otherwise
681 * leave as-is */
682 if (s->snap != NULL) {
683 if (!bdrv_is_read_only(bs)) {
684 error_report("Opening rbd snapshots without an explicit "
685 "read-only=on option is deprecated. Future versions "
686 "will refuse to open the image instead of "
687 "automatically marking the image read-only.");
688 r = bdrv_set_read_only(bs, true, &local_err);
689 if (r < 0) {
690 error_propagate(errp, local_err);
691 goto failed_open;
696 r = 0;
697 goto out;
699 failed_open:
700 rados_ioctx_destroy(s->io_ctx);
701 g_free(s->snap);
702 g_free(s->image_name);
703 rados_shutdown(s->cluster);
704 out:
705 qapi_free_BlockdevOptionsRbd(opts);
706 g_free(keypairs);
707 g_free(secretid);
708 return r;
712 /* Since RBD is currently always opened R/W via the API,
713 * we just need to check if we are using a snapshot or not, in
714 * order to determine if we will allow it to be R/W */
715 static int qemu_rbd_reopen_prepare(BDRVReopenState *state,
716 BlockReopenQueue *queue, Error **errp)
718 BDRVRBDState *s = state->bs->opaque;
719 int ret = 0;
721 if (s->snap && state->flags & BDRV_O_RDWR) {
722 error_setg(errp,
723 "Cannot change node '%s' to r/w when using RBD snapshot",
724 bdrv_get_device_or_node_name(state->bs));
725 ret = -EINVAL;
728 return ret;
731 static void qemu_rbd_close(BlockDriverState *bs)
733 BDRVRBDState *s = bs->opaque;
735 rbd_close(s->image);
736 rados_ioctx_destroy(s->io_ctx);
737 g_free(s->snap);
738 g_free(s->image_name);
739 rados_shutdown(s->cluster);
742 static const AIOCBInfo rbd_aiocb_info = {
743 .aiocb_size = sizeof(RBDAIOCB),
746 static void rbd_finish_bh(void *opaque)
748 RADOSCB *rcb = opaque;
749 qemu_rbd_complete_aio(rcb);
753 * This is the callback function for rbd_aio_read and _write
755 * Note: this function is being called from a non qemu thread so
756 * we need to be careful about what we do here. Generally we only
757 * schedule a BH, and do the rest of the io completion handling
758 * from rbd_finish_bh() which runs in a qemu context.
760 static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
762 RBDAIOCB *acb = rcb->acb;
764 rcb->ret = rbd_aio_get_return_value(c);
765 rbd_aio_release(c);
767 aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs),
768 rbd_finish_bh, rcb);
771 static int rbd_aio_discard_wrapper(rbd_image_t image,
772 uint64_t off,
773 uint64_t len,
774 rbd_completion_t comp)
776 #ifdef LIBRBD_SUPPORTS_DISCARD
777 return rbd_aio_discard(image, off, len, comp);
778 #else
779 return -ENOTSUP;
780 #endif
783 static int rbd_aio_flush_wrapper(rbd_image_t image,
784 rbd_completion_t comp)
786 #ifdef LIBRBD_SUPPORTS_AIO_FLUSH
787 return rbd_aio_flush(image, comp);
788 #else
789 return -ENOTSUP;
790 #endif
793 static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
794 int64_t off,
795 QEMUIOVector *qiov,
796 int64_t size,
797 BlockCompletionFunc *cb,
798 void *opaque,
799 RBDAIOCmd cmd)
801 RBDAIOCB *acb;
802 RADOSCB *rcb = NULL;
803 rbd_completion_t c;
804 int r;
806 BDRVRBDState *s = bs->opaque;
808 acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque);
809 acb->cmd = cmd;
810 acb->qiov = qiov;
811 assert(!qiov || qiov->size == size);
813 rcb = g_new(RADOSCB, 1);
815 if (!LIBRBD_USE_IOVEC) {
816 if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) {
817 acb->bounce = NULL;
818 } else {
819 acb->bounce = qemu_try_blockalign(bs, qiov->size);
820 if (acb->bounce == NULL) {
821 goto failed;
824 if (cmd == RBD_AIO_WRITE) {
825 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
827 rcb->buf = acb->bounce;
830 acb->ret = 0;
831 acb->error = 0;
832 acb->s = s;
834 rcb->acb = acb;
835 rcb->s = acb->s;
836 rcb->size = size;
837 r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c);
838 if (r < 0) {
839 goto failed;
842 switch (cmd) {
843 case RBD_AIO_WRITE:
844 #ifdef LIBRBD_SUPPORTS_IOVEC
845 r = rbd_aio_writev(s->image, qiov->iov, qiov->niov, off, c);
846 #else
847 r = rbd_aio_write(s->image, off, size, rcb->buf, c);
848 #endif
849 break;
850 case RBD_AIO_READ:
851 #ifdef LIBRBD_SUPPORTS_IOVEC
852 r = rbd_aio_readv(s->image, qiov->iov, qiov->niov, off, c);
853 #else
854 r = rbd_aio_read(s->image, off, size, rcb->buf, c);
855 #endif
856 break;
857 case RBD_AIO_DISCARD:
858 r = rbd_aio_discard_wrapper(s->image, off, size, c);
859 break;
860 case RBD_AIO_FLUSH:
861 r = rbd_aio_flush_wrapper(s->image, c);
862 break;
863 default:
864 r = -EINVAL;
867 if (r < 0) {
868 goto failed_completion;
870 return &acb->common;
872 failed_completion:
873 rbd_aio_release(c);
874 failed:
875 g_free(rcb);
876 if (!LIBRBD_USE_IOVEC) {
877 qemu_vfree(acb->bounce);
880 qemu_aio_unref(acb);
881 return NULL;
884 static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
885 int64_t sector_num,
886 QEMUIOVector *qiov,
887 int nb_sectors,
888 BlockCompletionFunc *cb,
889 void *opaque)
891 return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
892 (int64_t) nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
893 RBD_AIO_READ);
896 static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
897 int64_t sector_num,
898 QEMUIOVector *qiov,
899 int nb_sectors,
900 BlockCompletionFunc *cb,
901 void *opaque)
903 return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
904 (int64_t) nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
905 RBD_AIO_WRITE);
908 #ifdef LIBRBD_SUPPORTS_AIO_FLUSH
909 static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
910 BlockCompletionFunc *cb,
911 void *opaque)
913 return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH);
916 #else
918 static int qemu_rbd_co_flush(BlockDriverState *bs)
920 #if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1)
921 /* rbd_flush added in 0.1.1 */
922 BDRVRBDState *s = bs->opaque;
923 return rbd_flush(s->image);
924 #else
925 return 0;
926 #endif
928 #endif
930 static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
932 BDRVRBDState *s = bs->opaque;
933 rbd_image_info_t info;
934 int r;
936 r = rbd_stat(s->image, &info, sizeof(info));
937 if (r < 0) {
938 return r;
941 bdi->cluster_size = info.obj_size;
942 return 0;
945 static int64_t qemu_rbd_getlength(BlockDriverState *bs)
947 BDRVRBDState *s = bs->opaque;
948 rbd_image_info_t info;
949 int r;
951 r = rbd_stat(s->image, &info, sizeof(info));
952 if (r < 0) {
953 return r;
956 return info.size;
959 static int qemu_rbd_truncate(BlockDriverState *bs, int64_t offset,
960 PreallocMode prealloc, Error **errp)
962 BDRVRBDState *s = bs->opaque;
963 int r;
965 if (prealloc != PREALLOC_MODE_OFF) {
966 error_setg(errp, "Unsupported preallocation mode '%s'",
967 PreallocMode_str(prealloc));
968 return -ENOTSUP;
971 r = rbd_resize(s->image, offset);
972 if (r < 0) {
973 error_setg_errno(errp, -r, "Failed to resize file");
974 return r;
977 return 0;
980 static int qemu_rbd_snap_create(BlockDriverState *bs,
981 QEMUSnapshotInfo *sn_info)
983 BDRVRBDState *s = bs->opaque;
984 int r;
986 if (sn_info->name[0] == '\0') {
987 return -EINVAL; /* we need a name for rbd snapshots */
991 * rbd snapshots are using the name as the user controlled unique identifier
992 * we can't use the rbd snapid for that purpose, as it can't be set
994 if (sn_info->id_str[0] != '\0' &&
995 strcmp(sn_info->id_str, sn_info->name) != 0) {
996 return -EINVAL;
999 if (strlen(sn_info->name) >= sizeof(sn_info->id_str)) {
1000 return -ERANGE;
1003 r = rbd_snap_create(s->image, sn_info->name);
1004 if (r < 0) {
1005 error_report("failed to create snap: %s", strerror(-r));
1006 return r;
1009 return 0;
1012 static int qemu_rbd_snap_remove(BlockDriverState *bs,
1013 const char *snapshot_id,
1014 const char *snapshot_name,
1015 Error **errp)
1017 BDRVRBDState *s = bs->opaque;
1018 int r;
1020 if (!snapshot_name) {
1021 error_setg(errp, "rbd need a valid snapshot name");
1022 return -EINVAL;
1025 /* If snapshot_id is specified, it must be equal to name, see
1026 qemu_rbd_snap_list() */
1027 if (snapshot_id && strcmp(snapshot_id, snapshot_name)) {
1028 error_setg(errp,
1029 "rbd do not support snapshot id, it should be NULL or "
1030 "equal to snapshot name");
1031 return -EINVAL;
1034 r = rbd_snap_remove(s->image, snapshot_name);
1035 if (r < 0) {
1036 error_setg_errno(errp, -r, "Failed to remove the snapshot");
1038 return r;
1041 static int qemu_rbd_snap_rollback(BlockDriverState *bs,
1042 const char *snapshot_name)
1044 BDRVRBDState *s = bs->opaque;
1046 return rbd_snap_rollback(s->image, snapshot_name);
1049 static int qemu_rbd_snap_list(BlockDriverState *bs,
1050 QEMUSnapshotInfo **psn_tab)
1052 BDRVRBDState *s = bs->opaque;
1053 QEMUSnapshotInfo *sn_info, *sn_tab = NULL;
1054 int i, snap_count;
1055 rbd_snap_info_t *snaps;
1056 int max_snaps = RBD_MAX_SNAPS;
1058 do {
1059 snaps = g_new(rbd_snap_info_t, max_snaps);
1060 snap_count = rbd_snap_list(s->image, snaps, &max_snaps);
1061 if (snap_count <= 0) {
1062 g_free(snaps);
1064 } while (snap_count == -ERANGE);
1066 if (snap_count <= 0) {
1067 goto done;
1070 sn_tab = g_new0(QEMUSnapshotInfo, snap_count);
1072 for (i = 0; i < snap_count; i++) {
1073 const char *snap_name = snaps[i].name;
1075 sn_info = sn_tab + i;
1076 pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), snap_name);
1077 pstrcpy(sn_info->name, sizeof(sn_info->name), snap_name);
1079 sn_info->vm_state_size = snaps[i].size;
1080 sn_info->date_sec = 0;
1081 sn_info->date_nsec = 0;
1082 sn_info->vm_clock_nsec = 0;
1084 rbd_snap_list_end(snaps);
1085 g_free(snaps);
1087 done:
1088 *psn_tab = sn_tab;
1089 return snap_count;
#ifdef LIBRBD_SUPPORTS_DISCARD
/* Submit an asynchronous discard of @bytes bytes starting at @offset. */
static BlockAIOCB *qemu_rbd_aio_pdiscard(BlockDriverState *bs,
                                         int64_t offset,
                                         int bytes,
                                         BlockCompletionFunc *cb,
                                         void *opaque)
{
    QEMUIOVector *no_payload = NULL;

    return rbd_start_aio(bs, offset, no_payload, bytes, cb, opaque,
                         RBD_AIO_DISCARD);
}
#endif
#ifdef LIBRBD_SUPPORTS_INVALIDATE
/* Invalidate librbd's cache for the image; failures are stored in @errp. */
static void coroutine_fn qemu_rbd_co_invalidate_cache(BlockDriverState *bs,
                                                      Error **errp)
{
    BDRVRBDState *state = bs->opaque;
    int rc;

    rc = rbd_invalidate_cache(state->image);
    if (rc >= 0) {
        return;
    }
    error_setg_errno(errp, -rc, "Failed to invalidate the cache");
}
#endif
1116 static QemuOptsList qemu_rbd_create_opts = {
1117 .name = "rbd-create-opts",
1118 .head = QTAILQ_HEAD_INITIALIZER(qemu_rbd_create_opts.head),
1119 .desc = {
1121 .name = BLOCK_OPT_SIZE,
1122 .type = QEMU_OPT_SIZE,
1123 .help = "Virtual disk size"
1126 .name = BLOCK_OPT_CLUSTER_SIZE,
1127 .type = QEMU_OPT_SIZE,
1128 .help = "RBD object size"
1131 .name = "password-secret",
1132 .type = QEMU_OPT_STRING,
1133 .help = "ID of secret providing the password",
1135 { /* end of list */ }
1139 static BlockDriver bdrv_rbd = {
1140 .format_name = "rbd",
1141 .instance_size = sizeof(BDRVRBDState),
1142 .bdrv_parse_filename = qemu_rbd_parse_filename,
1143 .bdrv_file_open = qemu_rbd_open,
1144 .bdrv_close = qemu_rbd_close,
1145 .bdrv_reopen_prepare = qemu_rbd_reopen_prepare,
1146 .bdrv_co_create_opts = qemu_rbd_co_create_opts,
1147 .bdrv_has_zero_init = bdrv_has_zero_init_1,
1148 .bdrv_get_info = qemu_rbd_getinfo,
1149 .create_opts = &qemu_rbd_create_opts,
1150 .bdrv_getlength = qemu_rbd_getlength,
1151 .bdrv_truncate = qemu_rbd_truncate,
1152 .protocol_name = "rbd",
1154 .bdrv_aio_readv = qemu_rbd_aio_readv,
1155 .bdrv_aio_writev = qemu_rbd_aio_writev,
1157 #ifdef LIBRBD_SUPPORTS_AIO_FLUSH
1158 .bdrv_aio_flush = qemu_rbd_aio_flush,
1159 #else
1160 .bdrv_co_flush_to_disk = qemu_rbd_co_flush,
1161 #endif
1163 #ifdef LIBRBD_SUPPORTS_DISCARD
1164 .bdrv_aio_pdiscard = qemu_rbd_aio_pdiscard,
1165 #endif
1167 .bdrv_snapshot_create = qemu_rbd_snap_create,
1168 .bdrv_snapshot_delete = qemu_rbd_snap_remove,
1169 .bdrv_snapshot_list = qemu_rbd_snap_list,
1170 .bdrv_snapshot_goto = qemu_rbd_snap_rollback,
1171 #ifdef LIBRBD_SUPPORTS_INVALIDATE
1172 .bdrv_co_invalidate_cache = qemu_rbd_co_invalidate_cache,
1173 #endif
1176 static void bdrv_rbd_init(void)
1178 bdrv_register(&bdrv_rbd);
1181 block_init(bdrv_rbd_init);