/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif
struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif
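
/*
 * Illustrative examples (not from the original source): "d:" is a bare
 * drive name and matches is_windows_drive(); "d:\disk.img" matches only
 * is_windows_drive_prefix(); device paths such as "\\.\PhysicalDrive0"
 * and "//./CdRom0" also match is_windows_drive().
 */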
/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* This function drains all throttled I/O requests */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O request wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: whether the I/O is a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already queued,
     * throttle this I/O as well */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
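
/*
 * Illustrative sketch of the intercept flow (editor's example): a
 * throttled request path accounts for its I/O before submitting it,
 * roughly as bdrv_co_do_preadv()/bdrv_co_do_pwritev() (declared above)
 * do:
 *
 *     if (bs->io_limits_enabled) {
 *         bdrv_io_limits_intercept(bs, bytes, is_write);
 *     }
 *     ...issue the request to the driver...
 */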
size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
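
/*
 * Example (illustrative, paths hypothetical): with base_path
 * "/images/base.qcow2" and filename "snap.qcow2", dest becomes
 * "/images/snap.qcow2". An absolute filename such as "/tmp/other.raw"
 * is copied unchanged, and a protocol-prefixed base such as
 * "nbd:exportname" keeps the "nbd:" part in front of filename.
 */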
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}
static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
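
/*
 * Illustrative usage sketch (mirrors bdrv_append_temp_snapshot() further
 * down in this file); the "overlay.qcow2" name and 1 MiB size are
 * hypothetical:
 *
 *     QemuOpts *opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
 *                                       &error_abort);
 *     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 1 * 1024 * 1024);
 *     ret = bdrv_create(&bdrv_qcow2, "overlay.qcow2", opts, &local_err);
 *     qemu_opts_del(opts);
 */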
int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
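
/*
 * Illustrative usage (as in bdrv_append_temp_snapshot() below): the
 * caller provides the buffer and its size, and unlinks the file when
 * it is no longer needed:
 *
 *     char *tmp_filename = g_malloc0(PATH_MAX + 1);
 *     int ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
 */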
/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}
BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return &bdrv_file;
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
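
/*
 * Example (illustrative): for "nbd:localhost:10809" with
 * allow_protocol_prefix set, the loop selects the driver whose
 * protocol_name matches "nbd"; a plain path such as "/tmp/disk.img" has
 * no "<protocol>:" prefix and falls through to &bdrv_file.
 */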
static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        *pdrv = &bdrv_raw;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}
/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
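
/*
 * Example (illustrative): bdrv_parse_cache_flags("none", &flags) sets
 * BDRV_O_NOCACHE | BDRV_O_CACHE_WB (bypass the host page cache, keep a
 * writeback guest cache), "directsync" sets only BDRV_O_NOCACHE, and
 * "unsafe" adds BDRV_O_NO_FLUSH on top of writeback.
 */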
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}
static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}
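
/*
 * Example (illustrative): the pseudo-protocol filename
 *
 *     json:{"driver": "qcow2", "file": {"driver": "file",
 *                                       "filename": "disk.qcow2"}}
 *
 * parses into a nested QDict that qdict_flatten() reduces to the
 * entries "driver", "file.driver" and "file.filename".
 */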
/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                       "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}
void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}
/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling
 * bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}
/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options
 * QDict. That QDict has to be flattened; therefore, if the BlockdevRef is a
 * QDict itself, all options starting with "${bdref_key}." are considered part
 * of the BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
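
/*
 * Example (illustrative, names hypothetical): with bdref_key "file", the
 * flattened caller options
 *
 *     { "file.driver": "file", "file.filename": "disk.img" }
 *
 * are extracted into the image's own QDict { "driver": "file",
 * "filename": "disk.img" } before bdrv_open() is called; alternatively,
 * a plain string value such as { "file": "node0" } references an
 * existing node by name.
 */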
int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, &bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device
 * which should be opened. If specified, neither options nor a filename may be
 * given, nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }
    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }
    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags. All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
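
/*
 * Illustrative usage sketch (this is exactly what bdrv_reopen() below
 * does for a single device):
 *
 *     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, new_flags);
 *     ret = bdrv_reopen_multiple(queue, &local_err);
 *
 * Note that bdrv_reopen_multiple() frees the queue and its entries in
 * all cases, so the queue must not be reused afterwards.
 */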
/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error. On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call abort() or
 * commit() for any other BDS that have been left in a prepare() state.
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  bdrv_get_device_name(reopen_state->bs));
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, bdrv_get_device_name(reopen_state->bs),
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}
/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs, NULL);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}
void bdrv_close(BlockDriverState *bs)
{
    BdrvAioNotifier *ban, *ban_next;

    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            BlockDriverState *backing_hd = bs->backing_hd;
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;
        QDECREF(bs->full_open_options);
        bs->full_open_options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    if (bs->blk) {
        blk_dev_change_media_cb(bs->blk, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
    QLIST_INIT(&bs->aio_notifiers);
}
void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_close(bs);
        aio_context_release(aio_context);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

static bool bdrv_drain_one(BlockDriverState *bs)
{
    bool bs_busy;

    bdrv_flush_io_queue(bs);
    bdrv_start_throttled_reqs(bs);
    bs_busy = bdrv_requests_pending(bs);
    bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
    return bs_busy;
}
/*
 * Wait for pending requests to complete on a single BlockDriverState subtree
 *
 * See the warning in bdrv_drain_all(). This function can only be called if
 * you are sure nothing can generate I/O because you have op blockers
 * installed.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void bdrv_drain(BlockDriverState *bs)
{
    while (bdrv_drain_one(bs)) {
        /* Keep iterating */
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        busy = false;

        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            AioContext *aio_context = bdrv_get_aio_context(bs);

            aio_context_acquire(aio_context);
            busy |= bdrv_drain_one(bs);
            aio_context_release(aio_context);
        }
    }
}
/* make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists.
 * Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    /*
     * Take care to remove bs from bdrv_states only when it's actually
     * in it.  Note that bs->device_list.tqe_prev is initially null,
     * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
     * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
     * resetting it to null on remove.
     */
    if (bs->device_list.tqe_prev) {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
        bs->device_list.tqe_prev = NULL;
    }
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */

    /* dev info */
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    bs_dest->device_list = bs_src->device_list;
    bs_dest->blk = bs_src->blk;

    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
           sizeof(bs_dest->op_blockers));
}
/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* The code needs to swap the node_name but simply swapping node_list won't
     * work so first remove the nodes from the graph list, do the swap then
     * insert them back if needed.
     */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
    }

    /* bs_new must be unattached and shouldn't have anything fancy enabled */
    assert(!bs_new->blk);
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new must remain unattached */
    assert(!bs_new->blk);

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* insert the nodes back into the graph node list if needed */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
    }

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bdrv_set_backing_hd(bs_top, bs_new);
}
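
/*
 * Example (illustrative): after bdrv_append(overlay, active), the
 * device-attached BDS presents the overlay's contents with the old
 * active layer as its backing file; bdrv_append_temp_snapshot() above
 * uses exactly this to install a temporary qcow2 overlay.
 */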
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->job);
    assert(bdrv_op_blocker_is_empty(bs));
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of
 * the check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv == NULL) {
        return -ENOMEDIUM;
    }
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}
#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
        bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;

    /* qemu_try_blockalign() for bs will choose an alignment that works for
     * bs->backing_hd as well, so no need to compare the alignment manually. */
    buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
    if (buf == NULL) {
        ret = -ENOMEM;
        goto ro_cleanup;
    }

    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    qemu_vfree(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                aio_context_release(aio_context);
                return ret;
            }
        }
        aio_context_release(aio_context);
    }
    return 0;
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}
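
/* mark_request_serialising() below widens a request's overlap range to
 * 'align' boundaries and flags it so overlapping requests must wait on it.
 * Worked example (illustrative numbers): with align = 4096, a request at
 * offset = 5120, bytes = 512 is widened to overlap_offset = 4096 and
 * overlap_bytes = 4096, i.e. the whole 4 KiB block containing it, so any
 * request touching that block is serialised against this one. */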
static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
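
/* Worked example for bdrv_round_to_clusters() below (illustrative numbers):
 * with a 64 KiB cluster size, c = 128 sectors.  A request at sector_num = 130,
 * nb_sectors = 10 is rounded down to cluster_sector_num = 128 and up to
 * cluster_nb_sectors = 128, covering exactly the one cluster it touches. */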
/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}
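
/* wait_serialising_requests() below blocks the calling coroutine until no
 * overlapping serialising request is in flight.  The scan restarts from the
 * head of the list after every wait because the list may have changed while
 * we slept; the waiting_for check breaks would-be cycles where two requests
 * would otherwise end up waiting on each other. */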
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}
/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 *
 * Returns the bottommost base image if bs == NULL.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    while (active && bs != active->backing_hd) {
        active = active->backing_hd;
    }

    return active;
}

/* Given a BDS, searches for the base layer. */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    return bdrv_find_overlay(bs, NULL);
}

typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;
/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * If backing_file_str is non-NULL, it will be used when modifying top's
 * overlay image metadata.
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base, const char *backing_file_str)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_new0(BlkIntermediateStates, 1);
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
    ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    bdrv_set_backing_hd(new_top_bs, base_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        bdrv_set_backing_hd(intermediate_state->bs, NULL);
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (size > INT_MAX) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}
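
/* bdrv_check_request() below bounds nb_sectors before converting to bytes:
 * without the INT_MAX / BDRV_SECTOR_SIZE cap, nb_sectors * BDRV_SECTOR_SIZE
 * could overflow the int-typed multiplication; e.g. nb_sectors = 0x700000
 * times 512 already exceeds INT_MAX. */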
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
        return -EIO;
    }

    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
                                      rwco->qiov->size, rwco->qiov,
                                      rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
                                       rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
    }
}
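
/* bdrv_prwv_co() below emulates a synchronous request on top of the
 * coroutine-based I/O path: rwco.ret starts as the NOT_DONE sentinel, and if
 * we are not already in a coroutine we spawn one and keep polling the node's
 * AioContext until the coroutine has overwritten ret with the real result. */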
/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }
    return rwco.ret;
}
/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}
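
/* bdrv_make_zero() below walks the whole device in chunks of at most
 * INT_MAX / BDRV_SECTOR_SIZE sectors, querying the block status first so
 * that ranges which already read as zeroes are skipped instead of being
 * rewritten. */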
/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
    int n;

    target_sectors = bdrv_nb_sectors(bs);
    if (target_sectors < 0) {
        return target_sectors;
    }

    for (;;) {
        nb_sectors = target_sectors - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
            nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}
int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };
    int ret;

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return bytes;
}

int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
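
/* bdrv_aligned_preadv() below distinguishes three cases: a copy-on-read
 * request that still has unallocated sectors is bounced through
 * bdrv_co_do_copy_on_readv(); a read entirely below EOF goes straight to the
 * driver; and for growable nodes a read crossing EOF is truncated before it
 * reaches the driver, with the remainder of the qiov memset to zeroes. */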
/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    wait_serialising_requests(req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!(bs->zero_beyond_eof && bs->growable)) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF of growable BDSes */
        int64_t total_sectors, max_nb_sectors;

        total_sectors = bdrv_nb_sectors(bs);
        if (total_sectors < 0) {
            ret = total_sectors;
            goto out;
        }

        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                  align >> BDRV_SECTOR_BITS);
        if (max_nb_sectors > 0) {
            QEMUIOVector local_qiov;
            size_t local_sectors;
            /* Clamp so that local_sectors * BDRV_SECTOR_SIZE below cannot
             * overflow size_t (dividing by BDRV_SECTOR_BITS here would not
             * match the multiplication it is meant to guard). */
            max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_SIZE);
            local_sectors = MIN(max_nb_sectors, nb_sectors);

            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, 0,
                              local_sectors * BDRV_SECTOR_SIZE);

            ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
                                     &local_qiov);

            qemu_iovec_destroy(&local_qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    return ret;
}
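
/* Worked example for the qiov padding in bdrv_co_do_preadv() below
 * (illustrative numbers): with align = 4096, a guest read at offset = 5120,
 * bytes = 1024 gains a 1024-byte head pad and a 2048-byte tail pad, turning
 * into an aligned read at offset = 4096, bytes = 4096 whose local_qiov is
 * [head 1024][guest 1024][tail 2048]. */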
/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, false);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_WRITE_ZEROES_DEFAULT 32768
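
/* Worked example for the alignment loop in bdrv_co_do_write_zeroes() below
 * (illustrative numbers): with write_zeroes_alignment = 128 sectors, a
 * request at sector_num = 100, nb_sectors = 300 is issued as three pieces:
 * an unaligned head of 28 sectors up to sector 128, an aligned middle of
 * 256 sectors, and an unaligned tail of 16 sectors. */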
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector. */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_write_zeroes) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

fail:
    qemu_vfree(iov.iov_base);
    return ret;
}
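
/* bdrv_aligned_pwritev() below gives the before-write notifiers a chance to
 * fail the request first; if detect-zeroes is enabled it upgrades an
 * all-zero payload to a write_zeroes operation (optionally allowing unmap),
 * and in writethrough modes it follows the write with a flush. */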
/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);

    if (bs->growable && ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}
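
/* bdrv_co_do_pwritev() below implements the read-modify-write cycle for
 * unaligned writes: the unaligned head and/or tail block is first read into
 * a bounce buffer, the request is marked serialising so no other writer can
 * touch the padded region concurrently, and the padded, now aligned request
 * is handed to bdrv_aligned_pwritev(). */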
/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, true);

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
                                  align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);

    return ret;
}
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);

    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;

    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        if (bs->blk) {
            blk_dev_resize_cb(bs->blk);
        }
    }
    return ret;
}
/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}
/**
 * Return number of sectors on success, -errno on error.
 */
int64_t bdrv_nb_sectors(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return -ENOMEDIUM;

    if (drv->has_variable_length) {
        int ret = refresh_total_sectors(bs, bs->total_sectors);
        if (ret < 0) {
            return ret;
        }
    }
    return bs->total_sectors;
}

/**
 * Return length in bytes on success, -errno on error.
 * The length is always a multiple of BDRV_SECTOR_SIZE.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    int64_t ret = bdrv_nb_sectors(bs);

    return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t nb_sectors = bdrv_nb_sectors(bs);

    *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ?
               BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BLOCK_ERROR_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BLOCK_ERROR_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BLOCK_ERROR_ACTION_IGNORE;
    default:
        abort();
    }
}
static void send_qmp_error_event(BlockDriverState *bs,
                                 BlockErrorAction action,
                                 bool is_read, int error)
{
    IoOperationType optype;

    optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
    qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
                                   bdrv_iostatus_is_enabled(bs),
                                   error == ENOSPC, strerror(error),
                                   &error_abort);
}

/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);

    if (action == BLOCK_ERROR_ACTION_STOP) {
        /* First set the iostatus, so that "info block" returns an iostatus
         * that matches the events raised so far (an additional error iostatus
         * is fine, but not a lost one).
         */
        bdrv_iostatus_set_err(bs, error);

        /* Then raise the request to stop the VM and the event.
         * qemu_system_vmstop_request_prepare has two effects.  First,
         * it ensures that the STOP event always comes after the
         * BLOCK_IO_ERROR event.  Second, it ensures that even if management
         * can observe the STOP event and do a "cont" before the STOP
         * event is issued, the VM will not stop.  In this case, vm_start()
         * also ensures that the STOP/RESUME pair of events is emitted.
         */
        qemu_system_vmstop_request_prepare();
        send_qmp_error_event(bs, action, is_read, error);
        qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
    } else {
        send_qmp_error_event(bs, action, is_read, error);
    }
}
int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    } else {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    }
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        if (bs->blk) {
            /* call the change callback now, we skipped it on open */
            blk_dev_change_media_cb(bs->blk, true);
        }
    }
    return ret;
}
const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}
static int qsort_strcmp(const void *a, const void *b)
{
    /* qsort() passes pointers to the array elements, which are themselves
     * char pointers, so dereference before comparing the strings. */
    return strcmp(*(const char * const *)a, *(const char * const *)b);
}
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;
    int count = 0;
    int i;
    const char **formats = NULL;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        if (drv->format_name) {
            bool found = false;
            int i = count;
            while (formats && i && !found) {
                found = !strcmp(formats[--i], drv->format_name);
            }

            if (!found) {
                formats = g_renew(const char *, formats, count + 1);
                formats[count++] = drv->format_name;
            }
        }
    }

    qsort(formats, count, sizeof(formats[0]), qsort_strcmp);

    for (i = 0; i < count; i++) {
        it(opaque, formats[i]);
    }

    g_free(formats);
}
/* Find the BlockDriverState attached to the named block backend. */
/* TODO convert callers to blk_by_name(), then remove */
BlockDriverState *bdrv_find(const char *name)
{
    BlockBackend *blk = blk_by_name(name);

    return blk ? blk_bs(blk) : NULL;
}

/* Find a named node in the BDS graph. */
BlockDriverState *bdrv_find_node(const char *node_name)
{
    BlockDriverState *bs;

    assert(node_name);

    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        if (!strcmp(node_name, bs->node_name)) {
            return bs;
        }
    }
    return NULL;
}
/* Put this QMP function here so it can access the static graph_bdrv_states. */
BlockDeviceInfoList *bdrv_named_nodes_list(void)
{
    BlockDeviceInfoList *list, *entry;
    BlockDriverState *bs;

    list = NULL;
    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        entry = g_malloc0(sizeof(*entry));
        entry->value = bdrv_block_device_info(bs);
        entry->next = list;
        list = entry;
    }

    return list;
}

BlockDriverState *bdrv_lookup_bs(const char *device,
                                 const char *node_name,
                                 Error **errp)
{
    BlockBackend *blk;
    BlockDriverState *bs;

    if (device) {
        blk = blk_by_name(device);

        if (blk) {
            return blk_bs(blk);
        }
    }

    if (node_name) {
        bs = bdrv_find_node(node_name);

        if (bs) {
            return bs;
        }
    }

    error_setg(errp, "Cannot find device=%s nor node_name=%s",
                     device ? device : "",
                     node_name ? node_name : "");
    return NULL;
}

/* If 'base' is in the same chain as 'top', return true. Otherwise,
 * return false.  If either argument is NULL, return false. */
bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
{
    while (top && top != base) {
        top = top->backing_hd;
    }

    return top != NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, device_list);
}
/* TODO check what callers really want: bs->node_name or blk_name() */
const char *bdrv_get_device_name(const BlockDriverState *bs)
{
    return bs->blk ? blk_name(bs->blk) : "";
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

int bdrv_flush_all(void)
{
    BlockDriverState *bs;
    int result = 0;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        int ret;

        aio_context_acquire(aio_context);
        ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
        aio_context_release(aio_context);
    }

    return result;
}

int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    return 1;
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    /* If BS is a copy on write image, it is initialized to
       the contents of the base image, which may not be zeroes.  */
    if (bs->backing_hd) {
        return 0;
    }
    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    /* safe default */
    return 0;
}

bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.unallocated_blocks_are_zero;
    }

    return false;
}

bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.can_write_zeroes_with_unmap;
    }

    return false;
}

typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int64_t ret;
    bool done;
} BdrvCoGetBlockStatusData;
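
/* bdrv_co_get_block_status() below returns a bitmask: BDRV_BLOCK_DATA and/or
 * BDRV_BLOCK_ZERO describe the contents, BDRV_BLOCK_ALLOCATED says the data
 * comes from this layer rather than a backing file, and when
 * BDRV_BLOCK_OFFSET_VALID is set the host offset is carried in the upper
 * bits of the return value. */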
/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t total_sectors;
    int64_t n;
    int64_t ret, ret2;

    total_sectors = bdrv_nb_sectors(bs);
    if (total_sectors < 0) {
        return total_sectors;
    }

    if (sector_num >= total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    }

    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
            if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        int file_pnum;

        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, &file_pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            if (!file_pnum) {
                /* !file_pnum indicates an offset at or beyond the EOF; it is
                 * perfectly valid for the format block driver to point to such
                 * offsets, so catch it and mark everything as zero */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

    return ret;
}
/* Coroutine wrapper for bdrv_get_block_status() */
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
{
    BdrvCoGetBlockStatusData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
                                         data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_get_block_status().
 *
 * See bdrv_co_get_block_status() for details.
 */
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_co_entry(&data);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            aio_poll(aio_context, true);
        }
    }
    return data.ret;
}

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }
    return !!(ret & BDRV_BLOCK_ALLOCATED);
}
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         * [sector_num+x, nb_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_get_specific_info) {
        return drv->bdrv_get_specific_info(bs);
    }
    return NULL;
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}
4264 /* backing_file can either be relative, or absolute, or a protocol. If it is
4265 * relative, it must be relative to the chain. So, passing in bs->filename
4266 * from a BDS as backing_file should not be done, as that may be relative to
4267 * the CWD rather than the chain. */
4268 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4269 const char *backing_file)
4271 char *filename_full = NULL;
4272 char *backing_file_full = NULL;
4273 char *filename_tmp = NULL;
4274 int is_protocol = 0;
4275 BlockDriverState *curr_bs = NULL;
4276 BlockDriverState *retval = NULL;
4278 if (!bs || !bs->drv || !backing_file) {
4279 return NULL;
4282 filename_full = g_malloc(PATH_MAX);
4283 backing_file_full = g_malloc(PATH_MAX);
4284 filename_tmp = g_malloc(PATH_MAX);
4286 is_protocol = path_has_protocol(backing_file);
4288 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4290 /* If either of the filename paths is actually a protocol, then
4291 * compare unmodified paths; otherwise make paths relative */
4292 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4293 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4294 retval = curr_bs->backing_hd;
4295 break;
4297 } else {
4298 /* If not an absolute filename path, make it relative to the current
4299 * image's filename path */
4300 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4301 backing_file);
4303 /* We are going to compare absolute pathnames */
4304 if (!realpath(filename_tmp, filename_full)) {
4305 continue;
4308 /* We need to make sure the backing filename we are comparing against
4309 * is relative to the current image filename (or absolute) */
4310 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4311 curr_bs->backing_file);
4313 if (!realpath(filename_tmp, backing_file_full)) {
4314 continue;
4317 if (strcmp(backing_file_full, filename_full) == 0) {
4318 retval = curr_bs->backing_hd;
4319 break;
4324 g_free(filename_full);
4325 g_free(backing_file_full);
4326 g_free(filename_tmp);
4327 return retval;
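/* Usage sketch (illustrative; the chain and paths are hypothetical): given
 * /vm/top.qcow2 -> /vm/mid.qcow2 -> /vm/base.qcow2 opened as bs, a relative
 * name is resolved against each image's own location, so both spellings
 * below find the same node (assuming no symlinks complicate realpath()):
 */
static inline BlockDriverState *find_base_sketch(BlockDriverState *bs)
{
    BlockDriverState *by_rel = bdrv_find_backing_image(bs, "base.qcow2");
    BlockDriverState *by_abs = bdrv_find_backing_image(bs, "/vm/base.qcow2");

    assert(by_rel == by_abs);
    return by_rel;
}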
4330 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4332 if (!bs->drv) {
4333 return 0;
4336 if (!bs->backing_hd) {
4337 return 0;
4340 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4343 /**************************************************************/
4344 /* async I/Os */
4346 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4347 QEMUIOVector *qiov, int nb_sectors,
4348 BlockCompletionFunc *cb, void *opaque)
4350 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4352 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4353 cb, opaque, false);
4356 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4357 QEMUIOVector *qiov, int nb_sectors,
4358 BlockCompletionFunc *cb, void *opaque)
4360 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4362 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4363 cb, opaque, true);
4366 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4367 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4368 BlockCompletionFunc *cb, void *opaque)
4370 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4372 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4373 BDRV_REQ_ZERO_WRITE | flags,
4374 cb, opaque, true);
4378 typedef struct MultiwriteCB {
4379 int error;
4380 int num_requests;
4381 int num_callbacks;
4382 struct {
4383 BlockCompletionFunc *cb;
4384 void *opaque;
4385 QEMUIOVector *free_qiov;
4386 } callbacks[];
4387 } MultiwriteCB;
4389 static void multiwrite_user_cb(MultiwriteCB *mcb)
4391 int i;
4393 for (i = 0; i < mcb->num_callbacks; i++) {
4394 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4395 if (mcb->callbacks[i].free_qiov) {
4396 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4398 g_free(mcb->callbacks[i].free_qiov);
4402 static void multiwrite_cb(void *opaque, int ret)
4404 MultiwriteCB *mcb = opaque;
4406 trace_multiwrite_cb(mcb, ret);
4408 if (ret < 0 && !mcb->error) {
4409 mcb->error = ret;
4412 mcb->num_requests--;
4413 if (mcb->num_requests == 0) {
4414 multiwrite_user_cb(mcb);
4415 g_free(mcb);
4419 static int multiwrite_req_compare(const void *a, const void *b)
4421 const BlockRequest *req1 = a, *req2 = b;
4424 * Note that we can't simply subtract req2->sector from req1->sector
4425 * here as that could overflow the return value.
4427 if (req1->sector > req2->sector) {
4428 return 1;
4429 } else if (req1->sector < req2->sector) {
4430 return -1;
4431 } else {
4432 return 0;
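/* Why the explicit comparisons above: a naive "return req1->sector -
 * req2->sector" would truncate the int64_t difference to qsort's int
 * return type.  Illustrative example: with a = 0 and b = 1LL << 32,
 * (int)(a - b) truncates to 0, wrongly reporting "equal" although a < b.
 */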
4437 * Takes a bunch of requests and tries to merge them. Returns the number of
4438 * requests that remain after merging.
4440 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4441 int num_reqs, MultiwriteCB *mcb)
4443 int i, outidx;
4445 // Sort requests by start sector
4446 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4448 // Check if adjacent requests overlap or are exactly sequential. If so,
4449 // combine them. Gaps between requests are never filled (see the assert below).
4450 outidx = 0;
4451 for (i = 1; i < num_reqs; i++) {
4452 int merge = 0;
4453 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4455 // Handle exactly sequential writes and overlapping writes.
4456 if (reqs[i].sector <= oldreq_last) {
4457 merge = 1;
4460 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4461 merge = 0;
4464 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4465 reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4466 merge = 0;
4469 if (merge) {
4470 size_t size;
4471 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4472 qemu_iovec_init(qiov,
4473 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4475 // Add the first request to the merged one. If the requests are
4476 // overlapping, drop the last sectors of the first request.
4477 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4478 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4480 // We shouldn't need to add any zeros between the two requests
4481 assert (reqs[i].sector <= oldreq_last);
4483 // Add the second request
4484 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4486 // Add tail of first request, if necessary
4487 if (qiov->size < reqs[outidx].qiov->size) {
4488 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4489 reqs[outidx].qiov->size - qiov->size);
4492 reqs[outidx].nb_sectors = qiov->size >> 9;
4493 reqs[outidx].qiov = qiov;
4495 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4496 } else {
4497 outidx++;
4498 reqs[outidx].sector = reqs[i].sector;
4499 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4500 reqs[outidx].qiov = reqs[i].qiov;
4504 return outidx + 1;
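/* Worked example (illustrative): two write requests
 *     reqs[0]: sector 0, nb_sectors 8
 *     reqs[1]: sector 4, nb_sectors 8
 * overlap, so multiwrite_merge() combines them into one request covering
 * sectors 0..11: the first 4 sectors ((4 - 0) << 9 bytes) come from
 * reqs[0]'s qiov, all of reqs[1]'s qiov follows, and no tail is appended
 * because reqs[1] already reaches past the end of reqs[0].
 */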
4508 * Submit multiple AIO write requests at once.
4510  * On success, the function returns 0 and all requests in the reqs array have
4511  * been submitted. On error, it returns -1 and some of the requests may or
4512  * may not have been submitted. In particular, the callback will be called
4513  * for some of the requests but not for others. The caller must check the
4514  * error field of each BlockRequest to know which callbacks to wait for
4515  * (if error != 0, no callback will be called for that request).
4517 * The implementation may modify the contents of the reqs array, e.g. to merge
4518 * requests. However, the fields opaque and error are left unmodified as they
4519 * are used to signal failure for a single request to the caller.
4521 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4523 MultiwriteCB *mcb;
4524 int i;
4526 /* don't submit writes if we don't have a medium */
4527 if (bs->drv == NULL) {
4528 for (i = 0; i < num_reqs; i++) {
4529 reqs[i].error = -ENOMEDIUM;
4531 return -1;
4534 if (num_reqs == 0) {
4535 return 0;
4538 // Create MultiwriteCB structure
4539 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4540 mcb->num_requests = 0;
4541 mcb->num_callbacks = num_reqs;
4543 for (i = 0; i < num_reqs; i++) {
4544 mcb->callbacks[i].cb = reqs[i].cb;
4545 mcb->callbacks[i].opaque = reqs[i].opaque;
4548 // Check for mergeable requests
4549 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4551 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4553 /* Run the aio requests. */
4554 mcb->num_requests = num_reqs;
4555 for (i = 0; i < num_reqs; i++) {
4556 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4557 reqs[i].nb_sectors, reqs[i].flags,
4558 multiwrite_cb, mcb,
4559 true);
4562 return 0;
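/* Usage sketch for bdrv_aio_multiwrite() (illustrative only; the callback,
 * sector numbers and result slots are hypothetical):
 */
static void multiwrite_done_sketch(void *opaque, int ret)
{
    /* invoked once per original request with that request's result */
    *(int *)opaque = ret;
}

static int submit_two_writes_sketch(BlockDriverState *bs,
                                    QEMUIOVector *qiov0, QEMUIOVector *qiov1,
                                    int *res0, int *res1)
{
    BlockRequest reqs[2] = {
        { .sector = 0,  .nb_sectors = qiov0->size >> 9, .qiov = qiov0,
          .cb = multiwrite_done_sketch, .opaque = res0 },
        { .sector = 64, .nb_sectors = qiov1->size >> 9, .qiov = qiov1,
          .cb = multiwrite_done_sketch, .opaque = res1 },
    };

    /* on -1, inspect reqs[i].error: requests with error != 0 get no cb */
    return bdrv_aio_multiwrite(bs, reqs, 2);
}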
4565 void bdrv_aio_cancel(BlockAIOCB *acb)
4567 qemu_aio_ref(acb);
4568 bdrv_aio_cancel_async(acb);
4569 while (acb->refcnt > 1) {
4570 if (acb->aiocb_info->get_aio_context) {
4571 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4572 } else if (acb->bs) {
4573 aio_poll(bdrv_get_aio_context(acb->bs), true);
4574 } else {
4575 abort();
4578 qemu_aio_unref(acb);
4581 /* Async version of aio cancel. The caller is not blocked if the acb implements
4582  * cancel_async; otherwise we do nothing and let the request complete normally.
4583  * In either case the completion callback must be called. */
4584 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4586 if (acb->aiocb_info->cancel_async) {
4587 acb->aiocb_info->cancel_async(acb);
4591 /**************************************************************/
4592 /* async block device emulation */
4594 typedef struct BlockAIOCBSync {
4595 BlockAIOCB common;
4596 QEMUBH *bh;
4597 int ret;
4598 /* vector translation state */
4599 QEMUIOVector *qiov;
4600 uint8_t *bounce;
4601 int is_write;
4602 } BlockAIOCBSync;
4604 static const AIOCBInfo bdrv_em_aiocb_info = {
4605 .aiocb_size = sizeof(BlockAIOCBSync),
4608 static void bdrv_aio_bh_cb(void *opaque)
4610 BlockAIOCBSync *acb = opaque;
4612 if (!acb->is_write && acb->ret >= 0) {
4613 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4615 qemu_vfree(acb->bounce);
4616 acb->common.cb(acb->common.opaque, acb->ret);
4617 qemu_bh_delete(acb->bh);
4618 acb->bh = NULL;
4619 qemu_aio_unref(acb);
4622 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4623 int64_t sector_num,
4624 QEMUIOVector *qiov,
4625 int nb_sectors,
4626 BlockCompletionFunc *cb,
4627 void *opaque,
4628 int is_write)
4631 BlockAIOCBSync *acb;
4633 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4634 acb->is_write = is_write;
4635 acb->qiov = qiov;
4636 acb->bounce = qemu_try_blockalign(bs, qiov->size);
4637 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4639 if (acb->bounce == NULL) {
4640 acb->ret = -ENOMEM;
4641 } else if (is_write) {
4642 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4643 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4644 } else {
4645 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4648 qemu_bh_schedule(acb->bh);
4650 return &acb->common;
4653 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4654 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4655 BlockCompletionFunc *cb, void *opaque)
4657 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4660 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4661 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4662 BlockCompletionFunc *cb, void *opaque)
4664 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4668 typedef struct BlockAIOCBCoroutine {
4669 BlockAIOCB common;
4670 BlockRequest req;
4671 bool is_write;
4672 bool *done;
4673 QEMUBH* bh;
4674 } BlockAIOCBCoroutine;
4676 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4677 .aiocb_size = sizeof(BlockAIOCBCoroutine),
4680 static void bdrv_co_em_bh(void *opaque)
4682 BlockAIOCBCoroutine *acb = opaque;
4684 acb->common.cb(acb->common.opaque, acb->req.error);
4686 qemu_bh_delete(acb->bh);
4687 qemu_aio_unref(acb);
4690 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4691 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4693 BlockAIOCBCoroutine *acb = opaque;
4694 BlockDriverState *bs = acb->common.bs;
4696 if (!acb->is_write) {
4697 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4698 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4699 } else {
4700 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4701 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4704 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4705 qemu_bh_schedule(acb->bh);
4708 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4709 int64_t sector_num,
4710 QEMUIOVector *qiov,
4711 int nb_sectors,
4712 BdrvRequestFlags flags,
4713 BlockCompletionFunc *cb,
4714 void *opaque,
4715 bool is_write)
4717 Coroutine *co;
4718 BlockAIOCBCoroutine *acb;
4720 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4721 acb->req.sector = sector_num;
4722 acb->req.nb_sectors = nb_sectors;
4723 acb->req.qiov = qiov;
4724 acb->req.flags = flags;
4725 acb->is_write = is_write;
4727 co = qemu_coroutine_create(bdrv_co_do_rw);
4728 qemu_coroutine_enter(co, acb);
4730 return &acb->common;
4733 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4735 BlockAIOCBCoroutine *acb = opaque;
4736 BlockDriverState *bs = acb->common.bs;
4738 acb->req.error = bdrv_co_flush(bs);
4739 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4740 qemu_bh_schedule(acb->bh);
4743 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4744 BlockCompletionFunc *cb, void *opaque)
4746 trace_bdrv_aio_flush(bs, opaque);
4748 Coroutine *co;
4749 BlockAIOCBCoroutine *acb;
4751 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4753 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4754 qemu_coroutine_enter(co, acb);
4756 return &acb->common;
4759 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4761 BlockAIOCBCoroutine *acb = opaque;
4762 BlockDriverState *bs = acb->common.bs;
4764 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4765 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4766 qemu_bh_schedule(acb->bh);
4769 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4770 int64_t sector_num, int nb_sectors,
4771 BlockCompletionFunc *cb, void *opaque)
4773 Coroutine *co;
4774 BlockAIOCBCoroutine *acb;
4776 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4778 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4779 acb->req.sector = sector_num;
4780 acb->req.nb_sectors = nb_sectors;
4781 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4782 qemu_coroutine_enter(co, acb);
4784 return &acb->common;
4787 void bdrv_init(void)
4789 module_call_init(MODULE_INIT_BLOCK);
4792 void bdrv_init_with_whitelist(void)
4794 use_bdrv_whitelist = 1;
4795 bdrv_init();
4798 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4799 BlockCompletionFunc *cb, void *opaque)
4801 BlockAIOCB *acb;
4803 acb = g_slice_alloc(aiocb_info->aiocb_size);
4804 acb->aiocb_info = aiocb_info;
4805 acb->bs = bs;
4806 acb->cb = cb;
4807 acb->opaque = opaque;
4808 acb->refcnt = 1;
4809 return acb;
4812 void qemu_aio_ref(void *p)
4814 BlockAIOCB *acb = p;
4815 acb->refcnt++;
4818 void qemu_aio_unref(void *p)
4820 BlockAIOCB *acb = p;
4821 assert(acb->refcnt > 0);
4822 if (--acb->refcnt == 0) {
4823 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
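/* Lifecycle sketch for the AIOCB helpers above (illustrative; the struct
 * and AIOCBInfo are hypothetical):
 */
typedef struct SketchAIOCB {
    BlockAIOCB common;          /* must come first: freed via aiocb_size */
    int my_state;
} SketchAIOCB;

static const AIOCBInfo sketch_aiocb_info = {
    .aiocb_size = sizeof(SketchAIOCB),
};

static BlockAIOCB *sketch_aio_start(BlockDriverState *bs,
                                    BlockCompletionFunc *cb, void *opaque)
{
    SketchAIOCB *acb = qemu_aio_get(&sketch_aiocb_info, bs, cb, opaque);

    acb->my_state = 0;
    /* ... start the operation; when it finishes, invoke acb->common.cb()
     * and drop the initial reference with qemu_aio_unref(acb) ... */
    return &acb->common;
}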
4827 /**************************************************************/
4828 /* Coroutine block device emulation */
4830 typedef struct CoroutineIOCompletion {
4831 Coroutine *coroutine;
4832 int ret;
4833 } CoroutineIOCompletion;
4835 static void bdrv_co_io_em_complete(void *opaque, int ret)
4837 CoroutineIOCompletion *co = opaque;
4839 co->ret = ret;
4840 qemu_coroutine_enter(co->coroutine, NULL);
4843 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4844 int nb_sectors, QEMUIOVector *iov,
4845 bool is_write)
4847 CoroutineIOCompletion co = {
4848 .coroutine = qemu_coroutine_self(),
4850 BlockAIOCB *acb;
4852 if (is_write) {
4853 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4854 bdrv_co_io_em_complete, &co);
4855 } else {
4856 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4857 bdrv_co_io_em_complete, &co);
4860 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4861 if (!acb) {
4862 return -EIO;
4864 qemu_coroutine_yield();
4866 return co.ret;
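/* CoroutineIOCompletion plus bdrv_co_io_em_complete() bridge any
 * callback-based AIO function into a coroutine: submit with the completion
 * as the callback, yield, then read the result after being re-entered.
 * Sketch (illustrative) of wrapping a driver's bdrv_aio_flush this way;
 * the same shape appears in bdrv_co_flush() and bdrv_co_discard() below:
 */
static int coroutine_fn sketch_co_flush_via_aio(BlockDriverState *bs)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete,
                                              &co);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();     /* re-entered by bdrv_co_io_em_complete() */
    return co.ret;
}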
4869 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4870 int64_t sector_num, int nb_sectors,
4871 QEMUIOVector *iov)
4873 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4876 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4877 int64_t sector_num, int nb_sectors,
4878 QEMUIOVector *iov)
4880 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4883 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4885 RwCo *rwco = opaque;
4887 rwco->ret = bdrv_co_flush(rwco->bs);
4890 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4892 int ret;
4894 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4895 return 0;
4898 /* Write back cached data to the OS even with cache=unsafe */
4899 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4900 if (bs->drv->bdrv_co_flush_to_os) {
4901 ret = bs->drv->bdrv_co_flush_to_os(bs);
4902 if (ret < 0) {
4903 return ret;
4907 /* But don't actually force it to the disk with cache=unsafe */
4908 if (bs->open_flags & BDRV_O_NO_FLUSH) {
4909 goto flush_parent;
4912 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4913 if (bs->drv->bdrv_co_flush_to_disk) {
4914 ret = bs->drv->bdrv_co_flush_to_disk(bs);
4915 } else if (bs->drv->bdrv_aio_flush) {
4916 BlockAIOCB *acb;
4917 CoroutineIOCompletion co = {
4918 .coroutine = qemu_coroutine_self(),
4921 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4922 if (acb == NULL) {
4923 ret = -EIO;
4924 } else {
4925 qemu_coroutine_yield();
4926 ret = co.ret;
4928 } else {
4930 * Some block drivers always operate in either writethrough or unsafe
4931 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4932 * know how the server works (because the behaviour is hardcoded or
4933 * depends on server-side configuration), so we can't ensure that
4934 * everything is safe on disk. Returning an error doesn't work because
4935 * that would break guests even if the server operates in writethrough
4936 * mode.
4938 * Let's hope the user knows what they're doing.
4940 ret = 0;
4942 if (ret < 0) {
4943 return ret;
4946 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4947 * in the case of cache=unsafe, so there are no useless flushes.
4949 flush_parent:
4950 return bdrv_co_flush(bs->file);
4953 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4955 Error *local_err = NULL;
4956 int ret;
4958 if (!bs->drv) {
4959 return;
4962 if (!(bs->open_flags & BDRV_O_INCOMING)) {
4963 return;
4965 bs->open_flags &= ~BDRV_O_INCOMING;
4967 if (bs->drv->bdrv_invalidate_cache) {
4968 bs->drv->bdrv_invalidate_cache(bs, &local_err);
4969 } else if (bs->file) {
4970 bdrv_invalidate_cache(bs->file, &local_err);
4972 if (local_err) {
4973 error_propagate(errp, local_err);
4974 return;
4977 ret = refresh_total_sectors(bs, bs->total_sectors);
4978 if (ret < 0) {
4979 error_setg_errno(errp, -ret, "Could not refresh total sector count");
4980 return;
4984 void bdrv_invalidate_cache_all(Error **errp)
4986 BlockDriverState *bs;
4987 Error *local_err = NULL;
4989 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4990 AioContext *aio_context = bdrv_get_aio_context(bs);
4992 aio_context_acquire(aio_context);
4993 bdrv_invalidate_cache(bs, &local_err);
4994 aio_context_release(aio_context);
4995 if (local_err) {
4996 error_propagate(errp, local_err);
4997 return;
5002 int bdrv_flush(BlockDriverState *bs)
5004 Coroutine *co;
5005 RwCo rwco = {
5006 .bs = bs,
5007 .ret = NOT_DONE,
5010 if (qemu_in_coroutine()) {
5011 /* Fast-path if already in coroutine context */
5012 bdrv_flush_co_entry(&rwco);
5013 } else {
5014 AioContext *aio_context = bdrv_get_aio_context(bs);
5016 co = qemu_coroutine_create(bdrv_flush_co_entry);
5017 qemu_coroutine_enter(co, &rwco);
5018 while (rwco.ret == NOT_DONE) {
5019 aio_poll(aio_context, true);
5023 return rwco.ret;
5026 typedef struct DiscardCo {
5027 BlockDriverState *bs;
5028 int64_t sector_num;
5029 int nb_sectors;
5030 int ret;
5031 } DiscardCo;
5032 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5034 DiscardCo *rwco = opaque;
5036 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5039 /* If no limit is specified in the BlockLimits, use a default
5040  * of 32768 512-byte sectors (16 MiB) per request.
5042 #define MAX_DISCARD_DEFAULT 32768
5044 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5045 int nb_sectors)
5047 int max_discard;
5049 if (!bs->drv) {
5050 return -ENOMEDIUM;
5051 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5052 return -EIO;
5053 } else if (bs->read_only) {
5054 return -EROFS;
5057 bdrv_reset_dirty(bs, sector_num, nb_sectors);
5059 /* Do nothing if disabled. */
5060 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5061 return 0;
5064 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5065 return 0;
5068 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5069 while (nb_sectors > 0) {
5070 int ret;
5071 int num = nb_sectors;
5073 /* align request */
5074 if (bs->bl.discard_alignment &&
5075 num >= bs->bl.discard_alignment &&
5076 sector_num % bs->bl.discard_alignment) {
5077 if (num > bs->bl.discard_alignment) {
5078 num = bs->bl.discard_alignment;
5080 num -= sector_num % bs->bl.discard_alignment;
5083 /* limit request size */
5084 if (num > max_discard) {
5085 num = max_discard;
5088 if (bs->drv->bdrv_co_discard) {
5089 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5090 } else {
5091 BlockAIOCB *acb;
5092 CoroutineIOCompletion co = {
5093 .coroutine = qemu_coroutine_self(),
5096 acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5097 bdrv_co_io_em_complete, &co);
5098 if (acb == NULL) {
5099 return -EIO;
5100 } else {
5101 qemu_coroutine_yield();
5102 ret = co.ret;
5105 if (ret && ret != -ENOTSUP) {
5106 return ret;
5109 sector_num += num;
5110 nb_sectors -= num;
5112 return 0;
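/* Worked example of the alignment clamp above (illustrative): with
 * discard_alignment == 8 and a request for sectors 10..41 (sector_num 10,
 * nb_sectors 32), the first iteration clips num to 8 - (10 % 8) == 6, so
 * the next iteration starts at sector 16, a multiple of the alignment;
 * the remaining 26 sectors then proceed in aligned chunks of at most
 * max_discard sectors each.
 */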
5115 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5117 Coroutine *co;
5118 DiscardCo rwco = {
5119 .bs = bs,
5120 .sector_num = sector_num,
5121 .nb_sectors = nb_sectors,
5122 .ret = NOT_DONE,
5125 if (qemu_in_coroutine()) {
5126 /* Fast-path if already in coroutine context */
5127 bdrv_discard_co_entry(&rwco);
5128 } else {
5129 AioContext *aio_context = bdrv_get_aio_context(bs);
5131 co = qemu_coroutine_create(bdrv_discard_co_entry);
5132 qemu_coroutine_enter(co, &rwco);
5133 while (rwco.ret == NOT_DONE) {
5134 aio_poll(aio_context, true);
5138 return rwco.ret;
5141 /**************************************************************/
5142 /* removable device support */
5145 * Return TRUE if the media is present
5147 int bdrv_is_inserted(BlockDriverState *bs)
5149 BlockDriver *drv = bs->drv;
5151 if (!drv)
5152 return 0;
5153 if (!drv->bdrv_is_inserted)
5154 return 1;
5155 return drv->bdrv_is_inserted(bs);
5159 * Return whether the media changed since the last call to this
5160 * function, or -ENOTSUP if we don't know. Most drivers don't know.
5162 int bdrv_media_changed(BlockDriverState *bs)
5164 BlockDriver *drv = bs->drv;
5166 if (drv && drv->bdrv_media_changed) {
5167 return drv->bdrv_media_changed(bs);
5169 return -ENOTSUP;
5173 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5175 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5177 BlockDriver *drv = bs->drv;
5178 const char *device_name;
5180 if (drv && drv->bdrv_eject) {
5181 drv->bdrv_eject(bs, eject_flag);
5184 device_name = bdrv_get_device_name(bs);
5185 if (device_name[0] != '\0') {
5186 qapi_event_send_device_tray_moved(device_name,
5187 eject_flag, &error_abort);
5192 * Lock or unlock the media (if it is locked, the user won't be able
5193 * to eject it manually).
5195 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5197 BlockDriver *drv = bs->drv;
5199 trace_bdrv_lock_medium(bs, locked);
5201 if (drv && drv->bdrv_lock_medium) {
5202 drv->bdrv_lock_medium(bs, locked);
5206 /* needed for generic scsi interface */
5208 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5210 BlockDriver *drv = bs->drv;
5212 if (drv && drv->bdrv_ioctl)
5213 return drv->bdrv_ioctl(bs, req, buf);
5214 return -ENOTSUP;
5217 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5218 unsigned long int req, void *buf,
5219 BlockCompletionFunc *cb, void *opaque)
5221 BlockDriver *drv = bs->drv;
5223 if (drv && drv->bdrv_aio_ioctl)
5224 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5225 return NULL;
5228 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5230 bs->guest_block_size = align;
5233 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5235 return qemu_memalign(bdrv_opt_mem_align(bs), size);
5238 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5240 return memset(qemu_blockalign(bs, size), 0, size);
5243 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5245 size_t align = bdrv_opt_mem_align(bs);
5247 /* Ensure that NULL is never returned on success */
5248 assert(align > 0);
5249 if (size == 0) {
5250 size = align;
5253 return qemu_try_memalign(align, size);
5256 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5258 void *mem = qemu_try_blockalign(bs, size);
5260 if (mem) {
5261 memset(mem, 0, size);
5264 return mem;
5268 * Check if all memory in this vector is sector aligned.
5270 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5272 int i;
5273 size_t alignment = bdrv_opt_mem_align(bs);
5275 for (i = 0; i < qiov->niov; i++) {
5276 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5277 return false;
5279 if (qiov->iov[i].iov_len % alignment) {
5280 return false;
5284 return true;
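/* Sketch (illustrative; len is assumed to be a multiple of the memory
 * alignment): buffers from qemu_blockalign() satisfy the base-address
 * check above, so the qiov is aligned whenever the lengths are too:
 */
static bool sketch_build_aligned_qiov(BlockDriverState *bs,
                                      QEMUIOVector *qiov, size_t len)
{
    void *buf = qemu_blockalign(bs, len);   /* caller must qemu_vfree() */

    qemu_iovec_init(qiov, 1);
    qemu_iovec_add(qiov, buf, len);
    /* true iff len is a multiple of bdrv_opt_mem_align(bs) */
    return bdrv_qiov_is_aligned(bs, qiov);
}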
5287 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5288 Error **errp)
5290 int64_t bitmap_size;
5291 BdrvDirtyBitmap *bitmap;
5293 assert((granularity & (granularity - 1)) == 0);
5295 granularity >>= BDRV_SECTOR_BITS;
5296 assert(granularity);
5297 bitmap_size = bdrv_nb_sectors(bs);
5298 if (bitmap_size < 0) {
5299 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5300 errno = -bitmap_size;
5301 return NULL;
5303 bitmap = g_new0(BdrvDirtyBitmap, 1);
5304 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5305 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5306 return bitmap;
5309 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5311 BdrvDirtyBitmap *bm, *next;
5312 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5313 if (bm == bitmap) {
5314 QLIST_REMOVE(bitmap, list);
5315 hbitmap_free(bitmap->bitmap);
5316 g_free(bitmap);
5317 return;
5322 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5324 BdrvDirtyBitmap *bm;
5325 BlockDirtyInfoList *list = NULL;
5326 BlockDirtyInfoList **plist = &list;
5328 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5329 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5330 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5331 info->count = bdrv_get_dirty_count(bs, bm);
5332 info->granularity =
5333 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5334 entry->value = info;
5335 *plist = entry;
5336 plist = &entry->next;
5339 return list;
5342 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5344 if (bitmap) {
5345 return hbitmap_get(bitmap->bitmap, sector);
5346 } else {
5347 return 0;
5351 void bdrv_dirty_iter_init(BlockDriverState *bs,
5352 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5354 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5357 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5358 int nr_sectors)
5360 BdrvDirtyBitmap *bitmap;
5361 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5362 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5366 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5368 BdrvDirtyBitmap *bitmap;
5369 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5370 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5374 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5376 return hbitmap_count(bitmap->bitmap);
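/* Usage sketch for the dirty bitmap API above (illustrative; the
 * granularity and sector range are hypothetical):
 */
static int64_t sketch_count_dirty(BlockDriverState *bs, Error **errp)
{
    HBitmapIter hbi;
    int64_t sector, count;
    BdrvDirtyBitmap *bitmap = bdrv_create_dirty_bitmap(bs, 65536, errp);

    if (!bitmap) {
        return -1;
    }
    bdrv_set_dirty(bs, 0, 128);                 /* sectors 0..127 dirty */
    bdrv_dirty_iter_init(bs, bitmap, &hbi);
    while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
        /* visit each dirty position at the bitmap's granularity */
    }
    count = bdrv_get_dirty_count(bs, bitmap);
    bdrv_release_dirty_bitmap(bs, bitmap);
    return count;
}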
5379 /* Get a reference to bs */
5380 void bdrv_ref(BlockDriverState *bs)
5382 bs->refcnt++;
5385 /* Release a previously grabbed reference to bs.
5386 * If after releasing, reference count is zero, the BlockDriverState is
5387 * deleted. */
5388 void bdrv_unref(BlockDriverState *bs)
5390 if (!bs) {
5391 return;
5393 assert(bs->refcnt > 0);
5394 if (--bs->refcnt == 0) {
5395 bdrv_delete(bs);
5399 struct BdrvOpBlocker {
5400 Error *reason;
5401 QLIST_ENTRY(BdrvOpBlocker) list;
5404 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5406 BdrvOpBlocker *blocker;
5407 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5408 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5409 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5410 if (errp) {
5411 error_setg(errp, "Device '%s' is busy: %s",
5412 bdrv_get_device_name(bs),
5413 error_get_pretty(blocker->reason));
5415 return true;
5417 return false;
5420 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5422 BdrvOpBlocker *blocker;
5423 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5425 blocker = g_new0(BdrvOpBlocker, 1);
5426 blocker->reason = reason;
5427 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5430 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5432 BdrvOpBlocker *blocker, *next;
5433 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5434 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5435 if (blocker->reason == reason) {
5436 QLIST_REMOVE(blocker, list);
5437 g_free(blocker);
5442 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5444 int i;
5445 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5446 bdrv_op_block(bs, i, reason);
5450 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5452 int i;
5453 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5454 bdrv_op_unblock(bs, i, reason);
5458 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5460 int i;
5462 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5463 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5464 return false;
5467 return true;
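/* Usage sketch for op blockers (illustrative; the reason text is made up):
 */
static void sketch_block_resize(BlockDriverState *bs)
{
    Error *reason = NULL;

    error_setg(&reason, "node is in use by a hypothetical job");
    bdrv_op_block(bs, BLOCK_OP_TYPE_RESIZE, reason);

    /* elsewhere, this now fails with "Device '...' is busy: ..." */
    assert(bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_RESIZE, NULL));

    /* the same Error pointer identifies the blocker for removal */
    bdrv_op_unblock(bs, BLOCK_OP_TYPE_RESIZE, reason);
    error_free(reason);
}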
5470 void bdrv_iostatus_enable(BlockDriverState *bs)
5472 bs->iostatus_enabled = true;
5473 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5476 /* The I/O status is only enabled if the drive explicitly
5477 * enables it _and_ the VM is configured to stop on errors */
5478 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5480 return (bs->iostatus_enabled &&
5481 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5482 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5483 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5486 void bdrv_iostatus_disable(BlockDriverState *bs)
5488 bs->iostatus_enabled = false;
5491 void bdrv_iostatus_reset(BlockDriverState *bs)
5493 if (bdrv_iostatus_is_enabled(bs)) {
5494 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5495 if (bs->job) {
5496 block_job_iostatus_reset(bs->job);
5501 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5503 assert(bdrv_iostatus_is_enabled(bs));
5504 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5505 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5506 BLOCK_DEVICE_IO_STATUS_FAILED;
5510 void bdrv_img_create(const char *filename, const char *fmt,
5511 const char *base_filename, const char *base_fmt,
5512 char *options, uint64_t img_size, int flags,
5513 Error **errp, bool quiet)
5515 QemuOptsList *create_opts = NULL;
5516 QemuOpts *opts = NULL;
5517 const char *backing_fmt, *backing_file;
5518 int64_t size;
5519 BlockDriver *drv, *proto_drv;
5520 BlockDriver *backing_drv = NULL;
5521 Error *local_err = NULL;
5522 int ret = 0;
5524 /* Find driver and parse its options */
5525 drv = bdrv_find_format(fmt);
5526 if (!drv) {
5527 error_setg(errp, "Unknown file format '%s'", fmt);
5528 return;
5531 proto_drv = bdrv_find_protocol(filename, true);
5532 if (!proto_drv) {
5533 error_setg(errp, "Unknown protocol '%s'", filename);
5534 return;
5537 if (!drv->create_opts) {
5538 error_setg(errp, "Format driver '%s' does not support image creation",
5539 drv->format_name);
5540 return;
5543 if (!proto_drv->create_opts) {
5544 error_setg(errp, "Protocol driver '%s' does not support image creation",
5545 proto_drv->format_name);
5546 return;
5549 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5550 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5552 /* Create parameter list with default values */
5553 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5554 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5556 /* Parse -o options */
5557 if (options) {
5558 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5559 error_setg(errp, "Invalid options for file format '%s'", fmt);
5560 goto out;
5564 if (base_filename) {
5565 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5566 error_setg(errp, "Backing file not supported for file format '%s'",
5567 fmt);
5568 goto out;
5572 if (base_fmt) {
5573 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5574 error_setg(errp, "Backing file format not supported for file "
5575 "format '%s'", fmt);
5576 goto out;
5580 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5581 if (backing_file) {
5582 if (!strcmp(filename, backing_file)) {
5583 error_setg(errp, "Error: Trying to create an image with the "
5584 "same filename as the backing file");
5585 goto out;
5589 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5590 if (backing_fmt) {
5591 backing_drv = bdrv_find_format(backing_fmt);
5592 if (!backing_drv) {
5593 error_setg(errp, "Unknown backing file format '%s'",
5594 backing_fmt);
5595 goto out;
5599 // The size for the image must always be specified, with one exception:
5600 // If we are using a backing file, we can obtain the size from there
5601 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5602 if (size == -1) {
5603 if (backing_file) {
5604 BlockDriverState *bs;
5605 int64_t size;
5606 int back_flags;
5608 /* backing files always opened read-only */
5609 back_flags =
5610 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5612 bs = NULL;
5613 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5614 backing_drv, &local_err);
5615 if (ret < 0) {
5616 goto out;
5618 size = bdrv_getlength(bs);
5619 if (size < 0) {
5620 error_setg_errno(errp, -size, "Could not get size of '%s'",
5621 backing_file);
5622 bdrv_unref(bs);
5623 goto out;
5626 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
5628 bdrv_unref(bs);
5629 } else {
5630 error_setg(errp, "Image creation needs a size parameter");
5631 goto out;
5635 if (!quiet) {
5636 printf("Formatting '%s', fmt=%s ", filename, fmt);
5637 qemu_opts_print(opts);
5638 puts("");
5641 ret = bdrv_create(drv, filename, opts, &local_err);
5643 if (ret == -EFBIG) {
5644 /* This is generally a better message than whatever the driver would
5645 * deliver (especially because of the cluster_size_hint), since that
5646 * is most probably not much different from "image too large". */
5647 const char *cluster_size_hint = "";
5648 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5649 cluster_size_hint = " (try using a larger cluster size)";
5651 error_setg(errp, "The image size is too large for file format '%s'"
5652 "%s", fmt, cluster_size_hint);
5653 error_free(local_err);
5654 local_err = NULL;
5657 out:
5658 qemu_opts_del(opts);
5659 qemu_opts_free(create_opts);
5660 if (local_err) {
5661 error_propagate(errp, local_err);
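/* Usage sketch for bdrv_img_create() (illustrative; the path is made up):
 */
static void sketch_create_image(Error **errp)
{
    /* 64 MiB qcow2 image, no backing file, default options, quiet */
    bdrv_img_create("/tmp/sketch.qcow2", "qcow2", NULL, NULL, NULL,
                    64 * 1024 * 1024, 0, errp, true);
}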
5665 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5667 return bs->aio_context;
5670 void bdrv_detach_aio_context(BlockDriverState *bs)
5672 BdrvAioNotifier *baf;
5674 if (!bs->drv) {
5675 return;
5678 QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5679 baf->detach_aio_context(baf->opaque);
5682 if (bs->io_limits_enabled) {
5683 throttle_detach_aio_context(&bs->throttle_state);
5685 if (bs->drv->bdrv_detach_aio_context) {
5686 bs->drv->bdrv_detach_aio_context(bs);
5688 if (bs->file) {
5689 bdrv_detach_aio_context(bs->file);
5691 if (bs->backing_hd) {
5692 bdrv_detach_aio_context(bs->backing_hd);
5695 bs->aio_context = NULL;
5698 void bdrv_attach_aio_context(BlockDriverState *bs,
5699 AioContext *new_context)
5701 BdrvAioNotifier *ban;
5703 if (!bs->drv) {
5704 return;
5707 bs->aio_context = new_context;
5709 if (bs->backing_hd) {
5710 bdrv_attach_aio_context(bs->backing_hd, new_context);
5712 if (bs->file) {
5713 bdrv_attach_aio_context(bs->file, new_context);
5715 if (bs->drv->bdrv_attach_aio_context) {
5716 bs->drv->bdrv_attach_aio_context(bs, new_context);
5718 if (bs->io_limits_enabled) {
5719 throttle_attach_aio_context(&bs->throttle_state, new_context);
5722 QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5723 ban->attached_aio_context(new_context, ban->opaque);
5727 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5729 bdrv_drain_all(); /* ensure there are no in-flight requests */
5731 bdrv_detach_aio_context(bs);
5733 /* This function executes in the old AioContext so acquire the new one in
5734 * case it runs in a different thread.
5736 aio_context_acquire(new_context);
5737 bdrv_attach_aio_context(bs, new_context);
5738 aio_context_release(new_context);
5741 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5742 void (*attached_aio_context)(AioContext *new_context, void *opaque),
5743 void (*detach_aio_context)(void *opaque), void *opaque)
5745 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5746 *ban = (BdrvAioNotifier){
5747 .attached_aio_context = attached_aio_context,
5748 .detach_aio_context = detach_aio_context,
5749 .opaque = opaque
5752 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5755 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5756 void (*attached_aio_context)(AioContext *,
5757 void *),
5758 void (*detach_aio_context)(void *),
5759 void *opaque)
5761 BdrvAioNotifier *ban, *ban_next;
5763 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5764 if (ban->attached_aio_context == attached_aio_context &&
5765 ban->detach_aio_context == detach_aio_context &&
5766 ban->opaque == opaque)
5768 QLIST_REMOVE(ban, list);
5769 g_free(ban);
5771 return;
5775 abort();
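/* Usage sketch for the AioContext notifiers above (illustrative; the
 * callbacks are hypothetical):
 */
static void sketch_attached(AioContext *new_context, void *opaque)
{
    /* re-arm timers/fd handlers in the new context */
}

static void sketch_detach(void *opaque)
{
    /* quiesce and unregister from the old context */
}

static void sketch_track_context(BlockDriverState *bs, void *opaque)
{
    bdrv_add_aio_context_notifier(bs, sketch_attached, sketch_detach, opaque);
    /* ... later, removal must pass the exact same triple: */
    bdrv_remove_aio_context_notifier(bs, sketch_attached, sketch_detach,
                                     opaque);
}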
5778 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5779 NotifierWithReturn *notifier)
5781 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5784 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
5785 BlockDriverAmendStatusCB *status_cb)
5787 if (!bs->drv->bdrv_amend_options) {
5788 return -ENOTSUP;
5790 return bs->drv->bdrv_amend_options(bs, opts, status_cb);
5793 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5794  * of block filters and by bdrv_is_first_non_filter.
5795  * It is used to test whether the given bs is the candidate or to recurse
5796  * further into the node graph.
5798 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5799 BlockDriverState *candidate)
5801 /* return false if basic checks fail */
5802 if (!bs || !bs->drv) {
5803 return false;
5806 /* the code reached a non block filter driver -> check if bs is
5807  * the same as the candidate. This is the recursion termination condition.
5809 if (!bs->drv->is_filter) {
5810 return bs == candidate;
5812 /* Down this path the driver is a block filter driver */
5814 /* If the block filter recursion method is defined use it to recurse down
5815 * the node graph.
5817 if (bs->drv->bdrv_recurse_is_first_non_filter) {
5818 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5821 /* the driver is a block filter but does not allow recursion -> return false
5823 return false;
5826 /* This function checks if the candidate is the first non filter bs down its
5827  * bs chain. Since we don't have pointers to parents it explores all bs chains
5828  * from the top. Some filters can choose not to pass down the recursion.
5830 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5832 BlockDriverState *bs;
5834 /* walk down the bs forest recursively */
5835 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5836 bool perm;
5838 /* try to recurse in this top level bs */
5839 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5841 /* candidate is the first non filter */
5842 if (perm) {
5843 return true;
5847 return false;
5850 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5852 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5853 AioContext *aio_context;
5855 if (!to_replace_bs) {
5856 error_setg(errp, "Node name '%s' not found", node_name);
5857 return NULL;
5860 aio_context = bdrv_get_aio_context(to_replace_bs);
5861 aio_context_acquire(aio_context);
5863 if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5864 to_replace_bs = NULL;
5865 goto out;
5868 /* We don't want an arbitrary node of the BDS chain to be replaced, only the
5869  * topmost non-filter, in order to prevent data corruption.
5870  * Another benefit is that this test excludes backing files, which are
5871  * blocked by the backing blockers.
5873 if (!bdrv_is_first_non_filter(to_replace_bs)) {
5874 error_setg(errp, "Only top most non filter can be replaced");
5875 to_replace_bs = NULL;
5876 goto out;
5879 out:
5880 aio_context_release(aio_context);
5881 return to_replace_bs;
5884 void bdrv_io_plug(BlockDriverState *bs)
5886 BlockDriver *drv = bs->drv;
5887 if (drv && drv->bdrv_io_plug) {
5888 drv->bdrv_io_plug(bs);
5889 } else if (bs->file) {
5890 bdrv_io_plug(bs->file);
5894 void bdrv_io_unplug(BlockDriverState *bs)
5896 BlockDriver *drv = bs->drv;
5897 if (drv && drv->bdrv_io_unplug) {
5898 drv->bdrv_io_unplug(bs);
5899 } else if (bs->file) {
5900 bdrv_io_unplug(bs->file);
5904 void bdrv_flush_io_queue(BlockDriverState *bs)
5906 BlockDriver *drv = bs->drv;
5907 if (drv && drv->bdrv_flush_io_queue) {
5908 drv->bdrv_flush_io_queue(bs);
5909 } else if (bs->file) {
5910 bdrv_flush_io_queue(bs->file);
5914 static bool append_open_options(QDict *d, BlockDriverState *bs)
5916 const QDictEntry *entry;
5917 bool found_any = false;
5919 for (entry = qdict_first(bs->options); entry;
5920 entry = qdict_next(bs->options, entry))
5922 /* Only take options for this level and exclude all non-driver-specific
5923 * options */
5924 if (!strchr(qdict_entry_key(entry), '.') &&
5925 strcmp(qdict_entry_key(entry), "node-name"))
5927 qobject_incref(qdict_entry_value(entry));
5928 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5929 found_any = true;
5933 return found_any;
5936 /* Updates the following BDS fields:
5937 * - exact_filename: A filename which may be used for opening a block device
5938 * which (mostly) equals the given BDS (even without any
5939 * other options; so reading and writing must return the same
5940 * results, but caching etc. may be different)
5941 * - full_open_options: Options which, when given when opening a block device
5942 * (without a filename), result in a BDS (mostly)
5943 * equalling the given one
5944 * - filename: If exact_filename is set, it is copied here. Otherwise,
5945 * full_open_options is converted to a JSON object, prefixed with
5946 * "json:" (for use through the JSON pseudo protocol) and put here.
5948 void bdrv_refresh_filename(BlockDriverState *bs)
5950 BlockDriver *drv = bs->drv;
5951 QDict *opts;
5953 if (!drv) {
5954 return;
5957 /* This BDS's file name will most probably depend on its file's name, so
5958 * refresh that first */
5959 if (bs->file) {
5960 bdrv_refresh_filename(bs->file);
5963 if (drv->bdrv_refresh_filename) {
5964 /* Obsolete information is of no use here, so drop the old file name
5965 * information before refreshing it */
5966 bs->exact_filename[0] = '\0';
5967 if (bs->full_open_options) {
5968 QDECREF(bs->full_open_options);
5969 bs->full_open_options = NULL;
5972 drv->bdrv_refresh_filename(bs);
5973 } else if (bs->file) {
5974 /* Try to reconstruct valid information from the underlying file */
5975 bool has_open_options;
5977 bs->exact_filename[0] = '\0';
5978 if (bs->full_open_options) {
5979 QDECREF(bs->full_open_options);
5980 bs->full_open_options = NULL;
5983 opts = qdict_new();
5984 has_open_options = append_open_options(opts, bs);
5986 /* If no specific options have been given for this BDS, the filename of
5987 * the underlying file should suffice for this one as well */
5988 if (bs->file->exact_filename[0] && !has_open_options) {
5989 strcpy(bs->exact_filename, bs->file->exact_filename);
5991 /* Reconstructing the full options QDict is simple for most format block
5992 * drivers, as long as the full options are known for the underlying
5993 * file BDS. The full options QDict of that file BDS should somehow
5994 * contain a representation of the filename, therefore the following
5995 * suffices without querying the (exact_)filename of this BDS. */
5996 if (bs->file->full_open_options) {
5997 qdict_put_obj(opts, "driver",
5998 QOBJECT(qstring_from_str(drv->format_name)));
5999 QINCREF(bs->file->full_open_options);
6000 qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6002 bs->full_open_options = opts;
6003 } else {
6004 QDECREF(opts);
6006 } else if (!bs->full_open_options && qdict_size(bs->options)) {
6007 /* There is no underlying file BDS (at least referenced by BDS.file),
6008 * so the full options QDict should be equal to the options given
6009 * specifically for this block device when it was opened (plus the
6010 * driver specification).
6011 * Because those options don't change, there is no need to update
6012 * full_open_options when it's already set. */
6014 opts = qdict_new();
6015 append_open_options(opts, bs);
6016 qdict_put_obj(opts, "driver",
6017 QOBJECT(qstring_from_str(drv->format_name)));
6019 if (bs->exact_filename[0]) {
6020 /* This may not work for all block protocol drivers (some may
6021 * require this filename to be parsed), but we have to find some
6022 * default solution here, so just include it. If some block driver
6023 * does not support pure options without any filename at all or
6024 * needs some special format of the options QDict, it needs to
6025 * implement the driver-specific bdrv_refresh_filename() function.
6027 qdict_put_obj(opts, "filename",
6028 QOBJECT(qstring_from_str(bs->exact_filename)));
6031 bs->full_open_options = opts;
6034 if (bs->exact_filename[0]) {
6035 pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6036 } else if (bs->full_open_options) {
6037 QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6038 snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6039 qstring_get_str(json));
6040 QDECREF(json);
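/* Example of the "json:" fallback above (illustrative; the option keys are
 * hypothetical): a qcow2 node opened over NBD has no plain filename, so
 * bs->filename might end up as
 *     json:{"driver": "qcow2", "file": {"driver": "nbd", "host": "..."}}
 */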
6044 /* The purpose of this accessor function is to allow device models to access
6045  * the BlockAcctStats structure embedded inside a BlockDriverState without
6046  * being aware of the BlockDriverState structure layout.
6047  * It will go away when the BlockAcctStats structure is moved inside
6048  * the device models.
6050 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6052 return &bs->stats;