/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all throttled I/O requests */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
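
/*
 * Example (hypothetical caller, not part of this file): capping a device
 * at roughly 10 MB/s of total bandwidth.  The ThrottleConfig field names
 * follow util/throttle.h; the limit value is an arbitrary choice.
 *
 *     ThrottleConfig cfg;
 *
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024;
 *
 *     bdrv_io_limits_enable(bs);     // must come first, see comment above
 *     bdrv_set_io_limits(bs, &cfg);  // then install the actual limits
 */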

/* This function makes an I/O wait if needed
 *
 * @bytes:      the size of the I/O in bytes
 * @is_write:   is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already queued,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
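
/*
 * Example (hypothetical inputs): resolving a relative backing file name
 * against the image that references it.  With
 * base_path = "/images/overlay.qcow2" and filename = "base.qcow2", dest
 * becomes "/images/base.qcow2"; an absolute filename is copied unchanged.
 *
 *     char dest[PATH_MAX];
 *
 *     path_combine(dest, sizeof(dest),
 *                  "/images/overlay.qcow2", "base.qcow2");
 */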

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
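
/*
 * Example (hypothetical caller): creating a 1 GiB qcow2 image through the
 * driver's QemuOpts interface, mirroring what bdrv_append_temp_snapshot()
 * below does for its temporary overlay.  Filename and size are made up.
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QemuOpts *opts = qemu_opts_create(drv->create_opts, NULL, 0,
 *                                       &error_abort);
 *     Error *err = NULL;
 *
 *     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);
 *     if (bdrv_create(drv, "test.qcow2", opts, &err) < 0) {
 *         error_report("%s", error_get_pretty(err));
 *         error_free(err);
 *     }
 *     qemu_opts_del(opts);
 */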

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

/*
 * Guess image format by probing its contents.
 * This is not a good idea when your image is raw (CVE-2008-2004), but
 * we do it anyway for backward compatibility.
 *
 * @buf         contains the image's first @buf_size bytes.
 * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
 *              but can be smaller if the image file is smaller)
 * @filename    is its filename.
 *
 * For all block drivers, call the bdrv_probe() method to get its
 * probing score.
 * Return the first block driver with the highest probing score.
 */
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                            const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe) {
            score = d->bdrv_probe(buf, buf_size, filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    BlockDriver *drv;
    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    drv = bdrv_probe_all(buf, ret, filename);
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
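
/*
 * Example (hypothetical caller): translating the user-visible cache and
 * discard mode strings into BDRV_O_* open flags before calling bdrv_open().
 *
 *     int flags = 0;
 *
 *     if (bdrv_parse_cache_flags("none", &flags) < 0) {
 *         // invalid cache mode string
 *     }
 *     // flags now holds BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *
 *     if (bdrv_parse_discard_flags("unmap", &flags) < 0) {
 *         // invalid discard mode string
 *     }
 *     // BDRV_O_UNMAP is now set as well
 */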

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called with a protocol driver directly as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}

/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
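
/*
 * Example (hypothetical caller): given a flattened options QDict such as
 *
 *     { "file.driver": "file", "file.filename": "/tmp/test.img" }
 *
 * bdrv_open_image() extracts everything under the "file." prefix and opens
 * that BlockdevRef as its own BDS:
 *
 *     BlockDriverState *file = NULL;
 *
 *     ret = bdrv_open_image(&file, NULL, options, "file",
 *                           bdrv_inherited_flags(flags), true, &err);
 *
 * This mirrors how bdrv_open() below opens its protocol layer.
 */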

int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
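
/*
 * Example (hypothetical caller): opening a qcow2 image read-write with an
 * explicit driver instead of relying on format probing.  The filename is
 * made up.
 *
 *     BlockDriverState *bs = NULL;
 *     QDict *opts = qdict_new();
 *     Error *err = NULL;
 *
 *     qdict_put(opts, "driver", qstring_from_str("qcow2"));
 *     if (bdrv_open(&bs, "/tmp/test.qcow2", NULL, opts,
 *                   BDRV_O_RDWR | BDRV_O_CACHE_WB, NULL, &err) < 0) {
 *         error_report("%s", error_get_pretty(err));
 *         error_free(err);
 *     }
 */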

typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
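
/*
 * Example (hypothetical caller): atomically reopening two devices
 * read-only in one transaction.  If either prepare step fails, neither
 * device changes state.
 *
 *     BlockReopenQueue *queue;
 *     Error *err = NULL;
 *
 *     queue = bdrv_reopen_queue(NULL, bs_a,
 *                               bs_a->open_flags & ~BDRV_O_RDWR);
 *     queue = bdrv_reopen_queue(queue, bs_b,
 *                               bs_b->open_flags & ~BDRV_O_RDWR);
 *     if (bdrv_reopen_multiple(queue, &err) < 0) {
 *         error_report("%s", error_get_pretty(err));
 *         error_free(err);
 *     }
 */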

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  bdrv_get_device_name(reopen_state->bs));
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, bdrv_get_device_name(reopen_state->bs),
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs, NULL);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}

void bdrv_close(BlockDriverState *bs)
{
    BdrvAioNotifier *ban, *ban_next;

    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            BlockDriverState *backing_hd = bs->backing_hd;
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;
        QDECREF(bs->full_open_options);
        bs->full_open_options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    if (bs->blk) {
        blk_dev_change_media_cb(bs->blk, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
    QLIST_INIT(&bs->aio_notifiers);
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_close(bs);
        aio_context_release(aio_context);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

static bool bdrv_drain_one(BlockDriverState *bs)
{
    bool bs_busy;

    bdrv_flush_io_queue(bs);
    bdrv_start_throttled_reqs(bs);
    bs_busy = bdrv_requests_pending(bs);
    bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
    return bs_busy;
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree
 *
 * See the warning in bdrv_drain_all(). This function can only be called if
 * you are sure nothing can generate I/O because you have op blockers
 * installed.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void bdrv_drain(BlockDriverState *bs)
{
    while (bdrv_drain_one(bs)) {
        /* Keep iterating */
    }
}
1961 * This function does not flush data to disk, use bdrv_flush_all() for that
1962 * after calling this function.
1964 * Note that completion of an asynchronous I/O operation can trigger any
1965 * number of other I/O operations on other devices---for example a coroutine
1966 * can be arbitrarily complex and a constant flow of I/O can come until the
1967 * coroutine is complete. Because of this, it is not possible to have a
1968 * function to drain a single device's I/O queue.
1970 void bdrv_drain_all(void)
1972 /* Always run first iteration so any pending completion BHs run */
1973 bool busy = true;
1974 BlockDriverState *bs;
1976 while (busy) {
1977 busy = false;
1979 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1980 AioContext *aio_context = bdrv_get_aio_context(bs);
1982 aio_context_acquire(aio_context);
1983 busy |= bdrv_drain_one(bs);
1984 aio_context_release(aio_context);

/* make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists.
   Also, NUL-terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    /*
     * Take care to remove bs from bdrv_states only when it's actually
     * in it.  Note that bs->device_list.tqe_prev is initially null,
     * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
     * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
     * resetting it to null on remove.
     */
    if (bs->device_list.tqe_prev) {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
        bs->device_list.tqe_prev = NULL;
    }
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */

    /* dev info */
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    bs_dest->device_list = bs_src->device_list;
    bs_dest->blk = bs_src->blk;

    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
           sizeof(bs_dest->op_blockers));
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* The code needs to swap the node_name but simply swapping node_list won't
     * work so first remove the nodes from the graph list, do the swap then
     * insert them back if needed.
     */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
    }

    /* bs_new must be unattached and shouldn't have anything fancy enabled */
    assert(!bs_new->blk);
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new must remain unattached */
    assert(!bs_new->blk);

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* insert the nodes back into the graph node list if needed */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
    }

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
2126 * Add new bs contents at the top of an image chain while the chain is
2127 * live, while keeping required fields on the top layer.
2129 * This will modify the BlockDriverState fields, and swap contents
2130 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2132 * bs_new must not be attached to a BlockBackend.
2134 * This function does not create any image files.
2136 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2138 bdrv_swap(bs_new, bs_top);
2140 /* After the swap, bs_top holds the new image's contents and bs_new
2141 * holds the old top's, so make bs_new the backing file of bs_top. */
2142 bdrv_set_backing_hd(bs_top, bs_new);
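/* Illustrative sketch (not part of the original file): how a caller might
 * install a freshly opened overlay on top of a live chain with
 * bdrv_append(). Image creation and error handling are elided, and the
 * helper name, "overlay_filename" and the bdrv_open() flags used here are
 * assumptions. */
static int example_install_overlay(BlockDriverState *active,
                                   const char *overlay_filename,
                                   Error **errp)
{
    BlockDriverState *overlay = NULL;
    int ret;

    /* Open the new top image; it must not be attached to a BlockBackend
     * yet, which is exactly what bdrv_append() asserts. */
    ret = bdrv_open(&overlay, overlay_filename, NULL, NULL,
                    BDRV_O_RDWR | BDRV_O_CACHE_WB, NULL, errp);
    if (ret < 0) {
        return ret;
    }

    /* After this call the device keeps using 'active', whose contents are
     * now the overlay's; the old contents live on as its backing file. */
    bdrv_append(overlay, active);
    return 0;
}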
2145 static void bdrv_delete(BlockDriverState *bs)
2147 assert(!bs->job);
2148 assert(bdrv_op_blocker_is_empty(bs));
2149 assert(!bs->refcnt);
2150 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2152 bdrv_close(bs);
2154 /* remove from list, if necessary */
2155 bdrv_make_anon(bs);
2157 g_free(bs);
2161 * Run consistency checks on an image
2163 * Returns 0 if the check could be completed (it doesn't mean that the image is
2164 * free of errors) or -errno when an internal error occurred. The results of the
2165 * check are stored in res.
2167 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2169 if (bs->drv == NULL) {
2170 return -ENOMEDIUM;
2172 if (bs->drv->bdrv_check == NULL) {
2173 return -ENOTSUP;
2176 memset(res, 0, sizeof(*res));
2177 return bs->drv->bdrv_check(bs, res, fix);
2180 #define COMMIT_BUF_SECTORS 2048
2182 /* commit COW file into the raw image */
2183 int bdrv_commit(BlockDriverState *bs)
2185 BlockDriver *drv = bs->drv;
2186 int64_t sector, total_sectors, length, backing_length;
2187 int n, ro, open_flags;
2188 int ret = 0;
2189 uint8_t *buf = NULL;
2190 char filename[PATH_MAX];
2192 if (!drv)
2193 return -ENOMEDIUM;
2195 if (!bs->backing_hd) {
2196 return -ENOTSUP;
2199 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2200 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2201 return -EBUSY;
2204 ro = bs->backing_hd->read_only;
2205 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2206 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2207 open_flags = bs->backing_hd->open_flags;
2209 if (ro) {
2210 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2211 return -EACCES;
2215 length = bdrv_getlength(bs);
2216 if (length < 0) {
2217 ret = length;
2218 goto ro_cleanup;
2221 backing_length = bdrv_getlength(bs->backing_hd);
2222 if (backing_length < 0) {
2223 ret = backing_length;
2224 goto ro_cleanup;
2227 /* If our top snapshot is larger than the backing file image,
2228 * grow the backing file image if possible. If not possible,
2229 * we must return an error */
2230 if (length > backing_length) {
2231 ret = bdrv_truncate(bs->backing_hd, length);
2232 if (ret < 0) {
2233 goto ro_cleanup;
2237 total_sectors = length >> BDRV_SECTOR_BITS;
2239 /* qemu_try_blockalign() for bs will choose an alignment that works for
2240 * bs->backing_hd as well, so no need to compare the alignment manually. */
2241 buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2242 if (buf == NULL) {
2243 ret = -ENOMEM;
2244 goto ro_cleanup;
2247 for (sector = 0; sector < total_sectors; sector += n) {
2248 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2249 if (ret < 0) {
2250 goto ro_cleanup;
2252 if (ret) {
2253 ret = bdrv_read(bs, sector, buf, n);
2254 if (ret < 0) {
2255 goto ro_cleanup;
2258 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2259 if (ret < 0) {
2260 goto ro_cleanup;
2265 if (drv->bdrv_make_empty) {
2266 ret = drv->bdrv_make_empty(bs);
2267 if (ret < 0) {
2268 goto ro_cleanup;
2270 bdrv_flush(bs);
2274 * Make sure all data we wrote to the backing device is actually
2275 * stable on disk.
2277 if (bs->backing_hd) {
2278 bdrv_flush(bs->backing_hd);
2281 ret = 0;
2282 ro_cleanup:
2283 qemu_vfree(buf);
2285 if (ro) {
2286 /* ignoring error return here */
2287 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2290 return ret;
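/* Illustrative sketch (not in the original source): committing one
 * device's COW overlay into its backing file and reporting the documented
 * error codes. The helper name is an assumption. */
static void example_commit_device(BlockDriverState *bs)
{
    int ret = bdrv_commit(bs);

    switch (ret) {
    case 0:
        break;
    case -ENOTSUP:
        error_report("image has no backing file to commit into");
        break;
    case -EBUSY:
        error_report("device is busy (e.g. a block job is running)");
        break;
    default:
        error_report("commit failed: %s", strerror(-ret));
        break;
    }
}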
2293 int bdrv_commit_all(void)
2295 BlockDriverState *bs;
2297 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2298 AioContext *aio_context = bdrv_get_aio_context(bs);
2300 aio_context_acquire(aio_context);
2301 if (bs->drv && bs->backing_hd) {
2302 int ret = bdrv_commit(bs);
2303 if (ret < 0) {
2304 aio_context_release(aio_context);
2305 return ret;
2308 aio_context_release(aio_context);
2310 return 0;
2314 * Remove an active request from the tracked requests list
2316 * This function should be called when a tracked request is completing.
2318 static void tracked_request_end(BdrvTrackedRequest *req)
2320 if (req->serialising) {
2321 req->bs->serialising_in_flight--;
2324 QLIST_REMOVE(req, list);
2325 qemu_co_queue_restart_all(&req->wait_queue);
2329 * Add an active request to the tracked requests list
2331 static void tracked_request_begin(BdrvTrackedRequest *req,
2332 BlockDriverState *bs,
2333 int64_t offset,
2334 unsigned int bytes, bool is_write)
2336 *req = (BdrvTrackedRequest){
2337 .bs = bs,
2338 .offset = offset,
2339 .bytes = bytes,
2340 .is_write = is_write,
2341 .co = qemu_coroutine_self(),
2342 .serialising = false,
2343 .overlap_offset = offset,
2344 .overlap_bytes = bytes,
2347 qemu_co_queue_init(&req->wait_queue);
2349 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2352 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2354 int64_t overlap_offset = req->offset & ~(align - 1);
2355 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2356 - overlap_offset;
2358 if (!req->serialising) {
2359 req->bs->serialising_in_flight++;
2360 req->serialising = true;
2363 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2364 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
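/* Worked example (illustration only): with align = 4096, a request at
 * offset = 5120 with bytes = 1024 is rounded to overlap_offset = 4096 and
 * overlap_bytes = 4096, i.e. the whole 4 KiB block containing the request
 * is considered when checking for overlaps with other requests. */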
2368 * Round a region to cluster boundaries
2370 void bdrv_round_to_clusters(BlockDriverState *bs,
2371 int64_t sector_num, int nb_sectors,
2372 int64_t *cluster_sector_num,
2373 int *cluster_nb_sectors)
2375 BlockDriverInfo bdi;
2377 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2378 *cluster_sector_num = sector_num;
2379 *cluster_nb_sectors = nb_sectors;
2380 } else {
2381 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2382 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2383 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2384 nb_sectors, c);
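/* Worked example (illustration only): for a 64 KiB cluster size
 * (c = 128 sectors), sector_num = 100 and nb_sectors = 20 are widened to
 * cluster_sector_num = 0 and cluster_nb_sectors = 128, covering the whole
 * cluster that the request touches. */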
2388 static int bdrv_get_cluster_size(BlockDriverState *bs)
2390 BlockDriverInfo bdi;
2391 int ret;
2393 ret = bdrv_get_info(bs, &bdi);
2394 if (ret < 0 || bdi.cluster_size == 0) {
2395 return bs->request_alignment;
2396 } else {
2397 return bdi.cluster_size;
2401 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2402 int64_t offset, unsigned int bytes)
2404 /* aaaa bbbb */
2405 if (offset >= req->overlap_offset + req->overlap_bytes) {
2406 return false;
2408 /* bbbb aaaa */
2409 if (req->overlap_offset >= offset + bytes) {
2410 return false;
2412 return true;
2415 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2417 BlockDriverState *bs = self->bs;
2418 BdrvTrackedRequest *req;
2419 bool retry;
2420 bool waited = false;
2422 if (!bs->serialising_in_flight) {
2423 return false;
2426 do {
2427 retry = false;
2428 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2429 if (req == self || (!req->serialising && !self->serialising)) {
2430 continue;
2432 if (tracked_request_overlaps(req, self->overlap_offset,
2433 self->overlap_bytes))
2435 /* Hitting this means there was a reentrant request, for
2436 * example, a block driver issuing nested requests. This must
2437 * never happen since it means deadlock.
2439 assert(qemu_coroutine_self() != req->co);
2441 /* If the request is already (indirectly) waiting for us, or
2442 * will wait for us as soon as it wakes up, then just go on
2443 * (instead of producing a deadlock in the former case). */
2444 if (!req->waiting_for) {
2445 self->waiting_for = req;
2446 qemu_co_queue_wait(&req->wait_queue);
2447 self->waiting_for = NULL;
2448 retry = true;
2449 waited = true;
2450 break;
2454 } while (retry);
2456 return waited;
2460 * Return values:
2461 * 0 - success
2462 * -EINVAL - backing format specified, but no file
2463 * -ENOSPC - can't update the backing file because no space is left in the
2464 * image file header
2465 * -ENOTSUP - format driver doesn't support changing the backing file
2467 int bdrv_change_backing_file(BlockDriverState *bs,
2468 const char *backing_file, const char *backing_fmt)
2470 BlockDriver *drv = bs->drv;
2471 int ret;
2473 /* Backing file format doesn't make sense without a backing file */
2474 if (backing_fmt && !backing_file) {
2475 return -EINVAL;
2478 if (drv->bdrv_change_backing_file != NULL) {
2479 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2480 } else {
2481 ret = -ENOTSUP;
2484 if (ret == 0) {
2485 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2486 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2488 return ret;
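/* Illustrative sketch (not in the original source): rebasing an image onto
 * a new backing file and handling the error codes documented above. The
 * helper name and the hard-coded "qcow2" format are assumptions. */
static int example_rebase(BlockDriverState *bs, const char *new_backing)
{
    int ret = bdrv_change_backing_file(bs, new_backing, "qcow2");

    if (ret == -ENOSPC) {
        error_report("no space left in the image header for the new "
                     "backing file name");
    } else if (ret == -ENOTSUP) {
        error_report("format driver cannot change the backing file");
    }
    return ret;
}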
2492 * Finds the image layer in the chain that has 'bs' as its backing file.
2494 * active is the current topmost image.
2496 * Returns NULL if bs is not found in active's image chain,
2497 * or if active == bs.
2499 * Returns the bottommost base image if bs == NULL.
2501 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2502 BlockDriverState *bs)
2504 while (active && bs != active->backing_hd) {
2505 active = active->backing_hd;
2508 return active;
2511 /* Given a BDS, searches for the base layer. */
2512 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2514 return bdrv_find_overlay(bs, NULL);
2517 typedef struct BlkIntermediateStates {
2518 BlockDriverState *bs;
2519 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2520 } BlkIntermediateStates;
2524 * Drops images above 'base' up to and including 'top', and sets the image
2525 * above 'top' to have base as its backing file.
2527 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2528 * information in 'bs' can be properly updated.
2530 * E.g., this will convert the following chain:
2531 * bottom <- base <- intermediate <- top <- active
2533 * to
2535 * bottom <- base <- active
2537 * It is allowed for bottom==base, in which case it converts:
2539 * base <- intermediate <- top <- active
2541 * to
2543 * base <- active
2545 * If backing_file_str is non-NULL, it will be used when modifying top's
2546 * overlay image metadata.
2548 * Error conditions:
2549 * if active == top, that is considered an error
2552 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2553 BlockDriverState *base, const char *backing_file_str)
2555 BlockDriverState *intermediate;
2556 BlockDriverState *base_bs = NULL;
2557 BlockDriverState *new_top_bs = NULL;
2558 BlkIntermediateStates *intermediate_state, *next;
2559 int ret = -EIO;
2561 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2562 QSIMPLEQ_INIT(&states_to_delete);
2564 if (!top->drv || !base->drv) {
2565 goto exit;
2568 new_top_bs = bdrv_find_overlay(active, top);
2570 if (new_top_bs == NULL) {
2571 /* we could not find the image above 'top', this is an error */
2572 goto exit;
2575 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2576 * to do, no intermediate images */
2577 if (new_top_bs->backing_hd == base) {
2578 ret = 0;
2579 goto exit;
2582 intermediate = top;
2584 /* now we will go down through the list, and add each BDS we find
2585 * into our deletion queue, until we hit the 'base'
2587 while (intermediate) {
2588 intermediate_state = g_new0(BlkIntermediateStates, 1);
2589 intermediate_state->bs = intermediate;
2590 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2592 if (intermediate->backing_hd == base) {
2593 base_bs = intermediate->backing_hd;
2594 break;
2596 intermediate = intermediate->backing_hd;
2598 if (base_bs == NULL) {
2599 /* something went wrong: we did not end at the base. Safely
2600 * unravel everything and exit with an error */
2601 goto exit;
2604 /* success - we can delete the intermediate states, and link top->base */
2605 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2606 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2607 base_bs->drv ? base_bs->drv->format_name : "");
2608 if (ret) {
2609 goto exit;
2611 bdrv_set_backing_hd(new_top_bs, base_bs);
2613 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2614 /* so that bdrv_close() does not recursively close the chain */
2615 bdrv_set_backing_hd(intermediate_state->bs, NULL);
2616 bdrv_unref(intermediate_state->bs);
2618 ret = 0;
2620 exit:
2621 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2622 g_free(intermediate_state);
2624 return ret;
2628 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2629 size_t size)
2631 int64_t len;
2633 if (size > INT_MAX) {
2634 return -EIO;
2637 if (!bdrv_is_inserted(bs))
2638 return -ENOMEDIUM;
2640 if (bs->growable)
2641 return 0;
2643 len = bdrv_getlength(bs);
2645 if (offset < 0)
2646 return -EIO;
2648 if ((offset > len) || (len - offset < size))
2649 return -EIO;
2651 return 0;
2654 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2655 int nb_sectors)
2657 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2658 return -EIO;
2661 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2662 nb_sectors * BDRV_SECTOR_SIZE);
2665 typedef struct RwCo {
2666 BlockDriverState *bs;
2667 int64_t offset;
2668 QEMUIOVector *qiov;
2669 bool is_write;
2670 int ret;
2671 BdrvRequestFlags flags;
2672 } RwCo;
2674 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2676 RwCo *rwco = opaque;
2678 if (!rwco->is_write) {
2679 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2680 rwco->qiov->size, rwco->qiov,
2681 rwco->flags);
2682 } else {
2683 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2684 rwco->qiov->size, rwco->qiov,
2685 rwco->flags);
2690 * Process a vectored synchronous request using coroutines
2692 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2693 QEMUIOVector *qiov, bool is_write,
2694 BdrvRequestFlags flags)
2696 Coroutine *co;
2697 RwCo rwco = {
2698 .bs = bs,
2699 .offset = offset,
2700 .qiov = qiov,
2701 .is_write = is_write,
2702 .ret = NOT_DONE,
2703 .flags = flags,
2707 * In a synchronous call context the vcpu is blocked, so the throttling
2708 * timer will never fire; I/O throttling therefore has to be disabled here
2709 * if it has been enabled.
2711 if (bs->io_limits_enabled) {
2712 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2713 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2714 bdrv_io_limits_disable(bs);
2717 if (qemu_in_coroutine()) {
2718 /* Fast-path if already in coroutine context */
2719 bdrv_rw_co_entry(&rwco);
2720 } else {
2721 AioContext *aio_context = bdrv_get_aio_context(bs);
2723 co = qemu_coroutine_create(bdrv_rw_co_entry);
2724 qemu_coroutine_enter(co, &rwco);
2725 while (rwco.ret == NOT_DONE) {
2726 aio_poll(aio_context, true);
2729 return rwco.ret;
2733 * Process a synchronous request using coroutines
2735 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2736 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2738 QEMUIOVector qiov;
2739 struct iovec iov = {
2740 .iov_base = (void *)buf,
2741 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2744 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2745 return -EINVAL;
2748 qemu_iovec_init_external(&qiov, &iov, 1);
2749 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2750 &qiov, is_write, flags);
2753 /* return < 0 if error. See bdrv_write() for the return codes */
2754 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2755 uint8_t *buf, int nb_sectors)
2757 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2760 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2761 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2762 uint8_t *buf, int nb_sectors)
2764 bool enabled;
2765 int ret;
2767 enabled = bs->io_limits_enabled;
2768 bs->io_limits_enabled = false;
2769 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2770 bs->io_limits_enabled = enabled;
2771 return ret;
2774 /* Return < 0 if error. Important errors are:
2775 -EIO generic I/O error (may happen for all errors)
2776 -ENOMEDIUM No media inserted.
2777 -EINVAL Invalid sector number or nb_sectors
2778 -EACCES Trying to write a read-only device
2780 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2781 const uint8_t *buf, int nb_sectors)
2783 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2786 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2787 int nb_sectors, BdrvRequestFlags flags)
2789 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2790 BDRV_REQ_ZERO_WRITE | flags);
2794 * Completely zero out a block device with the help of bdrv_write_zeroes.
2795 * The operation is sped up by checking the block status and only writing
2796 * zeroes to the device if they currently do not return zeroes. Optional
2797 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2799 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2801 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2803 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2804 int n;
2806 target_sectors = bdrv_nb_sectors(bs);
2807 if (target_sectors < 0) {
2808 return target_sectors;
2811 for (;;) {
2812 nb_sectors = target_sectors - sector_num;
2813 if (nb_sectors <= 0) {
2814 return 0;
2816 if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2817 nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
2819 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2820 if (ret < 0) {
2821 error_report("error getting block status at sector %" PRId64 ": %s",
2822 sector_num, strerror(-ret));
2823 return ret;
2825 if (ret & BDRV_BLOCK_ZERO) {
2826 sector_num += n;
2827 continue;
2829 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2830 if (ret < 0) {
2831 error_report("error writing zeroes at sector %" PRId64 ": %s",
2832 sector_num, strerror(-ret));
2833 return ret;
2835 sector_num += n;
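/* Illustrative sketch (not in the original source): zeroing out a whole
 * device, letting the driver unmap clusters where it can. The helper name
 * is an assumption. */
static int example_wipe_device(BlockDriverState *bs)
{
    /* BDRV_REQ_MAY_UNMAP permits discarding instead of writing zeroes. */
    return bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
}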
2839 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2841 QEMUIOVector qiov;
2842 struct iovec iov = {
2843 .iov_base = (void *)buf,
2844 .iov_len = bytes,
2846 int ret;
2848 if (bytes < 0) {
2849 return -EINVAL;
2852 qemu_iovec_init_external(&qiov, &iov, 1);
2853 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2854 if (ret < 0) {
2855 return ret;
2858 return bytes;
2861 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2863 int ret;
2865 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2866 if (ret < 0) {
2867 return ret;
2870 return qiov->size;
2873 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2874 const void *buf, int bytes)
2876 QEMUIOVector qiov;
2877 struct iovec iov = {
2878 .iov_base = (void *) buf,
2879 .iov_len = bytes,
2882 if (bytes < 0) {
2883 return -EINVAL;
2886 qemu_iovec_init_external(&qiov, &iov, 1);
2887 return bdrv_pwritev(bs, offset, &qiov);
2891 * Writes to the file and ensures that no writes are reordered across this
2892 * request (acts as a barrier)
2894 * Returns 0 on success, -errno in error cases.
2896 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2897 const void *buf, int count)
2899 int ret;
2901 ret = bdrv_pwrite(bs, offset, buf, count);
2902 if (ret < 0) {
2903 return ret;
2906 /* No flush needed for cache modes that already do it */
2907 if (bs->enable_write_cache) {
2908 bdrv_flush(bs);
2911 return 0;
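/* Illustrative sketch (not in the original source): updating a small piece
 * of image metadata with barrier semantics, as format drivers typically do
 * for header fields. The offset and payload are made up for illustration. */
static int example_update_header_field(BlockDriverState *bs)
{
    uint32_t magic = cpu_to_be32(0x514649fb);    /* example payload */

    /* bdrv_pwrite_sync() flushes afterwards unless the cache mode already
     * guarantees it, so no later write can overtake this one. */
    return bdrv_pwrite_sync(bs, 0, &magic, sizeof(magic));
}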
2914 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2915 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2917 /* Perform I/O through a temporary buffer so that users who scribble over
2918 * their read buffer while the operation is in progress do not end up
2919 * modifying the image file. This is critical for zero-copy guest I/O
2920 * where anything might happen inside guest memory.
2922 void *bounce_buffer;
2924 BlockDriver *drv = bs->drv;
2925 struct iovec iov;
2926 QEMUIOVector bounce_qiov;
2927 int64_t cluster_sector_num;
2928 int cluster_nb_sectors;
2929 size_t skip_bytes;
2930 int ret;
2932 /* Cover entire cluster so no additional backing file I/O is required when
2933 * allocating cluster in the image file.
2935 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2936 &cluster_sector_num, &cluster_nb_sectors);
2938 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2939 cluster_sector_num, cluster_nb_sectors);
2941 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2942 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2943 if (bounce_buffer == NULL) {
2944 ret = -ENOMEM;
2945 goto err;
2948 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2950 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2951 &bounce_qiov);
2952 if (ret < 0) {
2953 goto err;
2956 if (drv->bdrv_co_write_zeroes &&
2957 buffer_is_zero(bounce_buffer, iov.iov_len)) {
2958 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2959 cluster_nb_sectors, 0);
2960 } else {
2961 /* This does not change the data on the disk, so it is not necessary
2962 * to flush even in cache=writethrough mode.
2964 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2965 &bounce_qiov);
2968 if (ret < 0) {
2969 /* It might be okay to ignore write errors for guest requests. If this
2970 * is a deliberate copy-on-read then we don't want to ignore the error.
2971 * Simply report it in all cases.
2973 goto err;
2976 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2977 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2978 nb_sectors * BDRV_SECTOR_SIZE);
2980 err:
2981 qemu_vfree(bounce_buffer);
2982 return ret;
2986 * Forwards an already correctly aligned request to the BlockDriver. This
2987 * handles copy on read and zeroing after EOF; any other features must be
2988 * implemented by the caller.
2990 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2991 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2992 int64_t align, QEMUIOVector *qiov, int flags)
2994 BlockDriver *drv = bs->drv;
2995 int ret;
2997 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2998 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3000 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3001 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3002 assert(!qiov || bytes == qiov->size);
3004 /* Handle Copy on Read and associated serialisation */
3005 if (flags & BDRV_REQ_COPY_ON_READ) {
3006 /* If we touch the same cluster it counts as an overlap. This
3007 * guarantees that allocating writes will be serialized and not race
3008 * with each other for the same cluster. For example, in copy-on-read
3009 * it ensures that the CoR read and write operations are atomic and
3010 * guest writes cannot interleave between them. */
3011 mark_request_serialising(req, bdrv_get_cluster_size(bs));
3014 wait_serialising_requests(req);
3016 if (flags & BDRV_REQ_COPY_ON_READ) {
3017 int pnum;
3019 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3020 if (ret < 0) {
3021 goto out;
3024 if (!ret || pnum != nb_sectors) {
3025 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3026 goto out;
3030 /* Forward the request to the BlockDriver */
3031 if (!(bs->zero_beyond_eof && bs->growable)) {
3032 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3033 } else {
3034 /* Read zeroes after EOF of growable BDSes */
3035 int64_t total_sectors, max_nb_sectors;
3037 total_sectors = bdrv_nb_sectors(bs);
3038 if (total_sectors < 0) {
3039 ret = total_sectors;
3040 goto out;
3043 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3044 align >> BDRV_SECTOR_BITS);
3045 if (max_nb_sectors > 0) {
3046 QEMUIOVector local_qiov;
3047 size_t local_sectors;
3049 max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_SIZE); /* avoid size_t overflow */
3050 local_sectors = MIN(max_nb_sectors, nb_sectors);
3052 qemu_iovec_init(&local_qiov, qiov->niov);
3053 qemu_iovec_concat(&local_qiov, qiov, 0,
3054 local_sectors * BDRV_SECTOR_SIZE);
3056 ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3057 &local_qiov);
3059 qemu_iovec_destroy(&local_qiov);
3060 } else {
3061 ret = 0;
3064 /* Reading beyond end of file is supposed to produce zeroes */
3065 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3066 uint64_t offset = MAX(0, total_sectors - sector_num);
3067 uint64_t bytes = (sector_num + nb_sectors - offset) *
3068 BDRV_SECTOR_SIZE;
3069 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3073 out:
3074 return ret;
3078 * Handle a read request in coroutine context
3080 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3081 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3082 BdrvRequestFlags flags)
3084 BlockDriver *drv = bs->drv;
3085 BdrvTrackedRequest req;
3087 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3088 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3089 uint8_t *head_buf = NULL;
3090 uint8_t *tail_buf = NULL;
3091 QEMUIOVector local_qiov;
3092 bool use_local_qiov = false;
3093 int ret;
3095 if (!drv) {
3096 return -ENOMEDIUM;
3098 if (bdrv_check_byte_request(bs, offset, bytes)) {
3099 return -EIO;
3102 if (bs->copy_on_read) {
3103 flags |= BDRV_REQ_COPY_ON_READ;
3106 /* throttling disk I/O */
3107 if (bs->io_limits_enabled) {
3108 bdrv_io_limits_intercept(bs, bytes, false);
3111 /* Align read if necessary by padding qiov */
3112 if (offset & (align - 1)) {
3113 head_buf = qemu_blockalign(bs, align);
3114 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3115 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3116 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3117 use_local_qiov = true;
3119 bytes += offset & (align - 1);
3120 offset = offset & ~(align - 1);
3123 if ((offset + bytes) & (align - 1)) {
3124 if (!use_local_qiov) {
3125 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3126 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3127 use_local_qiov = true;
3129 tail_buf = qemu_blockalign(bs, align);
3130 qemu_iovec_add(&local_qiov, tail_buf,
3131 align - ((offset + bytes) & (align - 1)));
3133 bytes = ROUND_UP(bytes, align);
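    /* Illustration (not in the original source): with align = 4096, a read
     * of bytes = 1024 at offset = 5120 becomes offset = 4096 and
     * bytes = 4096, with a 1024-byte head_buf and a 2048-byte tail pad
     * appended to local_qiov; the guest-visible iovecs sit in between. */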
3136 tracked_request_begin(&req, bs, offset, bytes, false);
3137 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3138 use_local_qiov ? &local_qiov : qiov,
3139 flags);
3140 tracked_request_end(&req);
3142 if (use_local_qiov) {
3143 qemu_iovec_destroy(&local_qiov);
3144 qemu_vfree(head_buf);
3145 qemu_vfree(tail_buf);
3148 return ret;
3151 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3152 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3153 BdrvRequestFlags flags)
3155 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3156 return -EINVAL;
3159 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3160 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3163 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3164 int nb_sectors, QEMUIOVector *qiov)
3166 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3168 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3171 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3172 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3174 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3176 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3177 BDRV_REQ_COPY_ON_READ);
3180 /* If no limit is specified in the BlockLimits, use a default
3181 * of 32768 512-byte sectors (16 MiB) per request.
3183 #define MAX_WRITE_ZEROES_DEFAULT 32768
3185 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3186 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3188 BlockDriver *drv = bs->drv;
3189 QEMUIOVector qiov;
3190 struct iovec iov = {0};
3191 int ret = 0;
3193 int max_write_zeroes = bs->bl.max_write_zeroes ?
3194 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3196 while (nb_sectors > 0 && !ret) {
3197 int num = nb_sectors;
3199 /* Align request. Block drivers can expect the "bulk" of the request
3200 * to be aligned.
3202 if (bs->bl.write_zeroes_alignment
3203 && num > bs->bl.write_zeroes_alignment) {
3204 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3205 /* Make a small request up to the first aligned sector. */
3206 num = bs->bl.write_zeroes_alignment;
3207 num -= sector_num % bs->bl.write_zeroes_alignment;
3208 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3209 /* Shorten the request to the last aligned sector. num cannot
3210 * underflow because num > bs->bl.write_zeroes_alignment.
3212 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3216 /* limit request size */
3217 if (num > max_write_zeroes) {
3218 num = max_write_zeroes;
3221 ret = -ENOTSUP;
3222 /* First try the efficient write zeroes operation */
3223 if (drv->bdrv_co_write_zeroes) {
3224 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3227 if (ret == -ENOTSUP) {
3228 /* Fall back to bounce buffer if write zeroes is unsupported */
3229 iov.iov_len = num * BDRV_SECTOR_SIZE;
3230 if (iov.iov_base == NULL) {
3231 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3232 if (iov.iov_base == NULL) {
3233 ret = -ENOMEM;
3234 goto fail;
3236 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3238 qemu_iovec_init_external(&qiov, &iov, 1);
3240 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3242 /* Keep the bounce buffer around if it is big enough for all
3243 * future requests.
3245 if (num < max_write_zeroes) {
3246 qemu_vfree(iov.iov_base);
3247 iov.iov_base = NULL;
3251 sector_num += num;
3252 nb_sectors -= num;
3255 fail:
3256 qemu_vfree(iov.iov_base);
3257 return ret;
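/* Worked example (illustration only): with write_zeroes_alignment = 8, a
 * request for sectors [5, 25) is split into a 3-sector head [5, 8), an
 * aligned middle [8, 24) and a 1-sector tail [24, 25), so the driver sees
 * the bulk of the request aligned to its preferred boundary. */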
3261 * Forwards an already correctly aligned write request to the BlockDriver.
3263 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3264 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3265 QEMUIOVector *qiov, int flags)
3267 BlockDriver *drv = bs->drv;
3268 bool waited;
3269 int ret;
3271 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3272 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3274 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3275 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3276 assert(!qiov || bytes == qiov->size);
3278 waited = wait_serialising_requests(req);
3279 assert(!waited || !req->serialising);
3280 assert(req->overlap_offset <= offset);
3281 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3283 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3285 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3286 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3287 qemu_iovec_is_zero(qiov)) {
3288 flags |= BDRV_REQ_ZERO_WRITE;
3289 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3290 flags |= BDRV_REQ_MAY_UNMAP;
3294 if (ret < 0) {
3295 /* Do nothing, write notifier decided to fail this request */
3296 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3297 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3298 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3299 } else {
3300 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3301 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3303 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3305 if (ret == 0 && !bs->enable_write_cache) {
3306 ret = bdrv_co_flush(bs);
3309 bdrv_set_dirty(bs, sector_num, nb_sectors);
3311 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3313 if (bs->growable && ret >= 0) {
3314 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3317 return ret;
3321 * Handle a write request in coroutine context
3323 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3324 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3325 BdrvRequestFlags flags)
3327 BdrvTrackedRequest req;
3328 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3329 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3330 uint8_t *head_buf = NULL;
3331 uint8_t *tail_buf = NULL;
3332 QEMUIOVector local_qiov;
3333 bool use_local_qiov = false;
3334 int ret;
3336 if (!bs->drv) {
3337 return -ENOMEDIUM;
3339 if (bs->read_only) {
3340 return -EACCES;
3342 if (bdrv_check_byte_request(bs, offset, bytes)) {
3343 return -EIO;
3346 /* throttling disk I/O */
3347 if (bs->io_limits_enabled) {
3348 bdrv_io_limits_intercept(bs, bytes, true);
3352 * Align write if necessary by performing a read-modify-write cycle.
3353 * Pad qiov with the read parts and be sure to have a tracked request not
3354 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3356 tracked_request_begin(&req, bs, offset, bytes, true);
3358 if (offset & (align - 1)) {
3359 QEMUIOVector head_qiov;
3360 struct iovec head_iov;
3362 mark_request_serialising(&req, align);
3363 wait_serialising_requests(&req);
3365 head_buf = qemu_blockalign(bs, align);
3366 head_iov = (struct iovec) {
3367 .iov_base = head_buf,
3368 .iov_len = align,
3370 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3372 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3373 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3374 align, &head_qiov, 0);
3375 if (ret < 0) {
3376 goto fail;
3378 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3380 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3381 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3382 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3383 use_local_qiov = true;
3385 bytes += offset & (align - 1);
3386 offset = offset & ~(align - 1);
3389 if ((offset + bytes) & (align - 1)) {
3390 QEMUIOVector tail_qiov;
3391 struct iovec tail_iov;
3392 size_t tail_bytes;
3393 bool waited;
3395 mark_request_serialising(&req, align);
3396 waited = wait_serialising_requests(&req);
3397 assert(!waited || !use_local_qiov);
3399 tail_buf = qemu_blockalign(bs, align);
3400 tail_iov = (struct iovec) {
3401 .iov_base = tail_buf,
3402 .iov_len = align,
3404 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3406 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3407 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3408 align, &tail_qiov, 0);
3409 if (ret < 0) {
3410 goto fail;
3412 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3414 if (!use_local_qiov) {
3415 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3416 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3417 use_local_qiov = true;
3420 tail_bytes = (offset + bytes) & (align - 1);
3421 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3423 bytes = ROUND_UP(bytes, align);
3426 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3427 use_local_qiov ? &local_qiov : qiov,
3428 flags);
3430 fail:
3431 tracked_request_end(&req);
3433 if (use_local_qiov) {
3434 qemu_iovec_destroy(&local_qiov);
3436 qemu_vfree(head_buf);
3437 qemu_vfree(tail_buf);
3439 return ret;
3442 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3443 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3444 BdrvRequestFlags flags)
3446 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3447 return -EINVAL;
3450 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3451 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3454 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3455 int nb_sectors, QEMUIOVector *qiov)
3457 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3459 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3462 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3463 int64_t sector_num, int nb_sectors,
3464 BdrvRequestFlags flags)
3466 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3468 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3469 flags &= ~BDRV_REQ_MAY_UNMAP;
3472 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3473 BDRV_REQ_ZERO_WRITE | flags);
3477 * Truncate file to 'offset' bytes (needed only for file protocols)
3479 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3481 BlockDriver *drv = bs->drv;
3482 int ret;
3483 if (!drv)
3484 return -ENOMEDIUM;
3485 if (!drv->bdrv_truncate)
3486 return -ENOTSUP;
3487 if (bs->read_only)
3488 return -EACCES;
3490 ret = drv->bdrv_truncate(bs, offset);
3491 if (ret == 0) {
3492 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3493 if (bs->blk) {
3494 blk_dev_resize_cb(bs->blk);
3497 return ret;
3501 * Length of an allocated file in bytes. Sparse files are counted by their
3502 * actual allocated space. Returns < 0 on error or if unknown.
3504 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3506 BlockDriver *drv = bs->drv;
3507 if (!drv) {
3508 return -ENOMEDIUM;
3510 if (drv->bdrv_get_allocated_file_size) {
3511 return drv->bdrv_get_allocated_file_size(bs);
3513 if (bs->file) {
3514 return bdrv_get_allocated_file_size(bs->file);
3516 return -ENOTSUP;
3520 * Return number of sectors on success, -errno on error.
3522 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3524 BlockDriver *drv = bs->drv;
3526 if (!drv)
3527 return -ENOMEDIUM;
3529 if (drv->has_variable_length) {
3530 int ret = refresh_total_sectors(bs, bs->total_sectors);
3531 if (ret < 0) {
3532 return ret;
3535 return bs->total_sectors;
3539 * Return length in bytes on success, -errno on error.
3540 * The length is always a multiple of BDRV_SECTOR_SIZE.
3542 int64_t bdrv_getlength(BlockDriverState *bs)
3544 int64_t ret = bdrv_nb_sectors(bs);
3546 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3549 /* return 0 as number of sectors if no device present or error */
3550 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3552 int64_t nb_sectors = bdrv_nb_sectors(bs);
3554 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3557 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3558 BlockdevOnError on_write_error)
3560 bs->on_read_error = on_read_error;
3561 bs->on_write_error = on_write_error;
3564 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3566 return is_read ? bs->on_read_error : bs->on_write_error;
3569 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3571 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3573 switch (on_err) {
3574 case BLOCKDEV_ON_ERROR_ENOSPC:
3575 return (error == ENOSPC) ?
3576 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3577 case BLOCKDEV_ON_ERROR_STOP:
3578 return BLOCK_ERROR_ACTION_STOP;
3579 case BLOCKDEV_ON_ERROR_REPORT:
3580 return BLOCK_ERROR_ACTION_REPORT;
3581 case BLOCKDEV_ON_ERROR_IGNORE:
3582 return BLOCK_ERROR_ACTION_IGNORE;
3583 default:
3584 abort();
3588 static void send_qmp_error_event(BlockDriverState *bs,
3589 BlockErrorAction action,
3590 bool is_read, int error)
3592 IoOperationType optype;
3594 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3595 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3596 bdrv_iostatus_is_enabled(bs),
3597 error == ENOSPC, strerror(error),
3598 &error_abort);
3601 /* This is done by device models because, while the block layer knows
3602 * about the error, it does not know whether an operation comes from
3603 * the device or the block layer (from a job, for example).
3605 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3606 bool is_read, int error)
3608 assert(error >= 0);
3610 if (action == BLOCK_ERROR_ACTION_STOP) {
3611 /* First set the iostatus, so that "info block" returns an iostatus
3612 * that matches the events raised so far (an additional error iostatus
3613 * is fine, but not a lost one).
3615 bdrv_iostatus_set_err(bs, error);
3617 /* Then raise the request to stop the VM and the event.
3618 * qemu_system_vmstop_request_prepare has two effects. First,
3619 * it ensures that the STOP event always comes after the
3620 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3621 * can observe the STOP event and do a "cont" before the STOP
3622 * event is issued, the VM will not stop. In this case, vm_start()
3623 * also ensures that the STOP/RESUME pair of events is emitted.
3625 qemu_system_vmstop_request_prepare();
3626 send_qmp_error_event(bs, action, is_read, error);
3627 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3628 } else {
3629 send_qmp_error_event(bs, action, is_read, error);
3633 int bdrv_is_read_only(BlockDriverState *bs)
3635 return bs->read_only;
3638 int bdrv_is_sg(BlockDriverState *bs)
3640 return bs->sg;
3643 int bdrv_enable_write_cache(BlockDriverState *bs)
3645 return bs->enable_write_cache;
3648 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3650 bs->enable_write_cache = wce;
3652 /* so a reopen() will preserve wce */
3653 if (wce) {
3654 bs->open_flags |= BDRV_O_CACHE_WB;
3655 } else {
3656 bs->open_flags &= ~BDRV_O_CACHE_WB;
3660 int bdrv_is_encrypted(BlockDriverState *bs)
3662 if (bs->backing_hd && bs->backing_hd->encrypted)
3663 return 1;
3664 return bs->encrypted;
3667 int bdrv_key_required(BlockDriverState *bs)
3669 BlockDriverState *backing_hd = bs->backing_hd;
3671 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3672 return 1;
3673 return (bs->encrypted && !bs->valid_key);
3676 int bdrv_set_key(BlockDriverState *bs, const char *key)
3678 int ret;
3679 if (bs->backing_hd && bs->backing_hd->encrypted) {
3680 ret = bdrv_set_key(bs->backing_hd, key);
3681 if (ret < 0)
3682 return ret;
3683 if (!bs->encrypted)
3684 return 0;
3686 if (!bs->encrypted) {
3687 return -EINVAL;
3688 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3689 return -ENOMEDIUM;
3691 ret = bs->drv->bdrv_set_key(bs, key);
3692 if (ret < 0) {
3693 bs->valid_key = 0;
3694 } else if (!bs->valid_key) {
3695 bs->valid_key = 1;
3696 if (bs->blk) {
3697 /* call the change callback now, we skipped it on open */
3698 blk_dev_change_media_cb(bs->blk, true);
3701 return ret;
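/* Illustrative sketch (not in the original source): supplying a passphrase
 * for an encrypted image before first use. The helper name and password
 * handling are assumptions. */
static int example_unlock_image(BlockDriverState *bs, const char *password)
{
    if (!bdrv_key_required(bs)) {
        return 0;    /* nothing to do, the image (chain) is already usable */
    }
    /* Recurses into an encrypted backing file first, then unlocks bs. */
    return bdrv_set_key(bs, password);
}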
3704 const char *bdrv_get_format_name(BlockDriverState *bs)
3706 return bs->drv ? bs->drv->format_name : NULL;
3709 static int qsort_strcmp(const void *a, const void *b)
3711 return strcmp(a, b);
3714 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3715 void *opaque)
3717 BlockDriver *drv;
3718 int count = 0;
3719 int i;
3720 const char **formats = NULL;
3722 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3723 if (drv->format_name) {
3724 bool found = false;
3725 int i = count;
3726 while (formats && i && !found) {
3727 found = !strcmp(formats[--i], drv->format_name);
3730 if (!found) {
3731 formats = g_renew(const char *, formats, count + 1);
3732 formats[count++] = drv->format_name;
3737 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3739 for (i = 0; i < count; i++) {
3740 it(opaque, formats[i]);
3743 g_free(formats);
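/* Illustrative sketch (not in the original source): printing the sorted,
 * deduplicated list of registered format names via bdrv_iterate_format(). */
static void example_print_format(void *opaque, const char *name)
{
    printf("%s\n", name);
}

static void example_list_formats(void)
{
    bdrv_iterate_format(example_print_format, NULL);
}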
3746 /* Find the BlockDriverState attached to the named block backend */
3747 /* TODO convert callers to blk_by_name(), then remove */
3748 BlockDriverState *bdrv_find(const char *name)
3750 BlockBackend *blk = blk_by_name(name);
3752 return blk ? blk_bs(blk) : NULL;
3755 /* Find a node in the graph of BDSes by its node name */
3756 BlockDriverState *bdrv_find_node(const char *node_name)
3758 BlockDriverState *bs;
3760 assert(node_name);
3762 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3763 if (!strcmp(node_name, bs->node_name)) {
3764 return bs;
3767 return NULL;
3770 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3771 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3773 BlockDeviceInfoList *list, *entry;
3774 BlockDriverState *bs;
3776 list = NULL;
3777 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3778 entry = g_malloc0(sizeof(*entry));
3779 entry->value = bdrv_block_device_info(bs);
3780 entry->next = list;
3781 list = entry;
3784 return list;
3787 BlockDriverState *bdrv_lookup_bs(const char *device,
3788 const char *node_name,
3789 Error **errp)
3791 BlockBackend *blk;
3792 BlockDriverState *bs;
3794 if (device) {
3795 blk = blk_by_name(device);
3797 if (blk) {
3798 return blk_bs(blk);
3802 if (node_name) {
3803 bs = bdrv_find_node(node_name);
3805 if (bs) {
3806 return bs;
3810 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3811 device ? device : "",
3812 node_name ? node_name : "");
3813 return NULL;
3816 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3817 * return false. If either argument is NULL, return false. */
3818 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3820 while (top && top != base) {
3821 top = top->backing_hd;
3824 return top != NULL;
3827 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3829 if (!bs) {
3830 return QTAILQ_FIRST(&graph_bdrv_states);
3832 return QTAILQ_NEXT(bs, node_list);
3835 BlockDriverState *bdrv_next(BlockDriverState *bs)
3837 if (!bs) {
3838 return QTAILQ_FIRST(&bdrv_states);
3840 return QTAILQ_NEXT(bs, device_list);
3843 const char *bdrv_get_node_name(const BlockDriverState *bs)
3845 return bs->node_name;
3848 /* TODO check what callers really want: bs->node_name or blk_name() */
3849 const char *bdrv_get_device_name(const BlockDriverState *bs)
3851 return bs->blk ? blk_name(bs->blk) : "";
3854 int bdrv_get_flags(BlockDriverState *bs)
3856 return bs->open_flags;
3859 int bdrv_flush_all(void)
3861 BlockDriverState *bs;
3862 int result = 0;
3864 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3865 AioContext *aio_context = bdrv_get_aio_context(bs);
3866 int ret;
3868 aio_context_acquire(aio_context);
3869 ret = bdrv_flush(bs);
3870 if (ret < 0 && !result) {
3871 result = ret;
3873 aio_context_release(aio_context);
3876 return result;
3879 int bdrv_has_zero_init_1(BlockDriverState *bs)
3881 return 1;
3884 int bdrv_has_zero_init(BlockDriverState *bs)
3886 assert(bs->drv);
3888 /* If BS is a copy-on-write image, it is initialized to
3889 the contents of the base image, which may not be zeroes. */
3890 if (bs->backing_hd) {
3891 return 0;
3893 if (bs->drv->bdrv_has_zero_init) {
3894 return bs->drv->bdrv_has_zero_init(bs);
3897 /* safe default */
3898 return 0;
3901 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3903 BlockDriverInfo bdi;
3905 if (bs->backing_hd) {
3906 return false;
3909 if (bdrv_get_info(bs, &bdi) == 0) {
3910 return bdi.unallocated_blocks_are_zero;
3913 return false;
3916 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3918 BlockDriverInfo bdi;
3920 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3921 return false;
3924 if (bdrv_get_info(bs, &bdi) == 0) {
3925 return bdi.can_write_zeroes_with_unmap;
3928 return false;
3931 typedef struct BdrvCoGetBlockStatusData {
3932 BlockDriverState *bs;
3933 BlockDriverState *base;
3934 int64_t sector_num;
3935 int nb_sectors;
3936 int *pnum;
3937 int64_t ret;
3938 bool done;
3939 } BdrvCoGetBlockStatusData;
3942 * Returns the allocation status of the specified sectors.
3943 * Drivers not implementing the functionality are assumed to not support
3944 * backing files, hence all their sectors are reported as allocated.
3946 * If 'sector_num' is beyond the end of the disk image the return value is 0
3947 * and 'pnum' is set to 0.
3949 * 'pnum' is set to the number of sectors (including and immediately following
3950 * the specified sector) that are known to be in the same
3951 * allocated/unallocated state.
3953 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3954 * beyond the end of the disk image it will be clamped.
3956 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3957 int64_t sector_num,
3958 int nb_sectors, int *pnum)
3960 int64_t total_sectors;
3961 int64_t n;
3962 int64_t ret, ret2;
3964 total_sectors = bdrv_nb_sectors(bs);
3965 if (total_sectors < 0) {
3966 return total_sectors;
3969 if (sector_num >= total_sectors) {
3970 *pnum = 0;
3971 return 0;
3974 n = total_sectors - sector_num;
3975 if (n < nb_sectors) {
3976 nb_sectors = n;
3979 if (!bs->drv->bdrv_co_get_block_status) {
3980 *pnum = nb_sectors;
3981 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3982 if (bs->drv->protocol_name) {
3983 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3985 return ret;
3988 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3989 if (ret < 0) {
3990 *pnum = 0;
3991 return ret;
3994 if (ret & BDRV_BLOCK_RAW) {
3995 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3996 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3997 *pnum, pnum);
4000 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4001 ret |= BDRV_BLOCK_ALLOCATED;
4004 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4005 if (bdrv_unallocated_blocks_are_zero(bs)) {
4006 ret |= BDRV_BLOCK_ZERO;
4007 } else if (bs->backing_hd) {
4008 BlockDriverState *bs2 = bs->backing_hd;
4009 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4010 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4011 ret |= BDRV_BLOCK_ZERO;
4016 if (bs->file &&
4017 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4018 (ret & BDRV_BLOCK_OFFSET_VALID)) {
4019 int file_pnum;
4021 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4022 *pnum, &file_pnum);
4023 if (ret2 >= 0) {
4024 /* Ignore errors. This just provides extra information; it
4025 * is useful but not necessary.
4027 if (!file_pnum) {
4028 /* !file_pnum indicates an offset at or beyond the EOF; it is
4029 * perfectly valid for the format block driver to point to such
4030 * offsets, so catch it and mark everything as zero */
4031 ret |= BDRV_BLOCK_ZERO;
4032 } else {
4033 /* Limit request to the range reported by the protocol driver */
4034 *pnum = file_pnum;
4035 ret |= (ret2 & BDRV_BLOCK_ZERO);
4040 return ret;
4043 /* Coroutine wrapper for bdrv_get_block_status() */
4044 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4046 BdrvCoGetBlockStatusData *data = opaque;
4047 BlockDriverState *bs = data->bs;
4049 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4050 data->pnum);
4051 data->done = true;
4055 * Synchronous wrapper around bdrv_co_get_block_status().
4057 * See bdrv_co_get_block_status() for details.
4059 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4060 int nb_sectors, int *pnum)
4062 Coroutine *co;
4063 BdrvCoGetBlockStatusData data = {
4064 .bs = bs,
4065 .sector_num = sector_num,
4066 .nb_sectors = nb_sectors,
4067 .pnum = pnum,
4068 .done = false,
4071 if (qemu_in_coroutine()) {
4072 /* Fast-path if already in coroutine context */
4073 bdrv_get_block_status_co_entry(&data);
4074 } else {
4075 AioContext *aio_context = bdrv_get_aio_context(bs);
4077 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4078 qemu_coroutine_enter(co, &data);
4079 while (!data.done) {
4080 aio_poll(aio_context, true);
4083 return data.ret;
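/* Illustrative sketch (not in the original source): walking a device's
 * allocation map with the synchronous wrapper, mirroring the loop shape
 * used by bdrv_make_zero() above. The helper name is an assumption. */
static int64_t example_count_zero_sectors(BlockDriverState *bs)
{
    int64_t total = bdrv_nb_sectors(bs);
    int64_t sector_num = 0, zeroes = 0;
    int n;

    if (total < 0) {
        return total;
    }
    while (sector_num < total) {
        int64_t ret = bdrv_get_block_status(bs, sector_num,
                                            MIN(total - sector_num, INT_MAX),
                                            &n);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            zeroes += n;
        }
        sector_num += n;
    }
    return zeroes;
}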
4086 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4087 int nb_sectors, int *pnum)
4089 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4090 if (ret < 0) {
4091 return ret;
4093 return !!(ret & BDRV_BLOCK_ALLOCATED);
4097 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4099 * Return true if the given sector is allocated in any image between
4100 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4101 * sector is allocated in any image of the chain. Return false otherwise.
4103 * 'pnum' is set to the number of sectors (including and immediately following
4104 * the specified sector) that are known to be in the same
4105 * allocated/unallocated state.
4108 int bdrv_is_allocated_above(BlockDriverState *top,
4109 BlockDriverState *base,
4110 int64_t sector_num,
4111 int nb_sectors, int *pnum)
4113 BlockDriverState *intermediate;
4114 int ret, n = nb_sectors;
4116 intermediate = top;
4117 while (intermediate && intermediate != base) {
4118 int pnum_inter;
4119 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4120 &pnum_inter);
4121 if (ret < 0) {
4122 return ret;
4123 } else if (ret) {
4124 *pnum = pnum_inter;
4125 return 1;
4129 * [sector_num, sector_num + nb_sectors) is unallocated on top, but an
4130 * intermediate image might have
4132 * [sector_num + x, sector_num + nb_sectors) allocated.
4134 if (n > pnum_inter &&
4135 (intermediate == top ||
4136 sector_num + pnum_inter < intermediate->total_sectors)) {
4137 n = pnum_inter;
4140 intermediate = intermediate->backing_hd;
4143 *pnum = n;
4144 return 0;
4147 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4149 if (bs->backing_hd && bs->backing_hd->encrypted)
4150 return bs->backing_file;
4151 else if (bs->encrypted)
4152 return bs->filename;
4153 else
4154 return NULL;
4157 void bdrv_get_backing_filename(BlockDriverState *bs,
4158 char *filename, int filename_size)
4160 pstrcpy(filename, filename_size, bs->backing_file);
4163 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4164 const uint8_t *buf, int nb_sectors)
4166 BlockDriver *drv = bs->drv;
4167 if (!drv)
4168 return -ENOMEDIUM;
4169 if (!drv->bdrv_write_compressed)
4170 return -ENOTSUP;
4171 if (bdrv_check_request(bs, sector_num, nb_sectors))
4172 return -EIO;
4174 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4176 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4179 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4181 BlockDriver *drv = bs->drv;
4182 if (!drv)
4183 return -ENOMEDIUM;
4184 if (!drv->bdrv_get_info)
4185 return -ENOTSUP;
4186 memset(bdi, 0, sizeof(*bdi));
4187 return drv->bdrv_get_info(bs, bdi);
4190 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4192 BlockDriver *drv = bs->drv;
4193 if (drv && drv->bdrv_get_specific_info) {
4194 return drv->bdrv_get_specific_info(bs);
4196 return NULL;
4199 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4200 int64_t pos, int size)
4202 QEMUIOVector qiov;
4203 struct iovec iov = {
4204 .iov_base = (void *) buf,
4205 .iov_len = size,
4208 qemu_iovec_init_external(&qiov, &iov, 1);
4209 return bdrv_writev_vmstate(bs, &qiov, pos);
4212 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4214 BlockDriver *drv = bs->drv;
4216 if (!drv) {
4217 return -ENOMEDIUM;
4218 } else if (drv->bdrv_save_vmstate) {
4219 return drv->bdrv_save_vmstate(bs, qiov, pos);
4220 } else if (bs->file) {
4221 return bdrv_writev_vmstate(bs->file, qiov, pos);
4224 return -ENOTSUP;
4227 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4228 int64_t pos, int size)
4230 BlockDriver *drv = bs->drv;
4231 if (!drv)
4232 return -ENOMEDIUM;
4233 if (drv->bdrv_load_vmstate)
4234 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4235 if (bs->file)
4236 return bdrv_load_vmstate(bs->file, buf, pos, size);
4237 return -ENOTSUP;
4240 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4242 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4243 return;
4246 bs->drv->bdrv_debug_event(bs, event);
4249 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4250 const char *tag)
4252 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4253 bs = bs->file;
4256 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4257 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4260 return -ENOTSUP;
4263 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4265 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4266 bs = bs->file;
4269 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4270 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4273 return -ENOTSUP;
4276 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4278 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4279 bs = bs->file;
4282 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4283 return bs->drv->bdrv_debug_resume(bs, tag);
4286 return -ENOTSUP;
4289 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4291 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4292 bs = bs->file;
4295 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4296 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4299 return false;
4302 int bdrv_is_snapshot(BlockDriverState *bs)
4304 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4307 /* backing_file can either be relative, or absolute, or a protocol. If it is
4308 * relative, it must be relative to the chain. So, passing in bs->filename
4309 * from a BDS as backing_file should not be done, as that may be relative to
4310 * the CWD rather than the chain. */
4311 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4312 const char *backing_file)
4314 char *filename_full = NULL;
4315 char *backing_file_full = NULL;
4316 char *filename_tmp = NULL;
4317 int is_protocol = 0;
4318 BlockDriverState *curr_bs = NULL;
4319 BlockDriverState *retval = NULL;
4321 if (!bs || !bs->drv || !backing_file) {
4322 return NULL;
4325 filename_full = g_malloc(PATH_MAX);
4326 backing_file_full = g_malloc(PATH_MAX);
4327 filename_tmp = g_malloc(PATH_MAX);
4329 is_protocol = path_has_protocol(backing_file);
4331 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4333 /* If either of the filename paths is actually a protocol, then
4334 * compare unmodified paths; otherwise make paths relative */
4335 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4336 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4337 retval = curr_bs->backing_hd;
4338 break;
4340 } else {
4341 /* If not an absolute filename path, make it relative to the current
4342 * image's filename path */
4343 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4344 backing_file);
4346 /* We are going to compare absolute pathnames */
4347 if (!realpath(filename_tmp, filename_full)) {
4348 continue;
4351 /* We need to make sure the backing filename we are comparing against
4352 * is relative to the current image filename (or absolute) */
4353 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4354 curr_bs->backing_file);
4356 if (!realpath(filename_tmp, backing_file_full)) {
4357 continue;
4360 if (strcmp(backing_file_full, filename_full) == 0) {
4361 retval = curr_bs->backing_hd;
4362 break;
4367 g_free(filename_full);
4368 g_free(backing_file_full);
4369 g_free(filename_tmp);
4370 return retval;
4373 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4375 if (!bs->drv) {
4376 return 0;
4379 if (!bs->backing_hd) {
4380 return 0;
4383 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4386 /**************************************************************/
4387 /* async I/Os */
4389 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4390 QEMUIOVector *qiov, int nb_sectors,
4391 BlockCompletionFunc *cb, void *opaque)
4393 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4395 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4396 cb, opaque, false);
4399 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4400 QEMUIOVector *qiov, int nb_sectors,
4401 BlockCompletionFunc *cb, void *opaque)
4403 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4405 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4406 cb, opaque, true);
4409 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4410 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4411 BlockCompletionFunc *cb, void *opaque)
4413 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4415 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4416 BDRV_REQ_ZERO_WRITE | flags,
4417 cb, opaque, true);
4421 typedef struct MultiwriteCB {
4422 int error;
4423 int num_requests;
4424 int num_callbacks;
4425 struct {
4426 BlockCompletionFunc *cb;
4427 void *opaque;
4428 QEMUIOVector *free_qiov;
4429 } callbacks[];
4430 } MultiwriteCB;
4432 static void multiwrite_user_cb(MultiwriteCB *mcb)
4434 int i;
4436 for (i = 0; i < mcb->num_callbacks; i++) {
4437 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4438 if (mcb->callbacks[i].free_qiov) {
4439 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4441 g_free(mcb->callbacks[i].free_qiov);
4445 static void multiwrite_cb(void *opaque, int ret)
4447 MultiwriteCB *mcb = opaque;
4449 trace_multiwrite_cb(mcb, ret);
4451 if (ret < 0 && !mcb->error) {
4452 mcb->error = ret;
4455 mcb->num_requests--;
4456 if (mcb->num_requests == 0) {
4457 multiwrite_user_cb(mcb);
4458 g_free(mcb);
4462 static int multiwrite_req_compare(const void *a, const void *b)
4464 const BlockRequest *req1 = a, *req2 = b;
4467 * Note that we can't simply subtract req2->sector from req1->sector
4468 * here as that could overflow the return value.
4470 if (req1->sector > req2->sector) {
4471 return 1;
4472 } else if (req1->sector < req2->sector) {
4473 return -1;
4474 } else {
4475 return 0;
4480 * Takes a bunch of requests and tries to merge them. Returns the number of
4481 * requests that remain after merging.
4483 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4484 int num_reqs, MultiwriteCB *mcb)
4486 int i, outidx;
4488 // Sort requests by start sector
4489 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4491 // Check if adjacent requests touch the same clusters. If so, combine them,
4492 // filling up gaps with zero sectors.
4493 outidx = 0;
4494 for (i = 1; i < num_reqs; i++) {
4495 int merge = 0;
4496 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4498 // Handle exactly sequential writes and overlapping writes.
4499 if (reqs[i].sector <= oldreq_last) {
4500 merge = 1;
4503 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4504 merge = 0;
4507 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4508 reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4509 merge = 0;
4512 if (merge) {
4513 size_t size;
4514 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4515 qemu_iovec_init(qiov,
4516 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4518 // Add the first request to the merged one. If the requests are
4519 // overlapping, drop the last sectors of the first request.
4520 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4521 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4523 // We shouldn't need to add any zeros between the two requests
4524 assert(reqs[i].sector <= oldreq_last);
4526 // Add the second request
4527 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4529 // Add tail of first request, if necessary
4530 if (qiov->size < reqs[outidx].qiov->size) {
4531 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4532 reqs[outidx].qiov->size - qiov->size);
4535 reqs[outidx].nb_sectors = qiov->size >> 9;
4536 reqs[outidx].qiov = qiov;
4538 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4539 } else {
4540 outidx++;
4541 reqs[outidx].sector = reqs[i].sector;
4542 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4543 reqs[outidx].qiov = reqs[i].qiov;
4547 return outidx + 1;
4551 * Submit multiple AIO write requests at once.
4553 * On success, the function returns 0 and all requests in the reqs array have
4554 * been submitted. In the error case this function returns -1, and any of the
4555 * requests may or may not have been submitted yet. In particular, this means
4556 * that the callback will be called for some of the requests but not for others.
4557 * The caller must check the error field of each BlockRequest to know which
4558 * callbacks to wait for (if error != 0, no callback will be called).
4560 * The implementation may modify the contents of the reqs array, e.g. to merge
4561 * requests. However, the fields opaque and error are left unmodified as they
4562 * are used to signal failure for a single request to the caller.
4564 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4566 MultiwriteCB *mcb;
4567 int i;
4569 /* don't submit writes if we don't have a medium */
4570 if (bs->drv == NULL) {
4571 for (i = 0; i < num_reqs; i++) {
4572 reqs[i].error = -ENOMEDIUM;
4574 return -1;
4577 if (num_reqs == 0) {
4578 return 0;
4581 // Create MultiwriteCB structure
4582 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4583 mcb->num_requests = 0;
4584 mcb->num_callbacks = num_reqs;
4586 for (i = 0; i < num_reqs; i++) {
4587 mcb->callbacks[i].cb = reqs[i].cb;
4588 mcb->callbacks[i].opaque = reqs[i].opaque;
4591 // Check for mergeable requests
4592 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4594 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4596 /* Run the aio requests. */
4597 mcb->num_requests = num_reqs;
4598 for (i = 0; i < num_reqs; i++) {
4599 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4600 reqs[i].nb_sectors, reqs[i].flags,
4601 multiwrite_cb, mcb,
4602 true);
4605 return 0;
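/* Usage sketch (illustrative, not part of the original file): how a caller
 * might batch two writes through bdrv_aio_multiwrite(). The callback and the
 * sector numbers are hypothetical; per the contract above, if the call
 * returns -1, only requests with reqs[i].error == 0 will still get their
 * callback. */
#if 0
static void example_write_done(void *opaque, int ret)
{
    /* called once per request that was actually submitted */
}

static int example_submit_pair(BlockDriverState *bs,
                               QEMUIOVector *qiov0, QEMUIOVector *qiov1)
{
    BlockRequest reqs[2] = {
        { .sector = 0,  .nb_sectors = qiov0->size >> 9, .qiov = qiov0,
          .cb = example_write_done, .opaque = NULL },
        { .sector = 16, .nb_sectors = qiov1->size >> 9, .qiov = qiov1,
          .cb = example_write_done, .opaque = NULL },
    };

    return bdrv_aio_multiwrite(bs, reqs, 2);
}
#endif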
4608 void bdrv_aio_cancel(BlockAIOCB *acb)
4610 qemu_aio_ref(acb);
4611 bdrv_aio_cancel_async(acb);
4612 while (acb->refcnt > 1) {
4613 if (acb->aiocb_info->get_aio_context) {
4614 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4615 } else if (acb->bs) {
4616 aio_poll(bdrv_get_aio_context(acb->bs), true);
4617 } else {
4618 abort();
4621 qemu_aio_unref(acb);
4624 /* Async version of aio cancel. The caller is not blocked if the acb implements
4625 * cancel_async; otherwise we do nothing and let the request complete normally.
4626 * In either case the completion callback must still be called. */
4627 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4629 if (acb->aiocb_info->cancel_async) {
4630 acb->aiocb_info->cancel_async(acb);
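/* Sketch (assumption, not from this file): a driver that supports
 * asynchronous cancellation provides cancel_async in its AIOCBInfo, which
 * bdrv_aio_cancel_async() above forwards to. ExampleAIOCB and its
 * cancelled flag are hypothetical. */
#if 0
typedef struct ExampleAIOCB {
    BlockAIOCB common;
    bool cancelled;              /* checked on the completion path */
} ExampleAIOCB;

static void example_aio_cancel_async(BlockAIOCB *blockacb)
{
    ExampleAIOCB *acb = container_of(blockacb, ExampleAIOCB, common);
    acb->cancelled = true;       /* completion later reports -ECANCELED */
}

static const AIOCBInfo example_aiocb_info = {
    .aiocb_size   = sizeof(ExampleAIOCB),
    .cancel_async = example_aio_cancel_async,
};
#endif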
4634 /**************************************************************/
4635 /* async block device emulation */
4637 typedef struct BlockAIOCBSync {
4638 BlockAIOCB common;
4639 QEMUBH *bh;
4640 int ret;
4641 /* vector translation state */
4642 QEMUIOVector *qiov;
4643 uint8_t *bounce;
4644 int is_write;
4645 } BlockAIOCBSync;
4647 static const AIOCBInfo bdrv_em_aiocb_info = {
4648 .aiocb_size = sizeof(BlockAIOCBSync),
4651 static void bdrv_aio_bh_cb(void *opaque)
4653 BlockAIOCBSync *acb = opaque;
4655 if (!acb->is_write && acb->ret >= 0) {
4656 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4658 qemu_vfree(acb->bounce);
4659 acb->common.cb(acb->common.opaque, acb->ret);
4660 qemu_bh_delete(acb->bh);
4661 acb->bh = NULL;
4662 qemu_aio_unref(acb);
4665 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4666 int64_t sector_num,
4667 QEMUIOVector *qiov,
4668 int nb_sectors,
4669 BlockCompletionFunc *cb,
4670 void *opaque,
4671 int is_write)
4674 BlockAIOCBSync *acb;
4676 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4677 acb->is_write = is_write;
4678 acb->qiov = qiov;
4679 acb->bounce = qemu_try_blockalign(bs, qiov->size);
4680 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4682 if (acb->bounce == NULL) {
4683 acb->ret = -ENOMEM;
4684 } else if (is_write) {
4685 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4686 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4687 } else {
4688 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4691 qemu_bh_schedule(acb->bh);
4693 return &acb->common;
4696 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4697 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4698 BlockCompletionFunc *cb, void *opaque)
4700 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4703 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4704 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4705 BlockCompletionFunc *cb, void *opaque)
4707 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4711 typedef struct BlockAIOCBCoroutine {
4712 BlockAIOCB common;
4713 BlockRequest req;
4714 bool is_write;
4715 bool *done;
4716 QEMUBH *bh;
4717 } BlockAIOCBCoroutine;
4719 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4720 .aiocb_size = sizeof(BlockAIOCBCoroutine),
4723 static void bdrv_co_em_bh(void *opaque)
4725 BlockAIOCBCoroutine *acb = opaque;
4727 acb->common.cb(acb->common.opaque, acb->req.error);
4729 qemu_bh_delete(acb->bh);
4730 qemu_aio_unref(acb);
4733 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4734 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4736 BlockAIOCBCoroutine *acb = opaque;
4737 BlockDriverState *bs = acb->common.bs;
4739 if (!acb->is_write) {
4740 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4741 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4742 } else {
4743 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4744 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4747 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4748 qemu_bh_schedule(acb->bh);
4751 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4752 int64_t sector_num,
4753 QEMUIOVector *qiov,
4754 int nb_sectors,
4755 BdrvRequestFlags flags,
4756 BlockCompletionFunc *cb,
4757 void *opaque,
4758 bool is_write)
4760 Coroutine *co;
4761 BlockAIOCBCoroutine *acb;
4763 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4764 acb->req.sector = sector_num;
4765 acb->req.nb_sectors = nb_sectors;
4766 acb->req.qiov = qiov;
4767 acb->req.flags = flags;
4768 acb->is_write = is_write;
4770 co = qemu_coroutine_create(bdrv_co_do_rw);
4771 qemu_coroutine_enter(co, acb);
4773 return &acb->common;
4776 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4778 BlockAIOCBCoroutine *acb = opaque;
4779 BlockDriverState *bs = acb->common.bs;
4781 acb->req.error = bdrv_co_flush(bs);
4782 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4783 qemu_bh_schedule(acb->bh);
4786 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4787 BlockCompletionFunc *cb, void *opaque)
4789 trace_bdrv_aio_flush(bs, opaque);
4791 Coroutine *co;
4792 BlockAIOCBCoroutine *acb;
4794 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4796 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4797 qemu_coroutine_enter(co, acb);
4799 return &acb->common;
4802 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4804 BlockAIOCBCoroutine *acb = opaque;
4805 BlockDriverState *bs = acb->common.bs;
4807 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4808 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4809 qemu_bh_schedule(acb->bh);
4812 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4813 int64_t sector_num, int nb_sectors,
4814 BlockCompletionFunc *cb, void *opaque)
4816 Coroutine *co;
4817 BlockAIOCBCoroutine *acb;
4819 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4821 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4822 acb->req.sector = sector_num;
4823 acb->req.nb_sectors = nb_sectors;
4824 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4825 qemu_coroutine_enter(co, acb);
4827 return &acb->common;
4830 void bdrv_init(void)
4832 module_call_init(MODULE_INIT_BLOCK);
4835 void bdrv_init_with_whitelist(void)
4837 use_bdrv_whitelist = 1;
4838 bdrv_init();
4841 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4842 BlockCompletionFunc *cb, void *opaque)
4844 BlockAIOCB *acb;
4846 acb = g_slice_alloc(aiocb_info->aiocb_size);
4847 acb->aiocb_info = aiocb_info;
4848 acb->bs = bs;
4849 acb->cb = cb;
4850 acb->opaque = opaque;
4851 acb->refcnt = 1;
4852 return acb;
4855 void qemu_aio_ref(void *p)
4857 BlockAIOCB *acb = p;
4858 acb->refcnt++;
4861 void qemu_aio_unref(void *p)
4863 BlockAIOCB *acb = p;
4864 assert(acb->refcnt > 0);
4865 if (--acb->refcnt == 0) {
4866 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
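/* Lifecycle note (illustrative): an ACB starts with refcnt == 1 from
 * qemu_aio_get(). bdrv_aio_cancel() takes an extra reference while it
 * polls, and the completion path drops the original one, so the slice is
 * freed exactly once, when the count reaches zero. */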
4870 /**************************************************************/
4871 /* Coroutine block device emulation */
4873 typedef struct CoroutineIOCompletion {
4874 Coroutine *coroutine;
4875 int ret;
4876 } CoroutineIOCompletion;
4878 static void bdrv_co_io_em_complete(void *opaque, int ret)
4880 CoroutineIOCompletion *co = opaque;
4882 co->ret = ret;
4883 qemu_coroutine_enter(co->coroutine, NULL);
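/* Pattern note (illustrative): a coroutine submits an AIO request with
 * bdrv_co_io_em_complete() as its callback and then yields. The callback
 * stores the return value and re-enters the coroutine, which resumes right
 * after the yield with co.ret valid. */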
4886 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4887 int nb_sectors, QEMUIOVector *iov,
4888 bool is_write)
4890 CoroutineIOCompletion co = {
4891 .coroutine = qemu_coroutine_self(),
4893 BlockAIOCB *acb;
4895 if (is_write) {
4896 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4897 bdrv_co_io_em_complete, &co);
4898 } else {
4899 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4900 bdrv_co_io_em_complete, &co);
4903 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4904 if (!acb) {
4905 return -EIO;
4907 qemu_coroutine_yield();
4909 return co.ret;
4912 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4913 int64_t sector_num, int nb_sectors,
4914 QEMUIOVector *iov)
4916 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4919 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4920 int64_t sector_num, int nb_sectors,
4921 QEMUIOVector *iov)
4923 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4926 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4928 RwCo *rwco = opaque;
4930 rwco->ret = bdrv_co_flush(rwco->bs);
4933 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4935 int ret;
4937 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4938 return 0;
4941 /* Write back cached data to the OS even with cache=unsafe */
4942 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4943 if (bs->drv->bdrv_co_flush_to_os) {
4944 ret = bs->drv->bdrv_co_flush_to_os(bs);
4945 if (ret < 0) {
4946 return ret;
4950 /* But don't actually force it to the disk with cache=unsafe */
4951 if (bs->open_flags & BDRV_O_NO_FLUSH) {
4952 goto flush_parent;
4955 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4956 if (bs->drv->bdrv_co_flush_to_disk) {
4957 ret = bs->drv->bdrv_co_flush_to_disk(bs);
4958 } else if (bs->drv->bdrv_aio_flush) {
4959 BlockAIOCB *acb;
4960 CoroutineIOCompletion co = {
4961 .coroutine = qemu_coroutine_self(),
4964 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4965 if (acb == NULL) {
4966 ret = -EIO;
4967 } else {
4968 qemu_coroutine_yield();
4969 ret = co.ret;
4971 } else {
4973 * Some block drivers always operate in either writethrough or unsafe
4974 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4975 * know how the server works (because the behaviour is hardcoded or
4976 * depends on server-side configuration), so we can't ensure that
4977 * everything is safe on disk. Returning an error doesn't work because
4978 * that would break guests even if the server operates in writethrough
4979 * mode.
4981 * Let's hope the user knows what they're doing.
4983 ret = 0;
4985 if (ret < 0) {
4986 return ret;
4989 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4990 * in the case of cache=unsafe, so there are no useless flushes.
4992 flush_parent:
4993 return bdrv_co_flush(bs->file);
4996 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4998 Error *local_err = NULL;
4999 int ret;
5001 if (!bs->drv) {
5002 return;
5005 if (!(bs->open_flags & BDRV_O_INCOMING)) {
5006 return;
5008 bs->open_flags &= ~BDRV_O_INCOMING;
5010 if (bs->drv->bdrv_invalidate_cache) {
5011 bs->drv->bdrv_invalidate_cache(bs, &local_err);
5012 } else if (bs->file) {
5013 bdrv_invalidate_cache(bs->file, &local_err);
5015 if (local_err) {
5016 error_propagate(errp, local_err);
5017 return;
5020 ret = refresh_total_sectors(bs, bs->total_sectors);
5021 if (ret < 0) {
5022 error_setg_errno(errp, -ret, "Could not refresh total sector count");
5023 return;
5027 void bdrv_invalidate_cache_all(Error **errp)
5029 BlockDriverState *bs;
5030 Error *local_err = NULL;
5032 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5033 AioContext *aio_context = bdrv_get_aio_context(bs);
5035 aio_context_acquire(aio_context);
5036 bdrv_invalidate_cache(bs, &local_err);
5037 aio_context_release(aio_context);
5038 if (local_err) {
5039 error_propagate(errp, local_err);
5040 return;
5045 int bdrv_flush(BlockDriverState *bs)
5047 Coroutine *co;
5048 RwCo rwco = {
5049 .bs = bs,
5050 .ret = NOT_DONE,
5053 if (qemu_in_coroutine()) {
5054 /* Fast-path if already in coroutine context */
5055 bdrv_flush_co_entry(&rwco);
5056 } else {
5057 AioContext *aio_context = bdrv_get_aio_context(bs);
5059 co = qemu_coroutine_create(bdrv_flush_co_entry);
5060 qemu_coroutine_enter(co, &rwco);
5061 while (rwco.ret == NOT_DONE) {
5062 aio_poll(aio_context, true);
5066 return rwco.ret;
5069 typedef struct DiscardCo {
5070 BlockDriverState *bs;
5071 int64_t sector_num;
5072 int nb_sectors;
5073 int ret;
5074 } DiscardCo;
5075 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5077 DiscardCo *rwco = opaque;
5079 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5082 /* If no limit is specified in the BlockLimits, use a default
5083 * of 32768 512-byte sectors (16 MiB) per request.
5085 #define MAX_DISCARD_DEFAULT 32768
5087 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5088 int nb_sectors)
5090 int max_discard;
5092 if (!bs->drv) {
5093 return -ENOMEDIUM;
5094 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5095 return -EIO;
5096 } else if (bs->read_only) {
5097 return -EROFS;
5100 bdrv_reset_dirty(bs, sector_num, nb_sectors);
5102 /* Do nothing if disabled. */
5103 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5104 return 0;
5107 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5108 return 0;
5111 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5112 while (nb_sectors > 0) {
5113 int ret;
5114 int num = nb_sectors;
5116 /* align request */
5117 if (bs->bl.discard_alignment &&
5118 num >= bs->bl.discard_alignment &&
5119 sector_num % bs->bl.discard_alignment) {
5120 if (num > bs->bl.discard_alignment) {
5121 num = bs->bl.discard_alignment;
5123 num -= sector_num % bs->bl.discard_alignment;
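/* Worked example (illustrative): with discard_alignment == 8,
 * sector_num == 10 and nb_sectors == 32, num starts at 32, is first
 * capped to 8 and then reduced by 10 % 8 == 2, so the first chunk is
 * 6 sectors and the next iteration starts at the aligned sector 16. */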
5126 /* limit request size */
5127 if (num > max_discard) {
5128 num = max_discard;
5131 if (bs->drv->bdrv_co_discard) {
5132 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5133 } else {
5134 BlockAIOCB *acb;
5135 CoroutineIOCompletion co = {
5136 .coroutine = qemu_coroutine_self(),
5139 acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5140 bdrv_co_io_em_complete, &co);
5141 if (acb == NULL) {
5142 return -EIO;
5143 } else {
5144 qemu_coroutine_yield();
5145 ret = co.ret;
5148 if (ret && ret != -ENOTSUP) {
5149 return ret;
5152 sector_num += num;
5153 nb_sectors -= num;
5155 return 0;
5158 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5160 Coroutine *co;
5161 DiscardCo rwco = {
5162 .bs = bs,
5163 .sector_num = sector_num,
5164 .nb_sectors = nb_sectors,
5165 .ret = NOT_DONE,
5168 if (qemu_in_coroutine()) {
5169 /* Fast-path if already in coroutine context */
5170 bdrv_discard_co_entry(&rwco);
5171 } else {
5172 AioContext *aio_context = bdrv_get_aio_context(bs);
5174 co = qemu_coroutine_create(bdrv_discard_co_entry);
5175 qemu_coroutine_enter(co, &rwco);
5176 while (rwco.ret == NOT_DONE) {
5177 aio_poll(aio_context, true);
5181 return rwco.ret;
5184 /**************************************************************/
5185 /* removable device support */
5188 * Return TRUE if the media is present
5190 int bdrv_is_inserted(BlockDriverState *bs)
5192 BlockDriver *drv = bs->drv;
5194 if (!drv)
5195 return 0;
5196 if (!drv->bdrv_is_inserted)
5197 return 1;
5198 return drv->bdrv_is_inserted(bs);
5202 * Return whether the media changed since the last call to this
5203 * function, or -ENOTSUP if we don't know. Most drivers don't know.
5205 int bdrv_media_changed(BlockDriverState *bs)
5207 BlockDriver *drv = bs->drv;
5209 if (drv && drv->bdrv_media_changed) {
5210 return drv->bdrv_media_changed(bs);
5212 return -ENOTSUP;
5216 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5218 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5220 BlockDriver *drv = bs->drv;
5221 const char *device_name;
5223 if (drv && drv->bdrv_eject) {
5224 drv->bdrv_eject(bs, eject_flag);
5227 device_name = bdrv_get_device_name(bs);
5228 if (device_name[0] != '\0') {
5229 qapi_event_send_device_tray_moved(device_name,
5230 eject_flag, &error_abort);
5235 * Lock or unlock the media (if it is locked, the user won't be able
5236 * to eject it manually).
5238 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5240 BlockDriver *drv = bs->drv;
5242 trace_bdrv_lock_medium(bs, locked);
5244 if (drv && drv->bdrv_lock_medium) {
5245 drv->bdrv_lock_medium(bs, locked);
5249 /* needed for generic scsi interface */
5251 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5253 BlockDriver *drv = bs->drv;
5255 if (drv && drv->bdrv_ioctl)
5256 return drv->bdrv_ioctl(bs, req, buf);
5257 return -ENOTSUP;
5260 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5261 unsigned long int req, void *buf,
5262 BlockCompletionFunc *cb, void *opaque)
5264 BlockDriver *drv = bs->drv;
5266 if (drv && drv->bdrv_aio_ioctl)
5267 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5268 return NULL;
5271 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5273 bs->guest_block_size = align;
5276 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5278 return qemu_memalign(bdrv_opt_mem_align(bs), size);
5281 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5283 return memset(qemu_blockalign(bs, size), 0, size);
5286 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5288 size_t align = bdrv_opt_mem_align(bs);
5290 /* Ensure that NULL is never returned on success */
5291 assert(align > 0);
5292 if (size == 0) {
5293 size = align;
5296 return qemu_try_memalign(align, size);
5299 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5301 void *mem = qemu_try_blockalign(bs, size);
5303 if (mem) {
5304 memset(mem, 0, size);
5307 return mem;
5311 * Check if all memory in this vector is aligned to bdrv_opt_mem_align(bs).
5313 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5315 int i;
5316 size_t alignment = bdrv_opt_mem_align(bs);
5318 for (i = 0; i < qiov->niov; i++) {
5319 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5320 return false;
5322 if (qiov->iov[i].iov_len % alignment) {
5323 return false;
5327 return true;
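/* Sketch (illustrative): building a QEMUIOVector that passes
 * bdrv_qiov_is_aligned() by construction. The 4096-byte length is an
 * example value assumed to be a multiple of bdrv_opt_mem_align(bs). */
#if 0
static void example_build_aligned_qiov(BlockDriverState *bs,
                                       QEMUIOVector *qiov, struct iovec *iov)
{
    iov->iov_base = qemu_blockalign(bs, 4096);  /* aligned allocation */
    iov->iov_len  = 4096;
    qemu_iovec_init_external(qiov, iov, 1);
    assert(bdrv_qiov_is_aligned(bs, qiov));
}
#endif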
5330 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5331 Error **errp)
5333 int64_t bitmap_size;
5334 BdrvDirtyBitmap *bitmap;
5336 assert((granularity & (granularity - 1)) == 0);
5338 granularity >>= BDRV_SECTOR_BITS;
5339 assert(granularity);
5340 bitmap_size = bdrv_nb_sectors(bs);
5341 if (bitmap_size < 0) {
5342 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5343 errno = -bitmap_size;
5344 return NULL;
5346 bitmap = g_new0(BdrvDirtyBitmap, 1);
5347 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5348 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5349 return bitmap;
5352 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5354 BdrvDirtyBitmap *bm, *next;
5355 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5356 if (bm == bitmap) {
5357 QLIST_REMOVE(bitmap, list);
5358 hbitmap_free(bitmap->bitmap);
5359 g_free(bitmap);
5360 return;
5365 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5367 BdrvDirtyBitmap *bm;
5368 BlockDirtyInfoList *list = NULL;
5369 BlockDirtyInfoList **plist = &list;
5371 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5372 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5373 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5374 info->count = bdrv_get_dirty_count(bs, bm);
5375 info->granularity =
5376 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5377 entry->value = info;
5378 *plist = entry;
5379 plist = &entry->next;
5382 return list;
5385 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5387 if (bitmap) {
5388 return hbitmap_get(bitmap->bitmap, sector);
5389 } else {
5390 return 0;
5394 void bdrv_dirty_iter_init(BlockDriverState *bs,
5395 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5397 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5400 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5401 int nr_sectors)
5403 BdrvDirtyBitmap *bitmap;
5404 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5405 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5409 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5411 BdrvDirtyBitmap *bitmap;
5412 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5413 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5417 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5419 return hbitmap_count(bitmap->bitmap);
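/* Usage sketch (illustrative): create a dirty bitmap with a 64 KiB
 * granularity and walk the sectors that have been dirtied since. Error
 * handling and the consumer of `sector` are left out. */
#if 0
static void example_walk_dirty(BlockDriverState *bs, Error **errp)
{
    BdrvDirtyBitmap *bitmap = bdrv_create_dirty_bitmap(bs, 65536, errp);
    HBitmapIter hbi;
    int64_t sector;

    if (!bitmap) {
        return;
    }
    bdrv_dirty_iter_init(bs, bitmap, &hbi);
    while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
        /* process the dirty chunk starting at `sector` */
    }
    bdrv_release_dirty_bitmap(bs, bitmap);
}
#endif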
5422 /* Get a reference to bs */
5423 void bdrv_ref(BlockDriverState *bs)
5425 bs->refcnt++;
5428 /* Release a previously grabbed reference to bs.
5429 * If, after releasing, the reference count is zero, the BlockDriverState
5430 * is deleted. */
5431 void bdrv_unref(BlockDriverState *bs)
5433 if (!bs) {
5434 return;
5436 assert(bs->refcnt > 0);
5437 if (--bs->refcnt == 0) {
5438 bdrv_delete(bs);
5442 struct BdrvOpBlocker {
5443 Error *reason;
5444 QLIST_ENTRY(BdrvOpBlocker) list;
5447 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5449 BdrvOpBlocker *blocker;
5450 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5451 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5452 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5453 if (errp) {
5454 error_setg(errp, "Device '%s' is busy: %s",
5455 bdrv_get_device_name(bs),
5456 error_get_pretty(blocker->reason));
5458 return true;
5460 return false;
5463 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5465 BdrvOpBlocker *blocker;
5466 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5468 blocker = g_new0(BdrvOpBlocker, 1);
5469 blocker->reason = reason;
5470 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5473 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5475 BdrvOpBlocker *blocker, *next;
5476 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5477 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5478 if (blocker->reason == reason) {
5479 QLIST_REMOVE(blocker, list);
5480 g_free(blocker);
5485 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5487 int i;
5488 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5489 bdrv_op_block(bs, i, reason);
5493 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5495 int i;
5496 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5497 bdrv_op_unblock(bs, i, reason);
5501 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5503 int i;
5505 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5506 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5507 return false;
5510 return true;
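/* Usage sketch (illustrative): a job blocks an operation with a
 * human-readable reason, checks it, and unblocks it again. The reason
 * Error stays owned by whoever created the blocker. */
#if 0
static void example_block_resize(BlockDriverState *bs)
{
    Error *reason = NULL;

    error_setg(&reason, "node is in use by an example job");
    bdrv_op_block(bs, BLOCK_OP_TYPE_RESIZE, reason);
    assert(bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_RESIZE, NULL));
    bdrv_op_unblock(bs, BLOCK_OP_TYPE_RESIZE, reason);
    error_free(reason);
}
#endif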
5513 void bdrv_iostatus_enable(BlockDriverState *bs)
5515 bs->iostatus_enabled = true;
5516 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5519 /* The I/O status is only enabled if the drive explicitly
5520 * enables it _and_ the VM is configured to stop on errors */
5521 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5523 return (bs->iostatus_enabled &&
5524 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5525 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5526 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5529 void bdrv_iostatus_disable(BlockDriverState *bs)
5531 bs->iostatus_enabled = false;
5534 void bdrv_iostatus_reset(BlockDriverState *bs)
5536 if (bdrv_iostatus_is_enabled(bs)) {
5537 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5538 if (bs->job) {
5539 block_job_iostatus_reset(bs->job);
5544 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5546 assert(bdrv_iostatus_is_enabled(bs));
5547 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5548 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5549 BLOCK_DEVICE_IO_STATUS_FAILED;
5553 void bdrv_img_create(const char *filename, const char *fmt,
5554 const char *base_filename, const char *base_fmt,
5555 char *options, uint64_t img_size, int flags,
5556 Error **errp, bool quiet)
5558 QemuOptsList *create_opts = NULL;
5559 QemuOpts *opts = NULL;
5560 const char *backing_fmt, *backing_file;
5561 int64_t size;
5562 BlockDriver *drv, *proto_drv;
5563 BlockDriver *backing_drv = NULL;
5564 Error *local_err = NULL;
5565 int ret = 0;
5567 /* Find driver and parse its options */
5568 drv = bdrv_find_format(fmt);
5569 if (!drv) {
5570 error_setg(errp, "Unknown file format '%s'", fmt);
5571 return;
5574 proto_drv = bdrv_find_protocol(filename, true);
5575 if (!proto_drv) {
5576 error_setg(errp, "Unknown protocol '%s'", filename);
5577 return;
5580 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5581 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5583 /* Create parameter list with default values */
5584 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5585 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5587 /* Parse -o options */
5588 if (options) {
5589 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5590 error_setg(errp, "Invalid options for file format '%s'", fmt);
5591 goto out;
5595 if (base_filename) {
5596 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5597 error_setg(errp, "Backing file not supported for file format '%s'",
5598 fmt);
5599 goto out;
5603 if (base_fmt) {
5604 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5605 error_setg(errp, "Backing file format not supported for file "
5606 "format '%s'", fmt);
5607 goto out;
5611 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5612 if (backing_file) {
5613 if (!strcmp(filename, backing_file)) {
5614 error_setg(errp, "Trying to create an image with the "
5615 "same filename as the backing file");
5616 goto out;
5620 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5621 if (backing_fmt) {
5622 backing_drv = bdrv_find_format(backing_fmt);
5623 if (!backing_drv) {
5624 error_setg(errp, "Unknown backing file format '%s'",
5625 backing_fmt);
5626 goto out;
5630 // The size for the image must always be specified, with one exception:
5631 // If we are using a backing file, we can obtain the size from there
5632 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5633 if (size == -1) {
5634 if (backing_file) {
5635 BlockDriverState *bs;
5636 int64_t size;
5637 int back_flags;
5639 /* backing files are always opened read-only */
5640 back_flags =
5641 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5643 bs = NULL;
5644 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5645 backing_drv, &local_err);
5646 if (ret < 0) {
5647 goto out;
5649 size = bdrv_getlength(bs);
5650 if (size < 0) {
5651 error_setg_errno(errp, -size, "Could not get size of '%s'",
5652 backing_file);
5653 bdrv_unref(bs);
5654 goto out;
5657 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
5659 bdrv_unref(bs);
5660 } else {
5661 error_setg(errp, "Image creation needs a size parameter");
5662 goto out;
5666 if (!quiet) {
5667 printf("Formatting '%s', fmt=%s ", filename, fmt);
5668 qemu_opts_print(opts);
5669 puts("");
5672 ret = bdrv_create(drv, filename, opts, &local_err);
5674 if (ret == -EFBIG) {
5675 /* This is generally a better message than whatever the driver would
5676 * deliver (especially because of the cluster_size_hint), since that
5677 * is most probably not much different from "image too large". */
5678 const char *cluster_size_hint = "";
5679 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5680 cluster_size_hint = " (try using a larger cluster size)";
5682 error_setg(errp, "The image size is too large for file format '%s'"
5683 "%s", fmt, cluster_size_hint);
5684 error_free(local_err);
5685 local_err = NULL;
5688 out:
5689 qemu_opts_del(opts);
5690 qemu_opts_free(create_opts);
5691 if (local_err) {
5692 error_propagate(errp, local_err);
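/* Usage sketch (illustrative): creating a 1 GiB qcow2 image much like
 * qemu-img create would. The path is an example value. */
#if 0
static void example_create_image(Error **errp)
{
    bdrv_img_create("/tmp/example.qcow2", "qcow2",
                    NULL, NULL,      /* no backing file, no backing format */
                    NULL,            /* no extra -o options */
                    1024ULL * 1024 * 1024, 0, errp, true);
}
#endif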
5696 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5698 return bs->aio_context;
5701 void bdrv_detach_aio_context(BlockDriverState *bs)
5703 BdrvAioNotifier *baf;
5705 if (!bs->drv) {
5706 return;
5709 QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5710 baf->detach_aio_context(baf->opaque);
5713 if (bs->io_limits_enabled) {
5714 throttle_detach_aio_context(&bs->throttle_state);
5716 if (bs->drv->bdrv_detach_aio_context) {
5717 bs->drv->bdrv_detach_aio_context(bs);
5719 if (bs->file) {
5720 bdrv_detach_aio_context(bs->file);
5722 if (bs->backing_hd) {
5723 bdrv_detach_aio_context(bs->backing_hd);
5726 bs->aio_context = NULL;
5729 void bdrv_attach_aio_context(BlockDriverState *bs,
5730 AioContext *new_context)
5732 BdrvAioNotifier *ban;
5734 if (!bs->drv) {
5735 return;
5738 bs->aio_context = new_context;
5740 if (bs->backing_hd) {
5741 bdrv_attach_aio_context(bs->backing_hd, new_context);
5743 if (bs->file) {
5744 bdrv_attach_aio_context(bs->file, new_context);
5746 if (bs->drv->bdrv_attach_aio_context) {
5747 bs->drv->bdrv_attach_aio_context(bs, new_context);
5749 if (bs->io_limits_enabled) {
5750 throttle_attach_aio_context(&bs->throttle_state, new_context);
5753 QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5754 ban->attached_aio_context(new_context, ban->opaque);
5758 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5760 bdrv_drain_all(); /* ensure there are no in-flight requests */
5762 bdrv_detach_aio_context(bs);
5764 /* This function executes in the old AioContext so acquire the new one in
5765 * case it runs in a different thread.
5767 aio_context_acquire(new_context);
5768 bdrv_attach_aio_context(bs, new_context);
5769 aio_context_release(new_context);
5772 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5773 void (*attached_aio_context)(AioContext *new_context, void *opaque),
5774 void (*detach_aio_context)(void *opaque), void *opaque)
5776 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5777 *ban = (BdrvAioNotifier){
5778 .attached_aio_context = attached_aio_context,
5779 .detach_aio_context = detach_aio_context,
5780 .opaque = opaque
5783 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5786 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5787 void (*attached_aio_context)(AioContext *,
5788 void *),
5789 void (*detach_aio_context)(void *),
5790 void *opaque)
5792 BdrvAioNotifier *ban, *ban_next;
5794 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5795 if (ban->attached_aio_context == attached_aio_context &&
5796 ban->detach_aio_context == detach_aio_context &&
5797 ban->opaque == opaque)
5799 QLIST_REMOVE(ban, list);
5800 g_free(ban);
5802 return;
5806 abort();
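/* Usage sketch (illustrative): a device that owns timers or fd handlers
 * tied to the BDS's AioContext registers a notifier pair so it can follow
 * context switches. The callbacks are hypothetical stand-ins. */
#if 0
static void example_attached(AioContext *new_context, void *opaque)
{
    /* re-register timers/fd handlers in new_context */
}

static void example_detach(void *opaque)
{
    /* unregister from the old context */
}

static void example_follow_context(BlockDriverState *bs, void *dev)
{
    bdrv_add_aio_context_notifier(bs, example_attached, example_detach, dev);
}
#endif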
5809 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5810 NotifierWithReturn *notifier)
5812 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5815 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
5816 BlockDriverAmendStatusCB *status_cb)
5818 if (!bs->drv->bdrv_amend_options) {
5819 return -ENOTSUP;
5821 return bs->drv->bdrv_amend_options(bs, opts, status_cb);
5824 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5825 * of block filters and by bdrv_is_first_non_filter.
5826 * It is used to test whether the given bs is the candidate, or to recurse
5827 * further into the node graph.
5829 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5830 BlockDriverState *candidate)
5832 /* return false if the basic checks fail */
5833 if (!bs || !bs->drv) {
5834 return false;
5837 /* the code reached a driver that is not a block filter -> check whether bs
5838 * is the same as the candidate. This is the recursion termination condition.
5840 if (!bs->drv->is_filter) {
5841 return bs == candidate;
5843 /* Down this path the driver is a block filter driver */
5845 /* If the block filter recursion method is defined, use it to recurse down
5846 * the node graph.
5848 if (bs->drv->bdrv_recurse_is_first_non_filter) {
5849 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5852 /* the driver is a block filter but does not allow recursion -> return false
5854 return false;
5857 /* This function checks whether the candidate is the first non-filter bs down
5858 * its bs chain. Since we don't have pointers to parents, it explores all bs
5859 * chains from the top. Some filters can choose not to pass down the recursion.
5861 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5863 BlockDriverState *bs;
5865 /* walk down the bs forest recursively */
5866 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5867 bool perm;
5869 /* try to recurse in this top level bs */
5870 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5872 /* candidate is the first non filter */
5873 if (perm) {
5874 return true;
5878 return false;
5881 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5883 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5884 AioContext *aio_context;
5886 if (!to_replace_bs) {
5887 error_setg(errp, "Node name '%s' not found", node_name);
5888 return NULL;
5891 aio_context = bdrv_get_aio_context(to_replace_bs);
5892 aio_context_acquire(aio_context);
5894 if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5895 to_replace_bs = NULL;
5896 goto out;
5899 /* We don't want an arbitrary node of the BDS chain to be replaced, only the
5900 * topmost non-filter, in order to prevent data corruption.
5901 * Another benefit is that this test excludes backing files, which are
5902 * blocked by the backing blockers.
5904 if (!bdrv_is_first_non_filter(to_replace_bs)) {
5905 error_setg(errp, "Only top most non filter can be replaced");
5906 to_replace_bs = NULL;
5907 goto out;
5910 out:
5911 aio_context_release(aio_context);
5912 return to_replace_bs;
5915 void bdrv_io_plug(BlockDriverState *bs)
5917 BlockDriver *drv = bs->drv;
5918 if (drv && drv->bdrv_io_plug) {
5919 drv->bdrv_io_plug(bs);
5920 } else if (bs->file) {
5921 bdrv_io_plug(bs->file);
5925 void bdrv_io_unplug(BlockDriverState *bs)
5927 BlockDriver *drv = bs->drv;
5928 if (drv && drv->bdrv_io_unplug) {
5929 drv->bdrv_io_unplug(bs);
5930 } else if (bs->file) {
5931 bdrv_io_unplug(bs->file);
5935 void bdrv_flush_io_queue(BlockDriverState *bs)
5937 BlockDriver *drv = bs->drv;
5938 if (drv && drv->bdrv_flush_io_queue) {
5939 drv->bdrv_flush_io_queue(bs);
5940 } else if (bs->file) {
5941 bdrv_flush_io_queue(bs->file);
5945 static bool append_open_options(QDict *d, BlockDriverState *bs)
5947 const QDictEntry *entry;
5948 bool found_any = false;
5950 for (entry = qdict_first(bs->options); entry;
5951 entry = qdict_next(bs->options, entry))
5953 /* Only take options for this level and exclude all non-driver-specific
5954 * options */
5955 if (!strchr(qdict_entry_key(entry), '.') &&
5956 strcmp(qdict_entry_key(entry), "node-name"))
5958 qobject_incref(qdict_entry_value(entry));
5959 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5960 found_any = true;
5964 return found_any;
5967 /* Updates the following BDS fields:
5968 * - exact_filename: A filename which may be used for opening a block device
5969 * which (mostly) equals the given BDS (even without any
5970 * other options; so reading and writing must return the same
5971 * results, but caching etc. may be different)
5972 * - full_open_options: Options which, when given when opening a block device
5973 * (without a filename), result in a BDS (mostly)
5974 * equalling the given one
5975 * - filename: If exact_filename is set, it is copied here. Otherwise,
5976 * full_open_options is converted to a JSON object, prefixed with
5977 * "json:" (for use through the JSON pseudo protocol) and put here.
5979 void bdrv_refresh_filename(BlockDriverState *bs)
5981 BlockDriver *drv = bs->drv;
5982 QDict *opts;
5984 if (!drv) {
5985 return;
5988 /* This BDS's file name will most probably depend on its file's name, so
5989 * refresh that first */
5990 if (bs->file) {
5991 bdrv_refresh_filename(bs->file);
5994 if (drv->bdrv_refresh_filename) {
5995 /* Obsolete information is of no use here, so drop the old file name
5996 * information before refreshing it */
5997 bs->exact_filename[0] = '\0';
5998 if (bs->full_open_options) {
5999 QDECREF(bs->full_open_options);
6000 bs->full_open_options = NULL;
6003 drv->bdrv_refresh_filename(bs);
6004 } else if (bs->file) {
6005 /* Try to reconstruct valid information from the underlying file */
6006 bool has_open_options;
6008 bs->exact_filename[0] = '\0';
6009 if (bs->full_open_options) {
6010 QDECREF(bs->full_open_options);
6011 bs->full_open_options = NULL;
6014 opts = qdict_new();
6015 has_open_options = append_open_options(opts, bs);
6017 /* If no specific options have been given for this BDS, the filename of
6018 * the underlying file should suffice for this one as well */
6019 if (bs->file->exact_filename[0] && !has_open_options) {
6020 strcpy(bs->exact_filename, bs->file->exact_filename);
6022 /* Reconstructing the full options QDict is simple for most format block
6023 * drivers, as long as the full options are known for the underlying
6024 * file BDS. The full options QDict of that file BDS should somehow
6025 * contain a representation of the filename, therefore the following
6026 * suffices without querying the (exact_)filename of this BDS. */
6027 if (bs->file->full_open_options) {
6028 qdict_put_obj(opts, "driver",
6029 QOBJECT(qstring_from_str(drv->format_name)));
6030 QINCREF(bs->file->full_open_options);
6031 qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6033 bs->full_open_options = opts;
6034 } else {
6035 QDECREF(opts);
6037 } else if (!bs->full_open_options && qdict_size(bs->options)) {
6038 /* There is no underlying file BDS (at least referenced by BDS.file),
6039 * so the full options QDict should be equal to the options given
6040 * specifically for this block device when it was opened (plus the
6041 * driver specification).
6042 * Because those options don't change, there is no need to update
6043 * full_open_options when it's already set. */
6045 opts = qdict_new();
6046 append_open_options(opts, bs);
6047 qdict_put_obj(opts, "driver",
6048 QOBJECT(qstring_from_str(drv->format_name)));
6050 if (bs->exact_filename[0]) {
6051 /* This may not work for all block protocol drivers (some may
6052 * require this filename to be parsed), but we have to find some
6053 * default solution here, so just include it. If some block driver
6054 * does not support pure options without any filename at all or
6055 * needs some special format of the options QDict, it needs to
6056 * implement the driver-specific bdrv_refresh_filename() function.
6058 qdict_put_obj(opts, "filename",
6059 QOBJECT(qstring_from_str(bs->exact_filename)));
6062 bs->full_open_options = opts;
6065 if (bs->exact_filename[0]) {
6066 pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6067 } else if (bs->full_open_options) {
6068 QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6069 snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6070 qstring_get_str(json));
6071 QDECREF(json);
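/* Example (illustrative): for a qcow2 node on top of a raw file with no
 * usable exact_filename, the code above would synthesize something like
 *   json:{"driver": "qcow2", "file": {"driver": "file",
 *                                     "filename": "/tmp/test.qcow2"}}
 * so that the node can be reopened from its options alone. */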
6075 /* This accessor function's purpose is to allow device models to access the
6076 * BlockAcctStats structure embedded inside a BlockDriverState without being
6077 * aware of the BlockDriverState structure layout.
6078 * It will go away once the BlockAcctStats structure is moved inside
6079 * the device models.
6081 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6083 return &bs->stats;