exec: add wrapper for host pointer access
[qemu/qmp-unstable.git] / block.c
blob4165d4265cd3ac6ac009f6af101d82d5de4440db
1 /*
2 * QEMU System Emulator block driver
4 * Copyright (c) 2003 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "block/block_int.h"
28 #include "block/blockjob.h"
29 #include "qemu/module.h"
30 #include "qapi/qmp/qjson.h"
31 #include "sysemu/block-backend.h"
32 #include "sysemu/sysemu.h"
33 #include "qemu/notify.h"
34 #include "block/coroutine.h"
35 #include "block/qapi.h"
36 #include "qmp-commands.h"
37 #include "qemu/timer.h"
38 #include "qapi-event.h"
40 #ifdef CONFIG_BSD
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <sys/ioctl.h>
44 #include <sys/queue.h>
45 #ifndef __DragonFly__
46 #include <sys/disk.h>
47 #endif
48 #endif
50 #ifdef _WIN32
51 #include <windows.h>
52 #endif
54 struct BdrvDirtyBitmap {
55 HBitmap *bitmap;
56 QLIST_ENTRY(BdrvDirtyBitmap) list;
59 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
61 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
63 BlockCompletionFunc *cb, void *opaque);
64 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
66 BlockCompletionFunc *cb, void *opaque);
67 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68 int64_t sector_num, int nb_sectors,
69 QEMUIOVector *iov);
70 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
73 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
75 BdrvRequestFlags flags);
76 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
78 BdrvRequestFlags flags);
79 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80 int64_t sector_num,
81 QEMUIOVector *qiov,
82 int nb_sectors,
83 BdrvRequestFlags flags,
84 BlockCompletionFunc *cb,
85 void *opaque,
86 bool is_write);
87 static void coroutine_fn bdrv_co_do_rw(void *opaque);
88 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
89 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
91 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
94 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
97 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98 QLIST_HEAD_INITIALIZER(bdrv_drivers);
100 /* If non-zero, use only whitelisted block drivers */
101 static int use_bdrv_whitelist;
103 #ifdef _WIN32
/*
 * Return 1 if @filename starts with an ASCII letter immediately followed
 * by ':' (for example "c:" or "C:\dir"), 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_lower = (letter >= 'a' && letter <= 'z');
    const int is_upper = (letter >= 'A' && letter <= 'Z');

    /* filename[1] is only inspected once filename[0] is known to be a letter */
    return (is_lower || is_upper) && filename[1] == ':';
}
111 int is_windows_drive(const char *filename)
113 if (is_windows_drive_prefix(filename) &&
114 filename[2] == '\0')
115 return 1;
116 if (strstart(filename, "\\\\.\\", NULL) ||
117 strstart(filename, "//./", NULL))
118 return 1;
119 return 0;
121 #endif
123 /* throttling disk I/O limits */
124 void bdrv_set_io_limits(BlockDriverState *bs,
125 ThrottleConfig *cfg)
127 int i;
129 throttle_config(&bs->throttle_state, cfg);
131 for (i = 0; i < 2; i++) {
132 qemu_co_enter_next(&bs->throttled_reqs[i]);
136 /* this function drain all the throttled IOs */
137 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
139 bool drained = false;
140 bool enabled = bs->io_limits_enabled;
141 int i;
143 bs->io_limits_enabled = false;
145 for (i = 0; i < 2; i++) {
146 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147 drained = true;
151 bs->io_limits_enabled = enabled;
153 return drained;
156 void bdrv_io_limits_disable(BlockDriverState *bs)
158 bs->io_limits_enabled = false;
160 bdrv_start_throttled_reqs(bs);
162 throttle_destroy(&bs->throttle_state);
165 static void bdrv_throttle_read_timer_cb(void *opaque)
167 BlockDriverState *bs = opaque;
168 qemu_co_enter_next(&bs->throttled_reqs[0]);
171 static void bdrv_throttle_write_timer_cb(void *opaque)
173 BlockDriverState *bs = opaque;
174 qemu_co_enter_next(&bs->throttled_reqs[1]);
177 /* should be called before bdrv_set_io_limits if a limit is set */
178 void bdrv_io_limits_enable(BlockDriverState *bs)
180 assert(!bs->io_limits_enabled);
181 throttle_init(&bs->throttle_state,
182 bdrv_get_aio_context(bs),
183 QEMU_CLOCK_VIRTUAL,
184 bdrv_throttle_read_timer_cb,
185 bdrv_throttle_write_timer_cb,
186 bs);
187 bs->io_limits_enabled = true;
190 /* This function makes an IO wait if needed
192 * @nb_sectors: the number of sectors of the IO
193 * @is_write: is the IO a write
195 static void bdrv_io_limits_intercept(BlockDriverState *bs,
196 unsigned int bytes,
197 bool is_write)
199 /* does this io must wait */
200 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
202 /* if must wait or any request of this type throttled queue the IO */
203 if (must_wait ||
204 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
205 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
208 /* the IO will be executed, do the accounting */
209 throttle_account(&bs->throttle_state, is_write, bytes);
212 /* if the next request must wait -> do nothing */
213 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
214 return;
217 /* else queue next request for execution */
218 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
221 size_t bdrv_opt_mem_align(BlockDriverState *bs)
223 if (!bs || !bs->drv) {
224 /* 4k should be on the safe side */
225 return 4096;
228 return bs->bl.opt_mem_alignment;
/*
 * Check whether @path starts with "<protocol>:", i.e. whether a ':' is
 * found before any path separator.  On Windows, drive specifications are
 * never treated as protocols.
 */
int path_has_protocol(const char *path)
{
#ifdef _WIN32
    static const char stop_chars[] = ":/\\";

    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    static const char stop_chars[] = ":/";
#endif

    /* first ':' or separator; the terminating NUL if there is none */
    return path[strcspn(path, stop_chars)] == ':';
}
/*
 * Return non-zero if @path is absolute: it starts with '/', or on
 * Windows also with '\' or a drive specification.
 */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (path[0] == '/' || path[0] == '\\');
#else
    return (path[0] == '/');
#endif
}
/*
 * Build in @dest (at most @dest_size bytes, always NUL-terminated when
 * @dest_size > 0) the path designated by @filename.
 *
 * An absolute @filename is copied as is.  Otherwise @filename is appended
 * to the directory part of @base_path, i.e. everything up to and including
 * the later of the first ':' and the last path separator.  URLs are
 * supported through the ':' handling.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep, *prefix_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* position just past the first ':' (protocol prefix), if any */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* position just past the last path separator, if any */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');
        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* the directory prefix ends at whichever of the two comes later */
    prefix_end = (after_sep > after_colon) ? after_sep : after_colon;

    prefix_len = prefix_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
306 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
308 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
309 pstrcpy(dest, sz, bs->backing_file);
310 } else {
311 path_combine(dest, sz, bs->filename, bs->backing_file);
315 void bdrv_register(BlockDriver *bdrv)
317 /* Block drivers without coroutine functions need emulation */
318 if (!bdrv->bdrv_co_readv) {
319 bdrv->bdrv_co_readv = bdrv_co_readv_em;
320 bdrv->bdrv_co_writev = bdrv_co_writev_em;
322 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
323 * the block driver lacks aio we need to emulate that too.
325 if (!bdrv->bdrv_aio_readv) {
326 /* add AIO emulation layer */
327 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
328 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
332 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
335 BlockDriverState *bdrv_new_root(void)
337 BlockDriverState *bs = bdrv_new();
339 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
340 return bs;
343 BlockDriverState *bdrv_new(void)
345 BlockDriverState *bs;
346 int i;
348 bs = g_new0(BlockDriverState, 1);
349 QLIST_INIT(&bs->dirty_bitmaps);
350 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
351 QLIST_INIT(&bs->op_blockers[i]);
353 bdrv_iostatus_disable(bs);
354 notifier_list_init(&bs->close_notifiers);
355 notifier_with_return_list_init(&bs->before_write_notifiers);
356 qemu_co_queue_init(&bs->throttled_reqs[0]);
357 qemu_co_queue_init(&bs->throttled_reqs[1]);
358 bs->refcnt = 1;
359 bs->aio_context = qemu_get_aio_context();
361 return bs;
364 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
366 notifier_list_add(&bs->close_notifiers, notify);
369 BlockDriver *bdrv_find_format(const char *format_name)
371 BlockDriver *drv1;
372 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
373 if (!strcmp(drv1->format_name, format_name)) {
374 return drv1;
377 return NULL;
380 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
382 static const char *whitelist_rw[] = {
383 CONFIG_BDRV_RW_WHITELIST
385 static const char *whitelist_ro[] = {
386 CONFIG_BDRV_RO_WHITELIST
388 const char **p;
390 if (!whitelist_rw[0] && !whitelist_ro[0]) {
391 return 1; /* no whitelist, anything goes */
394 for (p = whitelist_rw; *p; p++) {
395 if (!strcmp(drv->format_name, *p)) {
396 return 1;
399 if (read_only) {
400 for (p = whitelist_ro; *p; p++) {
401 if (!strcmp(drv->format_name, *p)) {
402 return 1;
406 return 0;
409 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
410 bool read_only)
412 BlockDriver *drv = bdrv_find_format(format_name);
413 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
416 typedef struct CreateCo {
417 BlockDriver *drv;
418 char *filename;
419 QemuOpts *opts;
420 int ret;
421 Error *err;
422 } CreateCo;
424 static void coroutine_fn bdrv_create_co_entry(void *opaque)
426 Error *local_err = NULL;
427 int ret;
429 CreateCo *cco = opaque;
430 assert(cco->drv);
432 ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
433 if (local_err) {
434 error_propagate(&cco->err, local_err);
436 cco->ret = ret;
439 int bdrv_create(BlockDriver *drv, const char* filename,
440 QemuOpts *opts, Error **errp)
442 int ret;
444 Coroutine *co;
445 CreateCo cco = {
446 .drv = drv,
447 .filename = g_strdup(filename),
448 .opts = opts,
449 .ret = NOT_DONE,
450 .err = NULL,
453 if (!drv->bdrv_create) {
454 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
455 ret = -ENOTSUP;
456 goto out;
459 if (qemu_in_coroutine()) {
460 /* Fast-path if already in coroutine context */
461 bdrv_create_co_entry(&cco);
462 } else {
463 co = qemu_coroutine_create(bdrv_create_co_entry);
464 qemu_coroutine_enter(co, &cco);
465 while (cco.ret == NOT_DONE) {
466 aio_poll(qemu_get_aio_context(), true);
470 ret = cco.ret;
471 if (ret < 0) {
472 if (cco.err) {
473 error_propagate(errp, cco.err);
474 } else {
475 error_setg_errno(errp, -ret, "Could not create image");
479 out:
480 g_free(cco.filename);
481 return ret;
484 int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
486 BlockDriver *drv;
487 Error *local_err = NULL;
488 int ret;
490 drv = bdrv_find_protocol(filename, true);
491 if (drv == NULL) {
492 error_setg(errp, "Could not find protocol for file '%s'", filename);
493 return -ENOENT;
496 ret = bdrv_create(drv, filename, opts, &local_err);
497 if (local_err) {
498 error_propagate(errp, local_err);
500 return ret;
503 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
505 BlockDriver *drv = bs->drv;
506 Error *local_err = NULL;
508 memset(&bs->bl, 0, sizeof(bs->bl));
510 if (!drv) {
511 return;
514 /* Take some limits from the children as a default */
515 if (bs->file) {
516 bdrv_refresh_limits(bs->file, &local_err);
517 if (local_err) {
518 error_propagate(errp, local_err);
519 return;
521 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
522 bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
523 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
524 } else {
525 bs->bl.opt_mem_alignment = 512;
528 if (bs->backing_hd) {
529 bdrv_refresh_limits(bs->backing_hd, &local_err);
530 if (local_err) {
531 error_propagate(errp, local_err);
532 return;
534 bs->bl.opt_transfer_length =
535 MAX(bs->bl.opt_transfer_length,
536 bs->backing_hd->bl.opt_transfer_length);
537 bs->bl.max_transfer_length =
538 MIN_NON_ZERO(bs->bl.max_transfer_length,
539 bs->backing_hd->bl.max_transfer_length);
540 bs->bl.opt_mem_alignment =
541 MAX(bs->bl.opt_mem_alignment,
542 bs->backing_hd->bl.opt_mem_alignment);
545 /* Then let the driver override it */
546 if (drv->bdrv_refresh_limits) {
547 drv->bdrv_refresh_limits(bs, errp);
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename: buffer that receives the name of the file created
 * @size:     size of @filename in bytes
 *
 * On POSIX hosts the file is created in $TMPDIR, or in /var/tmp when the
 * variable is not set.  On Windows @size must be at least MAX_PATH.
 *
 * Return 0 upon success, otherwise a negative errno value (on Windows the
 * negated GetLastError() code).  -EOVERFLOW is returned when the name
 * does not fit into @filename.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    int len;
    const char *tmpdir;

    /* a non-positive size would turn into a huge size_t inside snprintf */
    if (size <= 0) {
        return -EOVERFLOW;
    }

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }

    len = snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    if (len < 0 || len >= size) {
        return -EOVERFLOW;
    }

    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }

    if (close(fd) != 0) {
        /* unlink() may overwrite errno, so remember close()'s error first */
        int close_errno = errno;

        unlink(filename);
        return -close_errno;
    }
    return 0;
#endif
}
588 * Detect host devices. By convention, /dev/cdrom[N] is always
589 * recognized as a host CDROM.
591 static BlockDriver *find_hdev_driver(const char *filename)
593 int score_max = 0, score;
594 BlockDriver *drv = NULL, *d;
596 QLIST_FOREACH(d, &bdrv_drivers, list) {
597 if (d->bdrv_probe_device) {
598 score = d->bdrv_probe_device(filename);
599 if (score > score_max) {
600 score_max = score;
601 drv = d;
606 return drv;
609 BlockDriver *bdrv_find_protocol(const char *filename,
610 bool allow_protocol_prefix)
612 BlockDriver *drv1;
613 char protocol[128];
614 int len;
615 const char *p;
617 /* TODO Drivers without bdrv_file_open must be specified explicitly */
620 * XXX(hch): we really should not let host device detection
621 * override an explicit protocol specification, but moving this
622 * later breaks access to device names with colons in them.
623 * Thanks to the brain-dead persistent naming schemes on udev-
624 * based Linux systems those actually are quite common.
626 drv1 = find_hdev_driver(filename);
627 if (drv1) {
628 return drv1;
631 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
632 return &bdrv_file;
635 p = strchr(filename, ':');
636 assert(p != NULL);
637 len = p - filename;
638 if (len > sizeof(protocol) - 1)
639 len = sizeof(protocol) - 1;
640 memcpy(protocol, filename, len);
641 protocol[len] = '\0';
642 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
643 if (drv1->protocol_name &&
644 !strcmp(drv1->protocol_name, protocol)) {
645 return drv1;
648 return NULL;
652 * Guess image format by probing its contents.
653 * This is not a good idea when your image is raw (CVE-2008-2004), but
654 * we do it anyway for backward compatibility.
656 * @buf contains the image's first @buf_size bytes.
657 * @buf_size is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
658 * but can be smaller if the image file is smaller)
659 * @filename is its filename.
661 * For all block drivers, call the bdrv_probe() method to get its
662 * probing score.
663 * Return the first block driver with the highest probing score.
665 BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
666 const char *filename)
668 int score_max = 0, score;
669 BlockDriver *drv = NULL, *d;
671 QLIST_FOREACH(d, &bdrv_drivers, list) {
672 if (d->bdrv_probe) {
673 score = d->bdrv_probe(buf, buf_size, filename);
674 if (score > score_max) {
675 score_max = score;
676 drv = d;
681 return drv;
684 static int find_image_format(BlockDriverState *bs, const char *filename,
685 BlockDriver **pdrv, Error **errp)
687 BlockDriver *drv;
688 uint8_t buf[BLOCK_PROBE_BUF_SIZE];
689 int ret = 0;
691 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
692 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
693 *pdrv = &bdrv_raw;
694 return ret;
697 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
698 if (ret < 0) {
699 error_setg_errno(errp, -ret, "Could not read image for determining its "
700 "format");
701 *pdrv = NULL;
702 return ret;
705 drv = bdrv_probe_all(buf, ret, filename);
706 if (!drv) {
707 error_setg(errp, "Could not determine image format: No compatible "
708 "driver found");
709 ret = -ENOENT;
711 *pdrv = drv;
712 return ret;
716 * Set the current 'total_sectors' value
717 * Return 0 on success, -errno on error.
719 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
721 BlockDriver *drv = bs->drv;
723 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
724 if (bs->sg)
725 return 0;
727 /* query actual device if possible, otherwise just trust the hint */
728 if (drv->bdrv_getlength) {
729 int64_t length = drv->bdrv_getlength(bs);
730 if (length < 0) {
731 return length;
733 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
736 bs->total_sectors = hint;
737 return 0;
741 * Set open flags for a given discard mode
743 * Return 0 on success, -1 if the discard mode was invalid.
745 int bdrv_parse_discard_flags(const char *mode, int *flags)
747 *flags &= ~BDRV_O_UNMAP;
749 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
750 /* do nothing */
751 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
752 *flags |= BDRV_O_UNMAP;
753 } else {
754 return -1;
757 return 0;
761 * Set open flags for a given cache mode
763 * Return 0 on success, -1 if the cache mode was invalid.
765 int bdrv_parse_cache_flags(const char *mode, int *flags)
767 *flags &= ~BDRV_O_CACHE_MASK;
769 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
770 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
771 } else if (!strcmp(mode, "directsync")) {
772 *flags |= BDRV_O_NOCACHE;
773 } else if (!strcmp(mode, "writeback")) {
774 *flags |= BDRV_O_CACHE_WB;
775 } else if (!strcmp(mode, "unsafe")) {
776 *flags |= BDRV_O_CACHE_WB;
777 *flags |= BDRV_O_NO_FLUSH;
778 } else if (!strcmp(mode, "writethrough")) {
779 /* this is the default */
780 } else {
781 return -1;
784 return 0;
788 * The copy-on-read flag is actually a reference count so multiple users may
789 * use the feature without worrying about clobbering its previous state.
790 * Copy-on-read stays enabled until all users have called to disable it.
792 void bdrv_enable_copy_on_read(BlockDriverState *bs)
794 bs->copy_on_read++;
797 void bdrv_disable_copy_on_read(BlockDriverState *bs)
799 assert(bs->copy_on_read > 0);
800 bs->copy_on_read--;
804 * Returns the flags that a temporary snapshot should get, based on the
805 * originally requested flags (the originally requested image will have flags
806 * like a backing file)
808 static int bdrv_temp_snapshot_flags(int flags)
810 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
814 * Returns the flags that bs->file should get, based on the given flags for
815 * the parent BDS
817 static int bdrv_inherited_flags(int flags)
819 /* Enable protocol handling, disable format probing for bs->file */
820 flags |= BDRV_O_PROTOCOL;
822 /* Our block drivers take care to send flushes and respect unmap policy,
823 * so we can enable both unconditionally on lower layers. */
824 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
826 /* Clear flags that only apply to the top layer */
827 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
829 return flags;
833 * Returns the flags that bs->backing_hd should get, based on the given flags
834 * for the parent BDS
836 static int bdrv_backing_flags(int flags)
838 /* backing files always opened read-only */
839 flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
841 /* snapshot=on is handled on the top layer */
842 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
844 return flags;
847 static int bdrv_open_flags(BlockDriverState *bs, int flags)
849 int open_flags = flags | BDRV_O_CACHE_WB;
852 * Clear flags that are internal to the block layer before opening the
853 * image.
855 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
858 * Snapshots should be writable.
860 if (flags & BDRV_O_TEMPORARY) {
861 open_flags |= BDRV_O_RDWR;
864 return open_flags;
867 static void bdrv_assign_node_name(BlockDriverState *bs,
868 const char *node_name,
869 Error **errp)
871 if (!node_name) {
872 return;
875 /* Check for empty string or invalid characters */
876 if (!id_wellformed(node_name)) {
877 error_setg(errp, "Invalid node name");
878 return;
881 /* takes care of avoiding namespaces collisions */
882 if (blk_by_name(node_name)) {
883 error_setg(errp, "node-name=%s is conflicting with a device id",
884 node_name);
885 return;
888 /* takes care of avoiding duplicates node names */
889 if (bdrv_find_node(node_name)) {
890 error_setg(errp, "Duplicate node name");
891 return;
894 /* copy node name into the bs and insert it into the graph list */
895 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
896 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
900 * Common part for opening disk images and files
902 * Removes all processed options from *options.
904 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
905 QDict *options, int flags, BlockDriver *drv, Error **errp)
907 int ret, open_flags;
908 const char *filename;
909 const char *node_name = NULL;
910 Error *local_err = NULL;
912 assert(drv != NULL);
913 assert(bs->file == NULL);
914 assert(options != NULL && bs->options != options);
916 if (file != NULL) {
917 filename = file->filename;
918 } else {
919 filename = qdict_get_try_str(options, "filename");
922 if (drv->bdrv_needs_filename && !filename) {
923 error_setg(errp, "The '%s' block driver requires a file name",
924 drv->format_name);
925 return -EINVAL;
928 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
930 node_name = qdict_get_try_str(options, "node-name");
931 bdrv_assign_node_name(bs, node_name, &local_err);
932 if (local_err) {
933 error_propagate(errp, local_err);
934 return -EINVAL;
936 qdict_del(options, "node-name");
938 /* bdrv_open() with directly using a protocol as drv. This layer is already
939 * opened, so assign it to bs (while file becomes a closed BlockDriverState)
940 * and return immediately. */
941 if (file != NULL && drv->bdrv_file_open) {
942 bdrv_swap(file, bs);
943 return 0;
946 bs->open_flags = flags;
947 bs->guest_block_size = 512;
948 bs->request_alignment = 512;
949 bs->zero_beyond_eof = true;
950 open_flags = bdrv_open_flags(bs, flags);
951 bs->read_only = !(open_flags & BDRV_O_RDWR);
952 bs->growable = !!(flags & BDRV_O_PROTOCOL);
954 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
955 error_setg(errp,
956 !bs->read_only && bdrv_is_whitelisted(drv, true)
957 ? "Driver '%s' can only be used for read-only devices"
958 : "Driver '%s' is not whitelisted",
959 drv->format_name);
960 return -ENOTSUP;
963 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
964 if (flags & BDRV_O_COPY_ON_READ) {
965 if (!bs->read_only) {
966 bdrv_enable_copy_on_read(bs);
967 } else {
968 error_setg(errp, "Can't use copy-on-read on read-only device");
969 return -EINVAL;
973 if (filename != NULL) {
974 pstrcpy(bs->filename, sizeof(bs->filename), filename);
975 } else {
976 bs->filename[0] = '\0';
978 pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
980 bs->drv = drv;
981 bs->opaque = g_malloc0(drv->instance_size);
983 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
985 /* Open the image, either directly or using a protocol */
986 if (drv->bdrv_file_open) {
987 assert(file == NULL);
988 assert(!drv->bdrv_needs_filename || filename != NULL);
989 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
990 } else {
991 if (file == NULL) {
992 error_setg(errp, "Can't use '%s' as a block driver for the "
993 "protocol level", drv->format_name);
994 ret = -EINVAL;
995 goto free_and_fail;
997 bs->file = file;
998 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
1001 if (ret < 0) {
1002 if (local_err) {
1003 error_propagate(errp, local_err);
1004 } else if (bs->filename[0]) {
1005 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
1006 } else {
1007 error_setg_errno(errp, -ret, "Could not open image");
1009 goto free_and_fail;
1012 ret = refresh_total_sectors(bs, bs->total_sectors);
1013 if (ret < 0) {
1014 error_setg_errno(errp, -ret, "Could not refresh total sector count");
1015 goto free_and_fail;
1018 bdrv_refresh_limits(bs, &local_err);
1019 if (local_err) {
1020 error_propagate(errp, local_err);
1021 ret = -EINVAL;
1022 goto free_and_fail;
1025 assert(bdrv_opt_mem_align(bs) != 0);
1026 assert((bs->request_alignment != 0) || bs->sg);
1027 return 0;
1029 free_and_fail:
1030 bs->file = NULL;
1031 g_free(bs->opaque);
1032 bs->opaque = NULL;
1033 bs->drv = NULL;
1034 return ret;
1037 static QDict *parse_json_filename(const char *filename, Error **errp)
1039 QObject *options_obj;
1040 QDict *options;
1041 int ret;
1043 ret = strstart(filename, "json:", &filename);
1044 assert(ret);
1046 options_obj = qobject_from_json(filename);
1047 if (!options_obj) {
1048 error_setg(errp, "Could not parse the JSON options");
1049 return NULL;
1052 if (qobject_type(options_obj) != QTYPE_QDICT) {
1053 qobject_decref(options_obj);
1054 error_setg(errp, "Invalid JSON object given");
1055 return NULL;
1058 options = qobject_to_qdict(options_obj);
1059 qdict_flatten(options);
1061 return options;
1065 * Fills in default options for opening images and converts the legacy
1066 * filename/flags pair to option QDict entries.
1068 static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
1069 BlockDriver *drv, Error **errp)
1071 const char *filename = *pfilename;
1072 const char *drvname;
1073 bool protocol = flags & BDRV_O_PROTOCOL;
1074 bool parse_filename = false;
1075 Error *local_err = NULL;
1077 /* Parse json: pseudo-protocol */
1078 if (filename && g_str_has_prefix(filename, "json:")) {
1079 QDict *json_options = parse_json_filename(filename, &local_err);
1080 if (local_err) {
1081 error_propagate(errp, local_err);
1082 return -EINVAL;
1085 /* Options given in the filename have lower priority than options
1086 * specified directly */
1087 qdict_join(*options, json_options, false);
1088 QDECREF(json_options);
1089 *pfilename = filename = NULL;
1092 /* Fetch the file name from the options QDict if necessary */
1093 if (protocol && filename) {
1094 if (!qdict_haskey(*options, "filename")) {
1095 qdict_put(*options, "filename", qstring_from_str(filename));
1096 parse_filename = true;
1097 } else {
1098 error_setg(errp, "Can't specify 'file' and 'filename' options at "
1099 "the same time");
1100 return -EINVAL;
1104 /* Find the right block driver */
1105 filename = qdict_get_try_str(*options, "filename");
1106 drvname = qdict_get_try_str(*options, "driver");
1108 if (drv) {
1109 if (drvname) {
1110 error_setg(errp, "Driver specified twice");
1111 return -EINVAL;
1113 drvname = drv->format_name;
1114 qdict_put(*options, "driver", qstring_from_str(drvname));
1115 } else {
1116 if (!drvname && protocol) {
1117 if (filename) {
1118 drv = bdrv_find_protocol(filename, parse_filename);
1119 if (!drv) {
1120 error_setg(errp, "Unknown protocol");
1121 return -EINVAL;
1124 drvname = drv->format_name;
1125 qdict_put(*options, "driver", qstring_from_str(drvname));
1126 } else {
1127 error_setg(errp, "Must specify either driver or file");
1128 return -EINVAL;
1130 } else if (drvname) {
1131 drv = bdrv_find_format(drvname);
1132 if (!drv) {
1133 error_setg(errp, "Unknown driver '%s'", drvname);
1134 return -ENOENT;
1139 assert(drv || !protocol);
1141 /* Driver-specific filename parsing */
1142 if (drv && drv->bdrv_parse_filename && parse_filename) {
1143 drv->bdrv_parse_filename(filename, *options, &local_err);
1144 if (local_err) {
1145 error_propagate(errp, local_err);
1146 return -EINVAL;
1149 if (!drv->bdrv_needs_filename) {
1150 qdict_del(*options, "filename");
1154 return 0;
1157 void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1160 if (bs->backing_hd) {
1161 assert(bs->backing_blocker);
1162 bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1163 } else if (backing_hd) {
1164 error_setg(&bs->backing_blocker,
1165 "device is used as backing hd of '%s'",
1166 bdrv_get_device_name(bs));
1169 bs->backing_hd = backing_hd;
1170 if (!backing_hd) {
1171 error_free(bs->backing_blocker);
1172 bs->backing_blocker = NULL;
1173 goto out;
1175 bs->open_flags &= ~BDRV_O_NO_BACKING;
1176 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1177 pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1178 backing_hd->drv ? backing_hd->drv->format_name : "");
1180 bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1181 /* Otherwise we won't be able to commit due to check in bdrv_commit */
1182 bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
1183 bs->backing_blocker);
1184 out:
1185 bdrv_refresh_limits(bs, NULL);
1189 * Opens the backing file for a BlockDriverState if not yet open
1191 * options is a QDict of options to pass to the block drivers, or NULL for an
1192 * empty set of options. The reference to the QDict is transferred to this
1193 * function (even on failure), so if the caller intends to reuse the dictionary,
1194 * it needs to use QINCREF() before calling bdrv_file_open.
1196 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1198 char *backing_filename = g_malloc0(PATH_MAX);
1199 int ret = 0;
1200 BlockDriverState *backing_hd;
1201 Error *local_err = NULL;
1203 if (bs->backing_hd != NULL) {
1204 QDECREF(options);
1205 goto free_exit;
1208 /* NULL means an empty set of options */
1209 if (options == NULL) {
1210 options = qdict_new();
1213 bs->open_flags &= ~BDRV_O_NO_BACKING;
1214 if (qdict_haskey(options, "file.filename")) {
1215 backing_filename[0] = '\0';
1216 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1217 QDECREF(options);
1218 goto free_exit;
1219 } else {
1220 bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
1223 if (!bs->drv || !bs->drv->supports_backing) {
1224 ret = -EINVAL;
1225 error_setg(errp, "Driver doesn't support backing files");
1226 QDECREF(options);
1227 goto free_exit;
1230 backing_hd = bdrv_new();
1232 if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
1233 qdict_put(options, "driver", qstring_from_str(bs->backing_format));
1236 assert(bs->backing_hd == NULL);
1237 ret = bdrv_open(&backing_hd,
1238 *backing_filename ? backing_filename : NULL, NULL, options,
1239 bdrv_backing_flags(bs->open_flags), NULL, &local_err);
1240 if (ret < 0) {
1241 bdrv_unref(backing_hd);
1242 backing_hd = NULL;
1243 bs->open_flags |= BDRV_O_NO_BACKING;
1244 error_setg(errp, "Could not open backing file: %s",
1245 error_get_pretty(local_err));
1246 error_free(local_err);
1247 goto free_exit;
1249 bdrv_set_backing_hd(bs, backing_hd);
1251 free_exit:
1252 g_free(backing_filename);
1253 return ret;
1257 * Opens a disk image whose options are given as BlockdevRef in another block
1258 * device's options.
1260 * If allow_none is true, no image will be opened if filename is false and no
1261 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1263 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1264 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1265 * itself, all options starting with "${bdref_key}." are considered part of the
1266 * BlockdevRef.
1268 * The BlockdevRef will be removed from the options QDict.
1270 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
1272 int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1273 QDict *options, const char *bdref_key, int flags,
1274 bool allow_none, Error **errp)
1276 QDict *image_options;
1277 int ret;
1278 char *bdref_key_dot;
1279 const char *reference;
1281 assert(pbs);
1282 assert(*pbs == NULL);
1284 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1285 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1286 g_free(bdref_key_dot);
1288 reference = qdict_get_try_str(options, bdref_key);
1289 if (!filename && !reference && !qdict_size(image_options)) {
1290 if (allow_none) {
1291 ret = 0;
1292 } else {
1293 error_setg(errp, "A block device must be specified for \"%s\"",
1294 bdref_key);
1295 ret = -EINVAL;
1297 QDECREF(image_options);
1298 goto done;
1301 ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
1303 done:
1304 qdict_del(options, bdref_key);
1305 return ret;
1308 int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
1310 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1311 char *tmp_filename = g_malloc0(PATH_MAX + 1);
1312 int64_t total_size;
1313 QemuOpts *opts = NULL;
1314 QDict *snapshot_options;
1315 BlockDriverState *bs_snapshot;
1316 Error *local_err;
1317 int ret;
1319 /* if snapshot, we create a temporary backing file and open it
1320 instead of opening 'filename' directly */
1322 /* Get the required size from the image */
1323 total_size = bdrv_getlength(bs);
1324 if (total_size < 0) {
1325 ret = total_size;
1326 error_setg_errno(errp, -total_size, "Could not get image size");
1327 goto out;
1330 /* Create the temporary image */
1331 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
1332 if (ret < 0) {
1333 error_setg_errno(errp, -ret, "Could not get temporary filename");
1334 goto out;
1337 opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
1338 &error_abort);
1339 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
1340 ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
1341 qemu_opts_del(opts);
1342 if (ret < 0) {
1343 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1344 "'%s': %s", tmp_filename,
1345 error_get_pretty(local_err));
1346 error_free(local_err);
1347 goto out;
1350 /* Prepare a new options QDict for the temporary file */
1351 snapshot_options = qdict_new();
1352 qdict_put(snapshot_options, "file.driver",
1353 qstring_from_str("file"));
1354 qdict_put(snapshot_options, "file.filename",
1355 qstring_from_str(tmp_filename));
1357 bs_snapshot = bdrv_new();
1359 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
1360 flags, &bdrv_qcow2, &local_err);
1361 if (ret < 0) {
1362 error_propagate(errp, local_err);
1363 goto out;
1366 bdrv_append(bs_snapshot, bs);
1368 out:
1369 g_free(tmp_filename);
1370 return ret;
1374 * Opens a disk image (raw, qcow2, vmdk, ...)
1376 * options is a QDict of options to pass to the block drivers, or NULL for an
1377 * empty set of options. The reference to the QDict belongs to the block layer
1378 * after the call (even on failure), so if the caller intends to reuse the
1379 * dictionary, it needs to use QINCREF() before calling bdrv_open.
1381 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1382 * If it is not NULL, the referenced BDS will be reused.
1384 * The reference parameter may be used to specify an existing block device which
1385 * should be opened. If specified, neither options nor a filename may be given,
1386 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
1388 int bdrv_open(BlockDriverState **pbs, const char *filename,
1389 const char *reference, QDict *options, int flags,
1390 BlockDriver *drv, Error **errp)
1392 int ret;
1393 BlockDriverState *file = NULL, *bs;
1394 const char *drvname;
1395 Error *local_err = NULL;
1396 int snapshot_flags = 0;
1398 assert(pbs);
1400 if (reference) {
1401 bool options_non_empty = options ? qdict_size(options) : false;
1402 QDECREF(options);
1404 if (*pbs) {
1405 error_setg(errp, "Cannot reuse an existing BDS when referencing "
1406 "another block device");
1407 return -EINVAL;
1410 if (filename || options_non_empty) {
1411 error_setg(errp, "Cannot reference an existing block device with "
1412 "additional options or a new filename");
1413 return -EINVAL;
1416 bs = bdrv_lookup_bs(reference, reference, errp);
1417 if (!bs) {
1418 return -ENODEV;
1420 bdrv_ref(bs);
1421 *pbs = bs;
1422 return 0;
1425 if (*pbs) {
1426 bs = *pbs;
1427 } else {
1428 bs = bdrv_new();
1431 /* NULL means an empty set of options */
1432 if (options == NULL) {
1433 options = qdict_new();
1436 ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
1437 if (local_err) {
1438 goto fail;
1441 /* Find the right image format driver */
1442 drv = NULL;
1443 drvname = qdict_get_try_str(options, "driver");
1444 if (drvname) {
1445 drv = bdrv_find_format(drvname);
1446 qdict_del(options, "driver");
1447 if (!drv) {
1448 error_setg(errp, "Unknown driver: '%s'", drvname);
1449 ret = -EINVAL;
1450 goto fail;
1454 assert(drvname || !(flags & BDRV_O_PROTOCOL));
1455 if (drv && !drv->bdrv_file_open) {
1456 /* If the user explicitly wants a format driver here, we'll need to add
1457 * another layer for the protocol in bs->file */
1458 flags &= ~BDRV_O_PROTOCOL;
1461 bs->options = options;
1462 options = qdict_clone_shallow(options);
1464 /* Open image file without format layer */
1465 if ((flags & BDRV_O_PROTOCOL) == 0) {
1466 if (flags & BDRV_O_RDWR) {
1467 flags |= BDRV_O_ALLOW_RDWR;
1469 if (flags & BDRV_O_SNAPSHOT) {
1470 snapshot_flags = bdrv_temp_snapshot_flags(flags);
1471 flags = bdrv_backing_flags(flags);
1474 assert(file == NULL);
1475 ret = bdrv_open_image(&file, filename, options, "file",
1476 bdrv_inherited_flags(flags),
1477 true, &local_err);
1478 if (ret < 0) {
1479 goto fail;
1483 /* Image format probing */
1484 bs->probed = !drv;
1485 if (!drv && file) {
1486 ret = find_image_format(file, filename, &drv, &local_err);
1487 if (ret < 0) {
1488 goto fail;
1490 } else if (!drv) {
1491 error_setg(errp, "Must specify either driver or file");
1492 ret = -EINVAL;
1493 goto fail;
1496 /* Open the image */
1497 ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1498 if (ret < 0) {
1499 goto fail;
1502 if (file && (bs->file != file)) {
1503 bdrv_unref(file);
1504 file = NULL;
1507 /* If there is a backing file, use it */
1508 if ((flags & BDRV_O_NO_BACKING) == 0) {
1509 QDict *backing_options;
1511 qdict_extract_subqdict(options, &backing_options, "backing.");
1512 ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1513 if (ret < 0) {
1514 goto close_and_fail;
1518 bdrv_refresh_filename(bs);
1520 /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1521 * temporary snapshot afterwards. */
1522 if (snapshot_flags) {
1523 ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
1524 if (local_err) {
1525 goto close_and_fail;
1529 /* Check if any unknown options were used */
1530 if (options && (qdict_size(options) != 0)) {
1531 const QDictEntry *entry = qdict_first(options);
1532 if (flags & BDRV_O_PROTOCOL) {
1533 error_setg(errp, "Block protocol '%s' doesn't support the option "
1534 "'%s'", drv->format_name, entry->key);
1535 } else {
1536 error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1537 "support the option '%s'", drv->format_name,
1538 bdrv_get_device_name(bs), entry->key);
1541 ret = -EINVAL;
1542 goto close_and_fail;
1545 if (!bdrv_key_required(bs)) {
1546 if (bs->blk) {
1547 blk_dev_change_media_cb(bs->blk, true);
1549 } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1550 && !runstate_check(RUN_STATE_INMIGRATE)
1551 && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1552 error_setg(errp,
1553 "Guest must be stopped for opening of encrypted image");
1554 ret = -EBUSY;
1555 goto close_and_fail;
1558 QDECREF(options);
1559 *pbs = bs;
1560 return 0;
1562 fail:
1563 if (file != NULL) {
1564 bdrv_unref(file);
1566 QDECREF(bs->options);
1567 QDECREF(options);
1568 bs->options = NULL;
1569 if (!*pbs) {
1570 /* If *pbs is NULL, a new BDS has been created in this function and
1571 needs to be freed now. Otherwise, it does not need to be closed,
1572 since it has not really been opened yet. */
1573 bdrv_unref(bs);
1575 if (local_err) {
1576 error_propagate(errp, local_err);
1578 return ret;
1580 close_and_fail:
1581 /* See fail path, but now the BDS has to be always closed */
1582 if (*pbs) {
1583 bdrv_close(bs);
1584 } else {
1585 bdrv_unref(bs);
1587 QDECREF(options);
1588 if (local_err) {
1589 error_propagate(errp, local_err);
1591 return ret;
1594 typedef struct BlockReopenQueueEntry {
1595 bool prepared;
1596 BDRVReopenState state;
1597 QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1598 } BlockReopenQueueEntry;
1601 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1602 * reopen of multiple devices.
1604 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1605 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1606 * be created and initialized. This newly created BlockReopenQueue should be
1607 * passed back in for subsequent calls that are intended to be of the same
1608 * atomic 'set'.
1610 * bs is the BlockDriverState to add to the reopen queue.
1612 * flags contains the open flags for the associated bs
1614 * returns a pointer to bs_queue, which is either the newly allocated
1615 * bs_queue, or the existing bs_queue being used.
1618 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1619 BlockDriverState *bs, int flags)
1621 assert(bs != NULL);
1623 BlockReopenQueueEntry *bs_entry;
1624 if (bs_queue == NULL) {
1625 bs_queue = g_new0(BlockReopenQueue, 1);
1626 QSIMPLEQ_INIT(bs_queue);
1629 /* bdrv_open() masks this flag out */
1630 flags &= ~BDRV_O_PROTOCOL;
1632 if (bs->file) {
1633 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
1636 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1637 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1639 bs_entry->state.bs = bs;
1640 bs_entry->state.flags = flags;
1642 return bs_queue;
1646 * Reopen multiple BlockDriverStates atomically & transactionally.
1648 * The queue passed in (bs_queue) must have been built up previous
1649 * via bdrv_reopen_queue().
1651 * Reopens all BDS specified in the queue, with the appropriate
1652 * flags. All devices are prepared for reopen, and failure of any
1653 * device will cause all device changes to be abandonded, and intermediate
1654 * data cleaned up.
1656 * If all devices prepare successfully, then the changes are committed
1657 * to all devices.
1660 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1662 int ret = -1;
1663 BlockReopenQueueEntry *bs_entry, *next;
1664 Error *local_err = NULL;
1666 assert(bs_queue != NULL);
1668 bdrv_drain_all();
1670 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1671 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1672 error_propagate(errp, local_err);
1673 goto cleanup;
1675 bs_entry->prepared = true;
1678 /* If we reach this point, we have success and just need to apply the
1679 * changes
1681 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1682 bdrv_reopen_commit(&bs_entry->state);
1685 ret = 0;
1687 cleanup:
1688 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1689 if (ret && bs_entry->prepared) {
1690 bdrv_reopen_abort(&bs_entry->state);
1692 g_free(bs_entry);
1694 g_free(bs_queue);
1695 return ret;
1699 /* Reopen a single BlockDriverState with the specified flags. */
1700 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1702 int ret = -1;
1703 Error *local_err = NULL;
1704 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1706 ret = bdrv_reopen_multiple(queue, &local_err);
1707 if (local_err != NULL) {
1708 error_propagate(errp, local_err);
1710 return ret;
1715 * Prepares a BlockDriverState for reopen. All changes are staged in the
1716 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1717 * the block driver layer .bdrv_reopen_prepare()
1719 * bs is the BlockDriverState to reopen
1720 * flags are the new open flags
1721 * queue is the reopen queue
1723 * Returns 0 on success, non-zero on error. On error errp will be set
1724 * as well.
1726 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1727 * It is the responsibility of the caller to then call the abort() or
1728 * commit() for any other BDS that have been left in a prepare() state
1731 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1732 Error **errp)
1734 int ret = -1;
1735 Error *local_err = NULL;
1736 BlockDriver *drv;
1738 assert(reopen_state != NULL);
1739 assert(reopen_state->bs->drv != NULL);
1740 drv = reopen_state->bs->drv;
1742 /* if we are to stay read-only, do not allow permission change
1743 * to r/w */
1744 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1745 reopen_state->flags & BDRV_O_RDWR) {
1746 error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1747 bdrv_get_device_name(reopen_state->bs));
1748 goto error;
1752 ret = bdrv_flush(reopen_state->bs);
1753 if (ret) {
1754 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1755 strerror(-ret));
1756 goto error;
1759 if (drv->bdrv_reopen_prepare) {
1760 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1761 if (ret) {
1762 if (local_err != NULL) {
1763 error_propagate(errp, local_err);
1764 } else {
1765 error_setg(errp, "failed while preparing to reopen image '%s'",
1766 reopen_state->bs->filename);
1768 goto error;
1770 } else {
1771 /* It is currently mandatory to have a bdrv_reopen_prepare()
1772 * handler for each supported drv. */
1773 error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1774 drv->format_name, bdrv_get_device_name(reopen_state->bs),
1775 "reopening of file");
1776 ret = -1;
1777 goto error;
1780 ret = 0;
1782 error:
1783 return ret;
1787 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1788 * makes them final by swapping the staging BlockDriverState contents into
1789 * the active BlockDriverState contents.
1791 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1793 BlockDriver *drv;
1795 assert(reopen_state != NULL);
1796 drv = reopen_state->bs->drv;
1797 assert(drv != NULL);
1799 /* If there are any driver level actions to take */
1800 if (drv->bdrv_reopen_commit) {
1801 drv->bdrv_reopen_commit(reopen_state);
1804 /* set BDS specific flags now */
1805 reopen_state->bs->open_flags = reopen_state->flags;
1806 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1807 BDRV_O_CACHE_WB);
1808 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1810 bdrv_refresh_limits(reopen_state->bs, NULL);
1814 * Abort the reopen, and delete and free the staged changes in
1815 * reopen_state
1817 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1819 BlockDriver *drv;
1821 assert(reopen_state != NULL);
1822 drv = reopen_state->bs->drv;
1823 assert(drv != NULL);
1825 if (drv->bdrv_reopen_abort) {
1826 drv->bdrv_reopen_abort(reopen_state);
1831 void bdrv_close(BlockDriverState *bs)
1833 BdrvAioNotifier *ban, *ban_next;
1835 if (bs->job) {
1836 block_job_cancel_sync(bs->job);
1838 bdrv_drain_all(); /* complete I/O */
1839 bdrv_flush(bs);
1840 bdrv_drain_all(); /* in case flush left pending I/O */
1841 notifier_list_notify(&bs->close_notifiers, bs);
1843 if (bs->drv) {
1844 if (bs->backing_hd) {
1845 BlockDriverState *backing_hd = bs->backing_hd;
1846 bdrv_set_backing_hd(bs, NULL);
1847 bdrv_unref(backing_hd);
1849 bs->drv->bdrv_close(bs);
1850 g_free(bs->opaque);
1851 bs->opaque = NULL;
1852 bs->drv = NULL;
1853 bs->copy_on_read = 0;
1854 bs->backing_file[0] = '\0';
1855 bs->backing_format[0] = '\0';
1856 bs->total_sectors = 0;
1857 bs->encrypted = 0;
1858 bs->valid_key = 0;
1859 bs->sg = 0;
1860 bs->growable = 0;
1861 bs->zero_beyond_eof = false;
1862 QDECREF(bs->options);
1863 bs->options = NULL;
1864 QDECREF(bs->full_open_options);
1865 bs->full_open_options = NULL;
1867 if (bs->file != NULL) {
1868 bdrv_unref(bs->file);
1869 bs->file = NULL;
1873 if (bs->blk) {
1874 blk_dev_change_media_cb(bs->blk, false);
1877 /*throttling disk I/O limits*/
1878 if (bs->io_limits_enabled) {
1879 bdrv_io_limits_disable(bs);
1882 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1883 g_free(ban);
1885 QLIST_INIT(&bs->aio_notifiers);
1888 void bdrv_close_all(void)
1890 BlockDriverState *bs;
1892 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1893 AioContext *aio_context = bdrv_get_aio_context(bs);
1895 aio_context_acquire(aio_context);
1896 bdrv_close(bs);
1897 aio_context_release(aio_context);
1901 /* Check if any requests are in-flight (including throttled requests) */
1902 static bool bdrv_requests_pending(BlockDriverState *bs)
1904 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1905 return true;
1907 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1908 return true;
1910 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1911 return true;
1913 if (bs->file && bdrv_requests_pending(bs->file)) {
1914 return true;
1916 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1917 return true;
1919 return false;
1922 static bool bdrv_drain_one(BlockDriverState *bs)
1924 bool bs_busy;
1926 bdrv_flush_io_queue(bs);
1927 bdrv_start_throttled_reqs(bs);
1928 bs_busy = bdrv_requests_pending(bs);
1929 bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
1930 return bs_busy;
1934 * Wait for pending requests to complete on a single BlockDriverState subtree
1936 * See the warning in bdrv_drain_all(). This function can only be called if
1937 * you are sure nothing can generate I/O because you have op blockers
1938 * installed.
1940 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
1941 * AioContext.
1943 void bdrv_drain(BlockDriverState *bs)
1945 while (bdrv_drain_one(bs)) {
1946 /* Keep iterating */
1951 * Wait for pending requests to complete across all BlockDriverStates
1953 * This function does not flush data to disk, use bdrv_flush_all() for that
1954 * after calling this function.
1956 * Note that completion of an asynchronous I/O operation can trigger any
1957 * number of other I/O operations on other devices---for example a coroutine
1958 * can be arbitrarily complex and a constant flow of I/O can come until the
1959 * coroutine is complete. Because of this, it is not possible to have a
1960 * function to drain a single device's I/O queue.
1962 void bdrv_drain_all(void)
1964 /* Always run first iteration so any pending completion BHs run */
1965 bool busy = true;
1966 BlockDriverState *bs;
1968 while (busy) {
1969 busy = false;
1971 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1972 AioContext *aio_context = bdrv_get_aio_context(bs);
1974 aio_context_acquire(aio_context);
1975 busy |= bdrv_drain_one(bs);
1976 aio_context_release(aio_context);
1981 /* make a BlockDriverState anonymous by removing from bdrv_state and
1982 * graph_bdrv_state list.
1983 Also, NULL terminate the device_name to prevent double remove */
1984 void bdrv_make_anon(BlockDriverState *bs)
1987 * Take care to remove bs from bdrv_states only when it's actually
1988 * in it. Note that bs->device_list.tqe_prev is initially null,
1989 * and gets set to non-null by QTAILQ_INSERT_TAIL(). Establish
1990 * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
1991 * resetting it to null on remove.
1993 if (bs->device_list.tqe_prev) {
1994 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1995 bs->device_list.tqe_prev = NULL;
1997 if (bs->node_name[0] != '\0') {
1998 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
2000 bs->node_name[0] = '\0';
2003 static void bdrv_rebind(BlockDriverState *bs)
2005 if (bs->drv && bs->drv->bdrv_rebind) {
2006 bs->drv->bdrv_rebind(bs);
2010 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
2011 BlockDriverState *bs_src)
2013 /* move some fields that need to stay attached to the device */
2015 /* dev info */
2016 bs_dest->guest_block_size = bs_src->guest_block_size;
2017 bs_dest->copy_on_read = bs_src->copy_on_read;
2019 bs_dest->enable_write_cache = bs_src->enable_write_cache;
2021 /* i/o throttled req */
2022 memcpy(&bs_dest->throttle_state,
2023 &bs_src->throttle_state,
2024 sizeof(ThrottleState));
2025 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
2026 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
2027 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
2029 /* r/w error */
2030 bs_dest->on_read_error = bs_src->on_read_error;
2031 bs_dest->on_write_error = bs_src->on_write_error;
2033 /* i/o status */
2034 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
2035 bs_dest->iostatus = bs_src->iostatus;
2037 /* dirty bitmap */
2038 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
2040 /* reference count */
2041 bs_dest->refcnt = bs_src->refcnt;
2043 /* job */
2044 bs_dest->job = bs_src->job;
2046 /* keep the same entry in bdrv_states */
2047 bs_dest->device_list = bs_src->device_list;
2048 bs_dest->blk = bs_src->blk;
2050 memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2051 sizeof(bs_dest->op_blockers));
2055 * Swap bs contents for two image chains while they are live,
2056 * while keeping required fields on the BlockDriverState that is
2057 * actually attached to a device.
2059 * This will modify the BlockDriverState fields, and swap contents
2060 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2062 * bs_new must not be attached to a BlockBackend.
2064 * This function does not create any image files.
2066 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2068 BlockDriverState tmp;
2070 /* The code needs to swap the node_name but simply swapping node_list won't
2071 * work so first remove the nodes from the graph list, do the swap then
2072 * insert them back if needed.
2074 if (bs_new->node_name[0] != '\0') {
2075 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2077 if (bs_old->node_name[0] != '\0') {
2078 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2081 /* bs_new must be unattached and shouldn't have anything fancy enabled */
2082 assert(!bs_new->blk);
2083 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2084 assert(bs_new->job == NULL);
2085 assert(bs_new->io_limits_enabled == false);
2086 assert(!throttle_have_timer(&bs_new->throttle_state));
2088 tmp = *bs_new;
2089 *bs_new = *bs_old;
2090 *bs_old = tmp;
2092 /* there are some fields that should not be swapped, move them back */
2093 bdrv_move_feature_fields(&tmp, bs_old);
2094 bdrv_move_feature_fields(bs_old, bs_new);
2095 bdrv_move_feature_fields(bs_new, &tmp);
2097 /* bs_new must remain unattached */
2098 assert(!bs_new->blk);
2100 /* Check a few fields that should remain attached to the device */
2101 assert(bs_new->job == NULL);
2102 assert(bs_new->io_limits_enabled == false);
2103 assert(!throttle_have_timer(&bs_new->throttle_state));
2105 /* insert the nodes back into the graph node list if needed */
2106 if (bs_new->node_name[0] != '\0') {
2107 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2109 if (bs_old->node_name[0] != '\0') {
2110 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2113 bdrv_rebind(bs_new);
2114 bdrv_rebind(bs_old);
2118 * Add new bs contents at the top of an image chain while the chain is
2119 * live, while keeping required fields on the top layer.
2121 * This will modify the BlockDriverState fields, and swap contents
2122 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2124 * bs_new must not be attached to a BlockBackend.
2126 * This function does not create any image files.
2128 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2130 bdrv_swap(bs_new, bs_top);
2132 /* The contents of 'tmp' will become bs_top, as we are
2133 * swapping bs_new and bs_top contents. */
2134 bdrv_set_backing_hd(bs_top, bs_new);
2137 static void bdrv_delete(BlockDriverState *bs)
2139 assert(!bs->job);
2140 assert(bdrv_op_blocker_is_empty(bs));
2141 assert(!bs->refcnt);
2142 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2144 bdrv_close(bs);
2146 /* remove from list, if necessary */
2147 bdrv_make_anon(bs);
2149 g_free(bs);
2153 * Run consistency checks on an image
2155 * Returns 0 if the check could be completed (it doesn't mean that the image is
2156 * free of errors) or -errno when an internal error occurred. The results of the
2157 * check are stored in res.
2159 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2161 if (bs->drv == NULL) {
2162 return -ENOMEDIUM;
2164 if (bs->drv->bdrv_check == NULL) {
2165 return -ENOTSUP;
2168 memset(res, 0, sizeof(*res));
2169 return bs->drv->bdrv_check(bs, res, fix);
2172 #define COMMIT_BUF_SECTORS 2048
2174 /* commit COW file into the raw image */
2175 int bdrv_commit(BlockDriverState *bs)
2177 BlockDriver *drv = bs->drv;
2178 int64_t sector, total_sectors, length, backing_length;
2179 int n, ro, open_flags;
2180 int ret = 0;
2181 uint8_t *buf = NULL;
2182 char filename[PATH_MAX];
2184 if (!drv)
2185 return -ENOMEDIUM;
2187 if (!bs->backing_hd) {
2188 return -ENOTSUP;
2191 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2192 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2193 return -EBUSY;
2196 ro = bs->backing_hd->read_only;
2197 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2198 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2199 open_flags = bs->backing_hd->open_flags;
2201 if (ro) {
2202 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2203 return -EACCES;
2207 length = bdrv_getlength(bs);
2208 if (length < 0) {
2209 ret = length;
2210 goto ro_cleanup;
2213 backing_length = bdrv_getlength(bs->backing_hd);
2214 if (backing_length < 0) {
2215 ret = backing_length;
2216 goto ro_cleanup;
2219 /* If our top snapshot is larger than the backing file image,
2220 * grow the backing file image if possible. If not possible,
2221 * we must return an error */
2222 if (length > backing_length) {
2223 ret = bdrv_truncate(bs->backing_hd, length);
2224 if (ret < 0) {
2225 goto ro_cleanup;
2229 total_sectors = length >> BDRV_SECTOR_BITS;
2231 /* qemu_try_blockalign() for bs will choose an alignment that works for
2232 * bs->backing_hd as well, so no need to compare the alignment manually. */
2233 buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2234 if (buf == NULL) {
2235 ret = -ENOMEM;
2236 goto ro_cleanup;
2239 for (sector = 0; sector < total_sectors; sector += n) {
2240 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2241 if (ret < 0) {
2242 goto ro_cleanup;
2244 if (ret) {
2245 ret = bdrv_read(bs, sector, buf, n);
2246 if (ret < 0) {
2247 goto ro_cleanup;
2250 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2251 if (ret < 0) {
2252 goto ro_cleanup;
2257 if (drv->bdrv_make_empty) {
2258 ret = drv->bdrv_make_empty(bs);
2259 if (ret < 0) {
2260 goto ro_cleanup;
2262 bdrv_flush(bs);
2266 * Make sure all data we wrote to the backing device is actually
2267 * stable on disk.
2269 if (bs->backing_hd) {
2270 bdrv_flush(bs->backing_hd);
2273 ret = 0;
2274 ro_cleanup:
2275 qemu_vfree(buf);
2277 if (ro) {
2278 /* ignoring error return here */
2279 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2282 return ret;
2285 int bdrv_commit_all(void)
2287 BlockDriverState *bs;
2289 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2290 AioContext *aio_context = bdrv_get_aio_context(bs);
2292 aio_context_acquire(aio_context);
2293 if (bs->drv && bs->backing_hd) {
2294 int ret = bdrv_commit(bs);
2295 if (ret < 0) {
2296 aio_context_release(aio_context);
2297 return ret;
2300 aio_context_release(aio_context);
2302 return 0;
2306 * Remove an active request from the tracked requests list
2308 * This function should be called when a tracked request is completing.
2310 static void tracked_request_end(BdrvTrackedRequest *req)
2312 if (req->serialising) {
2313 req->bs->serialising_in_flight--;
2316 QLIST_REMOVE(req, list);
2317 qemu_co_queue_restart_all(&req->wait_queue);
2321 * Add an active request to the tracked requests list
2323 static void tracked_request_begin(BdrvTrackedRequest *req,
2324 BlockDriverState *bs,
2325 int64_t offset,
2326 unsigned int bytes, bool is_write)
2328 *req = (BdrvTrackedRequest){
2329 .bs = bs,
2330 .offset = offset,
2331 .bytes = bytes,
2332 .is_write = is_write,
2333 .co = qemu_coroutine_self(),
2334 .serialising = false,
2335 .overlap_offset = offset,
2336 .overlap_bytes = bytes,
2339 qemu_co_queue_init(&req->wait_queue);
2341 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2344 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2346 int64_t overlap_offset = req->offset & ~(align - 1);
2347 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2348 - overlap_offset;
2350 if (!req->serialising) {
2351 req->bs->serialising_in_flight++;
2352 req->serialising = true;
2355 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2356 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2360 * Round a region to cluster boundaries
2362 void bdrv_round_to_clusters(BlockDriverState *bs,
2363 int64_t sector_num, int nb_sectors,
2364 int64_t *cluster_sector_num,
2365 int *cluster_nb_sectors)
2367 BlockDriverInfo bdi;
2369 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2370 *cluster_sector_num = sector_num;
2371 *cluster_nb_sectors = nb_sectors;
2372 } else {
2373 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2374 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2375 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2376 nb_sectors, c);
2380 static int bdrv_get_cluster_size(BlockDriverState *bs)
2382 BlockDriverInfo bdi;
2383 int ret;
2385 ret = bdrv_get_info(bs, &bdi);
2386 if (ret < 0 || bdi.cluster_size == 0) {
2387 return bs->request_alignment;
2388 } else {
2389 return bdi.cluster_size;
2393 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2394 int64_t offset, unsigned int bytes)
2396 /* aaaa bbbb */
2397 if (offset >= req->overlap_offset + req->overlap_bytes) {
2398 return false;
2400 /* bbbb aaaa */
2401 if (req->overlap_offset >= offset + bytes) {
2402 return false;
2404 return true;
2407 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2409 BlockDriverState *bs = self->bs;
2410 BdrvTrackedRequest *req;
2411 bool retry;
2412 bool waited = false;
2414 if (!bs->serialising_in_flight) {
2415 return false;
2418 do {
2419 retry = false;
2420 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2421 if (req == self || (!req->serialising && !self->serialising)) {
2422 continue;
2424 if (tracked_request_overlaps(req, self->overlap_offset,
2425 self->overlap_bytes))
2427 /* Hitting this means there was a reentrant request, for
2428 * example, a block driver issuing nested requests. This must
2429 * never happen since it means deadlock.
2431 assert(qemu_coroutine_self() != req->co);
2433 /* If the request is already (indirectly) waiting for us, or
2434 * will wait for us as soon as it wakes up, then just go on
2435 * (instead of producing a deadlock in the former case). */
2436 if (!req->waiting_for) {
2437 self->waiting_for = req;
2438 qemu_co_queue_wait(&req->wait_queue);
2439 self->waiting_for = NULL;
2440 retry = true;
2441 waited = true;
2442 break;
2446 } while (retry);
2448 return waited;
2452 * Return values:
2453 * 0 - success
2454 * -EINVAL - backing format specified, but no file
2455 * -ENOSPC - can't update the backing file because no space is left in the
2456 * image file header
2457 * -ENOTSUP - format driver doesn't support changing the backing file
2459 int bdrv_change_backing_file(BlockDriverState *bs,
2460 const char *backing_file, const char *backing_fmt)
2462 BlockDriver *drv = bs->drv;
2463 int ret;
2465 /* Backing file format doesn't make sense without a backing file */
2466 if (backing_fmt && !backing_file) {
2467 return -EINVAL;
2470 if (drv->bdrv_change_backing_file != NULL) {
2471 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2472 } else {
2473 ret = -ENOTSUP;
2476 if (ret == 0) {
2477 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2478 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2480 return ret;
2484 * Finds the image layer in the chain that has 'bs' as its backing file.
2486 * active is the current topmost image.
2488 * Returns NULL if bs is not found in active's image chain,
2489 * or if active == bs.
2491 * Returns the bottommost base image if bs == NULL.
2493 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2494 BlockDriverState *bs)
2496 while (active && bs != active->backing_hd) {
2497 active = active->backing_hd;
2500 return active;
2503 /* Given a BDS, searches for the base layer. */
2504 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2506 return bdrv_find_overlay(bs, NULL);
2509 typedef struct BlkIntermediateStates {
2510 BlockDriverState *bs;
2511 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2512 } BlkIntermediateStates;
2516 * Drops images above 'base' up to and including 'top', and sets the image
2517 * above 'top' to have base as its backing file.
2519 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2520 * information in 'bs' can be properly updated.
2522 * E.g., this will convert the following chain:
2523 * bottom <- base <- intermediate <- top <- active
2525 * to
2527 * bottom <- base <- active
2529 * It is allowed for bottom==base, in which case it converts:
2531 * base <- intermediate <- top <- active
2533 * to
2535 * base <- active
2537 * If backing_file_str is non-NULL, it will be used when modifying top's
2538 * overlay image metadata.
2540 * Error conditions:
2541 * if active == top, that is considered an error
2544 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2545 BlockDriverState *base, const char *backing_file_str)
2547 BlockDriverState *intermediate;
2548 BlockDriverState *base_bs = NULL;
2549 BlockDriverState *new_top_bs = NULL;
2550 BlkIntermediateStates *intermediate_state, *next;
2551 int ret = -EIO;
2553 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2554 QSIMPLEQ_INIT(&states_to_delete);
2556 if (!top->drv || !base->drv) {
2557 goto exit;
2560 new_top_bs = bdrv_find_overlay(active, top);
2562 if (new_top_bs == NULL) {
2563 /* we could not find the image above 'top', this is an error */
2564 goto exit;
2567 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2568 * to do, no intermediate images */
2569 if (new_top_bs->backing_hd == base) {
2570 ret = 0;
2571 goto exit;
2574 intermediate = top;
2576 /* now we will go down through the list, and add each BDS we find
2577 * into our deletion queue, until we hit the 'base'
2579 while (intermediate) {
2580 intermediate_state = g_new0(BlkIntermediateStates, 1);
2581 intermediate_state->bs = intermediate;
2582 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2584 if (intermediate->backing_hd == base) {
2585 base_bs = intermediate->backing_hd;
2586 break;
2588 intermediate = intermediate->backing_hd;
2590 if (base_bs == NULL) {
2591 /* something went wrong, we did not end at the base. safely
2592 * unravel everything, and exit with error */
2593 goto exit;
2596 /* success - we can delete the intermediate states, and link top->base */
2597 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2598 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2599 base_bs->drv ? base_bs->drv->format_name : "");
2600 if (ret) {
2601 goto exit;
2603 bdrv_set_backing_hd(new_top_bs, base_bs);
2605 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2606 /* so that bdrv_close() does not recursively close the chain */
2607 bdrv_set_backing_hd(intermediate_state->bs, NULL);
2608 bdrv_unref(intermediate_state->bs);
2610 ret = 0;
2612 exit:
2613 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2614 g_free(intermediate_state);
2616 return ret;
2620 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2621 size_t size)
2623 int64_t len;
2625 if (size > INT_MAX) {
2626 return -EIO;
2629 if (!bdrv_is_inserted(bs))
2630 return -ENOMEDIUM;
2632 if (bs->growable)
2633 return 0;
2635 len = bdrv_getlength(bs);
2637 if (offset < 0)
2638 return -EIO;
2640 if ((offset > len) || (len - offset < size))
2641 return -EIO;
2643 return 0;
2646 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2647 int nb_sectors)
2649 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2650 return -EIO;
2653 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2654 nb_sectors * BDRV_SECTOR_SIZE);
2657 typedef struct RwCo {
2658 BlockDriverState *bs;
2659 int64_t offset;
2660 QEMUIOVector *qiov;
2661 bool is_write;
2662 int ret;
2663 BdrvRequestFlags flags;
2664 } RwCo;
2666 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2668 RwCo *rwco = opaque;
2670 if (!rwco->is_write) {
2671 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2672 rwco->qiov->size, rwco->qiov,
2673 rwco->flags);
2674 } else {
2675 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2676 rwco->qiov->size, rwco->qiov,
2677 rwco->flags);
2682 * Process a vectored synchronous request using coroutines
2684 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2685 QEMUIOVector *qiov, bool is_write,
2686 BdrvRequestFlags flags)
2688 Coroutine *co;
2689 RwCo rwco = {
2690 .bs = bs,
2691 .offset = offset,
2692 .qiov = qiov,
2693 .is_write = is_write,
2694 .ret = NOT_DONE,
2695 .flags = flags,
2699 * In sync call context, when the vcpu is blocked, this throttling timer
2700 * will not fire; so the I/O throttling function has to be disabled here
2701 * if it has been enabled.
2703 if (bs->io_limits_enabled) {
2704 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2705 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2706 bdrv_io_limits_disable(bs);
2709 if (qemu_in_coroutine()) {
2710 /* Fast-path if already in coroutine context */
2711 bdrv_rw_co_entry(&rwco);
2712 } else {
2713 AioContext *aio_context = bdrv_get_aio_context(bs);
2715 co = qemu_coroutine_create(bdrv_rw_co_entry);
2716 qemu_coroutine_enter(co, &rwco);
2717 while (rwco.ret == NOT_DONE) {
2718 aio_poll(aio_context, true);
2721 return rwco.ret;
2725 * Process a synchronous request using coroutines
2727 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2728 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2730 QEMUIOVector qiov;
2731 struct iovec iov = {
2732 .iov_base = (void *)buf,
2733 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2736 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2737 return -EINVAL;
2740 qemu_iovec_init_external(&qiov, &iov, 1);
2741 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2742 &qiov, is_write, flags);
2745 /* return < 0 if error. See bdrv_write() for the return codes */
2746 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2747 uint8_t *buf, int nb_sectors)
2749 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2752 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2753 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2754 uint8_t *buf, int nb_sectors)
2756 bool enabled;
2757 int ret;
2759 enabled = bs->io_limits_enabled;
2760 bs->io_limits_enabled = false;
2761 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2762 bs->io_limits_enabled = enabled;
2763 return ret;
2766 /* Return < 0 if error. Important errors are:
2767 -EIO generic I/O error (may happen for all errors)
2768 -ENOMEDIUM No media inserted.
2769 -EINVAL Invalid sector number or nb_sectors
2770 -EACCES Trying to write a read-only device
2772 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2773 const uint8_t *buf, int nb_sectors)
2775 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2778 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2779 int nb_sectors, BdrvRequestFlags flags)
2781 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2782 BDRV_REQ_ZERO_WRITE | flags);
2786 * Completely zero out a block device with the help of bdrv_write_zeroes.
2787 * The operation is sped up by checking the block status and only writing
2788 * zeroes to the device if they currently do not return zeroes. Optional
2789 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2791 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2793 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2795 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2796 int n;
2798 target_sectors = bdrv_nb_sectors(bs);
2799 if (target_sectors < 0) {
2800 return target_sectors;
2803 for (;;) {
2804 nb_sectors = target_sectors - sector_num;
2805 if (nb_sectors <= 0) {
2806 return 0;
2808 if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2809 nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
2811 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2812 if (ret < 0) {
2813 error_report("error getting block status at sector %" PRId64 ": %s",
2814 sector_num, strerror(-ret));
2815 return ret;
2817 if (ret & BDRV_BLOCK_ZERO) {
2818 sector_num += n;
2819 continue;
2821 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2822 if (ret < 0) {
2823 error_report("error writing zeroes at sector %" PRId64 ": %s",
2824 sector_num, strerror(-ret));
2825 return ret;
2827 sector_num += n;
2831 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2833 QEMUIOVector qiov;
2834 struct iovec iov = {
2835 .iov_base = (void *)buf,
2836 .iov_len = bytes,
2838 int ret;
2840 if (bytes < 0) {
2841 return -EINVAL;
2844 qemu_iovec_init_external(&qiov, &iov, 1);
2845 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2846 if (ret < 0) {
2847 return ret;
2850 return bytes;
2853 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2855 int ret;
2857 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2858 if (ret < 0) {
2859 return ret;
2862 return qiov->size;
2865 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2866 const void *buf, int bytes)
2868 QEMUIOVector qiov;
2869 struct iovec iov = {
2870 .iov_base = (void *) buf,
2871 .iov_len = bytes,
2874 if (bytes < 0) {
2875 return -EINVAL;
2878 qemu_iovec_init_external(&qiov, &iov, 1);
2879 return bdrv_pwritev(bs, offset, &qiov);
2883 * Writes to the file and ensures that no writes are reordered across this
2884 * request (acts as a barrier)
2886 * Returns 0 on success, -errno in error cases.
2888 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2889 const void *buf, int count)
2891 int ret;
2893 ret = bdrv_pwrite(bs, offset, buf, count);
2894 if (ret < 0) {
2895 return ret;
2898 /* No flush needed for cache modes that already do it */
2899 if (bs->enable_write_cache) {
2900 bdrv_flush(bs);
2903 return 0;
2906 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2907 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2909 /* Perform I/O through a temporary buffer so that users who scribble over
2910 * their read buffer while the operation is in progress do not end up
2911 * modifying the image file. This is critical for zero-copy guest I/O
2912 * where anything might happen inside guest memory.
2914 void *bounce_buffer;
2916 BlockDriver *drv = bs->drv;
2917 struct iovec iov;
2918 QEMUIOVector bounce_qiov;
2919 int64_t cluster_sector_num;
2920 int cluster_nb_sectors;
2921 size_t skip_bytes;
2922 int ret;
2924 /* Cover entire cluster so no additional backing file I/O is required when
2925 * allocating cluster in the image file.
2927 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2928 &cluster_sector_num, &cluster_nb_sectors);
2930 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2931 cluster_sector_num, cluster_nb_sectors);
2933 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2934 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2935 if (bounce_buffer == NULL) {
2936 ret = -ENOMEM;
2937 goto err;
2940 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2942 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2943 &bounce_qiov);
2944 if (ret < 0) {
2945 goto err;
2948 if (drv->bdrv_co_write_zeroes &&
2949 buffer_is_zero(bounce_buffer, iov.iov_len)) {
2950 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2951 cluster_nb_sectors, 0);
2952 } else {
2953 /* This does not change the data on the disk, it is not necessary
2954 * to flush even in cache=writethrough mode.
2956 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2957 &bounce_qiov);
2960 if (ret < 0) {
2961 /* It might be okay to ignore write errors for guest requests. If this
2962 * is a deliberate copy-on-read then we don't want to ignore the error.
2963 * Simply report it in all cases.
2965 goto err;
2968 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2969 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2970 nb_sectors * BDRV_SECTOR_SIZE);
2972 err:
2973 qemu_vfree(bounce_buffer);
2974 return ret;
2978 * Forwards an already correctly aligned request to the BlockDriver. This
2979 * handles copy on read and zeroing after EOF; any other features must be
2980 * implemented by the caller.
2982 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2983 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2984 int64_t align, QEMUIOVector *qiov, int flags)
2986 BlockDriver *drv = bs->drv;
2987 int ret;
2989 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2990 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2992 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2993 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2994 assert(!qiov || bytes == qiov->size);
2996 /* Handle Copy on Read and associated serialisation */
2997 if (flags & BDRV_REQ_COPY_ON_READ) {
2998 /* If we touch the same cluster it counts as an overlap. This
2999 * guarantees that allocating writes will be serialized and not race
3000 * with each other for the same cluster. For example, in copy-on-read
3001 * it ensures that the CoR read and write operations are atomic and
3002 * guest writes cannot interleave between them. */
3003 mark_request_serialising(req, bdrv_get_cluster_size(bs));
3006 wait_serialising_requests(req);
3008 if (flags & BDRV_REQ_COPY_ON_READ) {
3009 int pnum;
3011 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3012 if (ret < 0) {
3013 goto out;
3016 if (!ret || pnum != nb_sectors) {
3017 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3018 goto out;
3022 /* Forward the request to the BlockDriver */
3023 if (!(bs->zero_beyond_eof && bs->growable)) {
3024 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3025 } else {
3026 /* Read zeros after EOF of growable BDSes */
3027 int64_t total_sectors, max_nb_sectors;
3029 total_sectors = bdrv_nb_sectors(bs);
3030 if (total_sectors < 0) {
3031 ret = total_sectors;
3032 goto out;
3035 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3036 align >> BDRV_SECTOR_BITS);
3037 if (max_nb_sectors > 0) {
3038 QEMUIOVector local_qiov;
3039 size_t local_sectors;
3041 max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_BITS);
3042 local_sectors = MIN(max_nb_sectors, nb_sectors);
3044 qemu_iovec_init(&local_qiov, qiov->niov);
3045 qemu_iovec_concat(&local_qiov, qiov, 0,
3046 local_sectors * BDRV_SECTOR_SIZE);
3048 ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3049 &local_qiov);
3051 qemu_iovec_destroy(&local_qiov);
3052 } else {
3053 ret = 0;
3056 /* Reading beyond end of file is supposed to produce zeroes */
3057 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3058 uint64_t offset = MAX(0, total_sectors - sector_num);
3059 uint64_t bytes = (sector_num + nb_sectors - offset) *
3060 BDRV_SECTOR_SIZE;
3061 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3065 out:
3066 return ret;
3070 * Handle a read request in coroutine context
3072 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3073 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3074 BdrvRequestFlags flags)
3076 BlockDriver *drv = bs->drv;
3077 BdrvTrackedRequest req;
3079 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3080 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3081 uint8_t *head_buf = NULL;
3082 uint8_t *tail_buf = NULL;
3083 QEMUIOVector local_qiov;
3084 bool use_local_qiov = false;
3085 int ret;
3087 if (!drv) {
3088 return -ENOMEDIUM;
3090 if (bdrv_check_byte_request(bs, offset, bytes)) {
3091 return -EIO;
3094 if (bs->copy_on_read) {
3095 flags |= BDRV_REQ_COPY_ON_READ;
3098 /* throttling disk I/O */
3099 if (bs->io_limits_enabled) {
3100 bdrv_io_limits_intercept(bs, bytes, false);
3103 /* Align read if necessary by padding qiov */
3104 if (offset & (align - 1)) {
3105 head_buf = qemu_blockalign(bs, align);
3106 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3107 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3108 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3109 use_local_qiov = true;
3111 bytes += offset & (align - 1);
3112 offset = offset & ~(align - 1);
3115 if ((offset + bytes) & (align - 1)) {
3116 if (!use_local_qiov) {
3117 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3118 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3119 use_local_qiov = true;
3121 tail_buf = qemu_blockalign(bs, align);
3122 qemu_iovec_add(&local_qiov, tail_buf,
3123 align - ((offset + bytes) & (align - 1)));
3125 bytes = ROUND_UP(bytes, align);
3128 tracked_request_begin(&req, bs, offset, bytes, false);
3129 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3130 use_local_qiov ? &local_qiov : qiov,
3131 flags);
3132 tracked_request_end(&req);
3134 if (use_local_qiov) {
3135 qemu_iovec_destroy(&local_qiov);
3136 qemu_vfree(head_buf);
3137 qemu_vfree(tail_buf);
3140 return ret;
3143 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3144 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3145 BdrvRequestFlags flags)
3147 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3148 return -EINVAL;
3151 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3152 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3155 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3156 int nb_sectors, QEMUIOVector *qiov)
3158 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3160 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3163 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3164 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3166 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3168 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3169 BDRV_REQ_COPY_ON_READ);
3172 /* if no limit is specified in the BlockLimits use a default
3173 * of 32768 512-byte sectors (16 MiB) per request.
3175 #define MAX_WRITE_ZEROES_DEFAULT 32768
3177 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3178 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3180 BlockDriver *drv = bs->drv;
3181 QEMUIOVector qiov;
3182 struct iovec iov = {0};
3183 int ret = 0;
3185 int max_write_zeroes = bs->bl.max_write_zeroes ?
3186 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3188 while (nb_sectors > 0 && !ret) {
3189 int num = nb_sectors;
3191 /* Align request. Block drivers can expect the "bulk" of the request
3192 * to be aligned.
3194 if (bs->bl.write_zeroes_alignment
3195 && num > bs->bl.write_zeroes_alignment) {
3196 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3197 /* Make a small request up to the first aligned sector. */
3198 num = bs->bl.write_zeroes_alignment;
3199 num -= sector_num % bs->bl.write_zeroes_alignment;
3200 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3201 /* Shorten the request to the last aligned sector. num cannot
3202 * underflow because num > bs->bl.write_zeroes_alignment.
3204 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3208 /* limit request size */
3209 if (num > max_write_zeroes) {
3210 num = max_write_zeroes;
3213 ret = -ENOTSUP;
3214 /* First try the efficient write zeroes operation */
3215 if (drv->bdrv_co_write_zeroes) {
3216 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3219 if (ret == -ENOTSUP) {
3220 /* Fall back to bounce buffer if write zeroes is unsupported */
3221 iov.iov_len = num * BDRV_SECTOR_SIZE;
3222 if (iov.iov_base == NULL) {
3223 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3224 if (iov.iov_base == NULL) {
3225 ret = -ENOMEM;
3226 goto fail;
3228 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3230 qemu_iovec_init_external(&qiov, &iov, 1);
3232 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3234 /* Keep bounce buffer around if it is big enough for all
3235 * all future requests.
3237 if (num < max_write_zeroes) {
3238 qemu_vfree(iov.iov_base);
3239 iov.iov_base = NULL;
3243 sector_num += num;
3244 nb_sectors -= num;
3247 fail:
3248 qemu_vfree(iov.iov_base);
3249 return ret;
3253 * Forwards an already correctly aligned write request to the BlockDriver.
3255 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3256 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3257 QEMUIOVector *qiov, int flags)
3259 BlockDriver *drv = bs->drv;
3260 bool waited;
3261 int ret;
3263 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3264 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3266 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3267 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3268 assert(!qiov || bytes == qiov->size);
3270 waited = wait_serialising_requests(req);
3271 assert(!waited || !req->serialising);
3272 assert(req->overlap_offset <= offset);
3273 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3275 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3277 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3278 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3279 qemu_iovec_is_zero(qiov)) {
3280 flags |= BDRV_REQ_ZERO_WRITE;
3281 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3282 flags |= BDRV_REQ_MAY_UNMAP;
3286 if (ret < 0) {
3287 /* Do nothing, write notifier decided to fail this request */
3288 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3289 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3290 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3291 } else {
3292 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3293 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3295 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3297 if (ret == 0 && !bs->enable_write_cache) {
3298 ret = bdrv_co_flush(bs);
3301 bdrv_set_dirty(bs, sector_num, nb_sectors);
3303 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3305 if (bs->growable && ret >= 0) {
3306 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3309 return ret;
3313 * Handle a write request in coroutine context
3315 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3316 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3317 BdrvRequestFlags flags)
3319 BdrvTrackedRequest req;
3320 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3321 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3322 uint8_t *head_buf = NULL;
3323 uint8_t *tail_buf = NULL;
3324 QEMUIOVector local_qiov;
3325 bool use_local_qiov = false;
3326 int ret;
3328 if (!bs->drv) {
3329 return -ENOMEDIUM;
3331 if (bs->read_only) {
3332 return -EACCES;
3334 if (bdrv_check_byte_request(bs, offset, bytes)) {
3335 return -EIO;
3338 /* throttling disk I/O */
3339 if (bs->io_limits_enabled) {
3340 bdrv_io_limits_intercept(bs, bytes, true);
3344 * Align write if necessary by performing a read-modify-write cycle.
3345 * Pad qiov with the read parts and be sure to have a tracked request not
3346 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3348 tracked_request_begin(&req, bs, offset, bytes, true);
3350 if (offset & (align - 1)) {
3351 QEMUIOVector head_qiov;
3352 struct iovec head_iov;
3354 mark_request_serialising(&req, align);
3355 wait_serialising_requests(&req);
3357 head_buf = qemu_blockalign(bs, align);
3358 head_iov = (struct iovec) {
3359 .iov_base = head_buf,
3360 .iov_len = align,
3362 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3364 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3365 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3366 align, &head_qiov, 0);
3367 if (ret < 0) {
3368 goto fail;
3370 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3372 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3373 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3374 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3375 use_local_qiov = true;
3377 bytes += offset & (align - 1);
3378 offset = offset & ~(align - 1);
3381 if ((offset + bytes) & (align - 1)) {
3382 QEMUIOVector tail_qiov;
3383 struct iovec tail_iov;
3384 size_t tail_bytes;
3385 bool waited;
3387 mark_request_serialising(&req, align);
3388 waited = wait_serialising_requests(&req);
3389 assert(!waited || !use_local_qiov);
3391 tail_buf = qemu_blockalign(bs, align);
3392 tail_iov = (struct iovec) {
3393 .iov_base = tail_buf,
3394 .iov_len = align,
3396 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3398 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3399 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3400 align, &tail_qiov, 0);
3401 if (ret < 0) {
3402 goto fail;
3404 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3406 if (!use_local_qiov) {
3407 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3408 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3409 use_local_qiov = true;
3412 tail_bytes = (offset + bytes) & (align - 1);
3413 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3415 bytes = ROUND_UP(bytes, align);
3418 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3419 use_local_qiov ? &local_qiov : qiov,
3420 flags);
3422 fail:
3423 tracked_request_end(&req);
3425 if (use_local_qiov) {
3426 qemu_iovec_destroy(&local_qiov);
3428 qemu_vfree(head_buf);
3429 qemu_vfree(tail_buf);
3431 return ret;
3434 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3435 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3436 BdrvRequestFlags flags)
3438 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3439 return -EINVAL;
3442 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3443 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3446 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3447 int nb_sectors, QEMUIOVector *qiov)
3449 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3451 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3454 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3455 int64_t sector_num, int nb_sectors,
3456 BdrvRequestFlags flags)
3458 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3460 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3461 flags &= ~BDRV_REQ_MAY_UNMAP;
3464 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3465 BDRV_REQ_ZERO_WRITE | flags);
3469 * Truncate file to 'offset' bytes (needed only for file protocols)
3471 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3473 BlockDriver *drv = bs->drv;
3474 int ret;
3475 if (!drv)
3476 return -ENOMEDIUM;
3477 if (!drv->bdrv_truncate)
3478 return -ENOTSUP;
3479 if (bs->read_only)
3480 return -EACCES;
3482 ret = drv->bdrv_truncate(bs, offset);
3483 if (ret == 0) {
3484 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3485 if (bs->blk) {
3486 blk_dev_resize_cb(bs->blk);
3489 return ret;
3493 * Length of a allocated file in bytes. Sparse files are counted by actual
3494 * allocated space. Return < 0 if error or unknown.
3496 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3498 BlockDriver *drv = bs->drv;
3499 if (!drv) {
3500 return -ENOMEDIUM;
3502 if (drv->bdrv_get_allocated_file_size) {
3503 return drv->bdrv_get_allocated_file_size(bs);
3505 if (bs->file) {
3506 return bdrv_get_allocated_file_size(bs->file);
3508 return -ENOTSUP;
3512 * Return number of sectors on success, -errno on error.
3514 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3516 BlockDriver *drv = bs->drv;
3518 if (!drv)
3519 return -ENOMEDIUM;
3521 if (drv->has_variable_length) {
3522 int ret = refresh_total_sectors(bs, bs->total_sectors);
3523 if (ret < 0) {
3524 return ret;
3527 return bs->total_sectors;
3531 * Return length in bytes on success, -errno on error.
3532 * The length is always a multiple of BDRV_SECTOR_SIZE.
3534 int64_t bdrv_getlength(BlockDriverState *bs)
3536 int64_t ret = bdrv_nb_sectors(bs);
3538 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3541 /* return 0 as number of sectors if no device present or error */
3542 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3544 int64_t nb_sectors = bdrv_nb_sectors(bs);
3546 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3549 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3550 BlockdevOnError on_write_error)
3552 bs->on_read_error = on_read_error;
3553 bs->on_write_error = on_write_error;
3556 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3558 return is_read ? bs->on_read_error : bs->on_write_error;
3561 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3563 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3565 switch (on_err) {
3566 case BLOCKDEV_ON_ERROR_ENOSPC:
3567 return (error == ENOSPC) ?
3568 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3569 case BLOCKDEV_ON_ERROR_STOP:
3570 return BLOCK_ERROR_ACTION_STOP;
3571 case BLOCKDEV_ON_ERROR_REPORT:
3572 return BLOCK_ERROR_ACTION_REPORT;
3573 case BLOCKDEV_ON_ERROR_IGNORE:
3574 return BLOCK_ERROR_ACTION_IGNORE;
3575 default:
3576 abort();
3580 static void send_qmp_error_event(BlockDriverState *bs,
3581 BlockErrorAction action,
3582 bool is_read, int error)
3584 IoOperationType optype;
3586 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3587 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3588 bdrv_iostatus_is_enabled(bs),
3589 error == ENOSPC, strerror(error),
3590 &error_abort);
3593 /* This is done by device models because, while the block layer knows
3594 * about the error, it does not know whether an operation comes from
3595 * the device or the block layer (from a job, for example).
3597 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3598 bool is_read, int error)
3600 assert(error >= 0);
3602 if (action == BLOCK_ERROR_ACTION_STOP) {
3603 /* First set the iostatus, so that "info block" returns an iostatus
3604 * that matches the events raised so far (an additional error iostatus
3605 * is fine, but not a lost one).
3607 bdrv_iostatus_set_err(bs, error);
3609 /* Then raise the request to stop the VM and the event.
3610 * qemu_system_vmstop_request_prepare has two effects. First,
3611 * it ensures that the STOP event always comes after the
3612 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3613 * can observe the STOP event and do a "cont" before the STOP
3614 * event is issued, the VM will not stop. In this case, vm_start()
3615 * also ensures that the STOP/RESUME pair of events is emitted.
3617 qemu_system_vmstop_request_prepare();
3618 send_qmp_error_event(bs, action, is_read, error);
3619 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3620 } else {
3621 send_qmp_error_event(bs, action, is_read, error);
3625 int bdrv_is_read_only(BlockDriverState *bs)
3627 return bs->read_only;
3630 int bdrv_is_sg(BlockDriverState *bs)
3632 return bs->sg;
3635 int bdrv_enable_write_cache(BlockDriverState *bs)
3637 return bs->enable_write_cache;
3640 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3642 bs->enable_write_cache = wce;
3644 /* so a reopen() will preserve wce */
3645 if (wce) {
3646 bs->open_flags |= BDRV_O_CACHE_WB;
3647 } else {
3648 bs->open_flags &= ~BDRV_O_CACHE_WB;
3652 int bdrv_is_encrypted(BlockDriverState *bs)
3654 if (bs->backing_hd && bs->backing_hd->encrypted)
3655 return 1;
3656 return bs->encrypted;
3659 int bdrv_key_required(BlockDriverState *bs)
3661 BlockDriverState *backing_hd = bs->backing_hd;
3663 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3664 return 1;
3665 return (bs->encrypted && !bs->valid_key);
3668 int bdrv_set_key(BlockDriverState *bs, const char *key)
3670 int ret;
3671 if (bs->backing_hd && bs->backing_hd->encrypted) {
3672 ret = bdrv_set_key(bs->backing_hd, key);
3673 if (ret < 0)
3674 return ret;
3675 if (!bs->encrypted)
3676 return 0;
3678 if (!bs->encrypted) {
3679 return -EINVAL;
3680 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3681 return -ENOMEDIUM;
3683 ret = bs->drv->bdrv_set_key(bs, key);
3684 if (ret < 0) {
3685 bs->valid_key = 0;
3686 } else if (!bs->valid_key) {
3687 bs->valid_key = 1;
3688 if (bs->blk) {
3689 /* call the change callback now, we skipped it on open */
3690 blk_dev_change_media_cb(bs->blk, true);
3693 return ret;
3696 const char *bdrv_get_format_name(BlockDriverState *bs)
3698 return bs->drv ? bs->drv->format_name : NULL;
/*
 * qsort() comparator for an array of C strings (const char * elements).
 *
 * qsort() passes pointers to the array elements, so a and b each point to
 * a 'const char *' and must be dereferenced once before being handed to
 * strcmp().
 */
static int qsort_strcmp(const void *a, const void *b)
{
    const char *const *lhs = a;
    const char *const *rhs = b;

    return strcmp(*lhs, *rhs);
}
3706 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3707 void *opaque)
3709 BlockDriver *drv;
3710 int count = 0;
3711 int i;
3712 const char **formats = NULL;
3714 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3715 if (drv->format_name) {
3716 bool found = false;
3717 int i = count;
3718 while (formats && i && !found) {
3719 found = !strcmp(formats[--i], drv->format_name);
3722 if (!found) {
3723 formats = g_renew(const char *, formats, count + 1);
3724 formats[count++] = drv->format_name;
3729 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3731 for (i = 0; i < count; i++) {
3732 it(opaque, formats[i]);
3735 g_free(formats);
3738 /* This function is to find block backend bs */
3739 /* TODO convert callers to blk_by_name(), then remove */
3740 BlockDriverState *bdrv_find(const char *name)
3742 BlockBackend *blk = blk_by_name(name);
3744 return blk ? blk_bs(blk) : NULL;
3747 /* This function is to find a node in the bs graph */
3748 BlockDriverState *bdrv_find_node(const char *node_name)
3750 BlockDriverState *bs;
3752 assert(node_name);
3754 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3755 if (!strcmp(node_name, bs->node_name)) {
3756 return bs;
3759 return NULL;
3762 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3763 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3765 BlockDeviceInfoList *list, *entry;
3766 BlockDriverState *bs;
3768 list = NULL;
3769 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3770 entry = g_malloc0(sizeof(*entry));
3771 entry->value = bdrv_block_device_info(bs);
3772 entry->next = list;
3773 list = entry;
3776 return list;
3779 BlockDriverState *bdrv_lookup_bs(const char *device,
3780 const char *node_name,
3781 Error **errp)
3783 BlockBackend *blk;
3784 BlockDriverState *bs;
3786 if (device) {
3787 blk = blk_by_name(device);
3789 if (blk) {
3790 return blk_bs(blk);
3794 if (node_name) {
3795 bs = bdrv_find_node(node_name);
3797 if (bs) {
3798 return bs;
3802 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3803 device ? device : "",
3804 node_name ? node_name : "");
3805 return NULL;
3808 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3809 * return false. If either argument is NULL, return false. */
3810 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3812 while (top && top != base) {
3813 top = top->backing_hd;
3816 return top != NULL;
3819 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3821 if (!bs) {
3822 return QTAILQ_FIRST(&graph_bdrv_states);
3824 return QTAILQ_NEXT(bs, node_list);
3827 BlockDriverState *bdrv_next(BlockDriverState *bs)
3829 if (!bs) {
3830 return QTAILQ_FIRST(&bdrv_states);
3832 return QTAILQ_NEXT(bs, device_list);
3835 const char *bdrv_get_node_name(const BlockDriverState *bs)
3837 return bs->node_name;
3840 /* TODO check what callers really want: bs->node_name or blk_name() */
3841 const char *bdrv_get_device_name(const BlockDriverState *bs)
3843 return bs->blk ? blk_name(bs->blk) : "";
3846 int bdrv_get_flags(BlockDriverState *bs)
3848 return bs->open_flags;
3851 int bdrv_flush_all(void)
3853 BlockDriverState *bs;
3854 int result = 0;
3856 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3857 AioContext *aio_context = bdrv_get_aio_context(bs);
3858 int ret;
3860 aio_context_acquire(aio_context);
3861 ret = bdrv_flush(bs);
3862 if (ret < 0 && !result) {
3863 result = ret;
3865 aio_context_release(aio_context);
3868 return result;
3871 int bdrv_has_zero_init_1(BlockDriverState *bs)
3873 return 1;
3876 int bdrv_has_zero_init(BlockDriverState *bs)
3878 assert(bs->drv);
3880 /* If BS is a copy on write image, it is initialized to
3881 the contents of the base image, which may not be zeroes. */
3882 if (bs->backing_hd) {
3883 return 0;
3885 if (bs->drv->bdrv_has_zero_init) {
3886 return bs->drv->bdrv_has_zero_init(bs);
3889 /* safe default */
3890 return 0;
3893 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3895 BlockDriverInfo bdi;
3897 if (bs->backing_hd) {
3898 return false;
3901 if (bdrv_get_info(bs, &bdi) == 0) {
3902 return bdi.unallocated_blocks_are_zero;
3905 return false;
3908 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3910 BlockDriverInfo bdi;
3912 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3913 return false;
3916 if (bdrv_get_info(bs, &bdi) == 0) {
3917 return bdi.can_write_zeroes_with_unmap;
3920 return false;
3923 typedef struct BdrvCoGetBlockStatusData {
3924 BlockDriverState *bs;
3925 BlockDriverState *base;
3926 int64_t sector_num;
3927 int nb_sectors;
3928 int *pnum;
3929 int64_t ret;
3930 bool done;
3931 } BdrvCoGetBlockStatusData;
3934 * Returns the allocation status of the specified sectors.
3935 * Drivers not implementing the functionality are assumed to not support
3936 * backing files, hence all their sectors are reported as allocated.
3938 * If 'sector_num' is beyond the end of the disk image the return value is 0
3939 * and 'pnum' is set to 0.
3941 * 'pnum' is set to the number of sectors (including and immediately following
3942 * the specified sector) that are known to be in the same
3943 * allocated/unallocated state.
3945 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3946 * beyond the end of the disk image it will be clamped.
3948 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3949 int64_t sector_num,
3950 int nb_sectors, int *pnum)
3952 int64_t total_sectors;
3953 int64_t n;
3954 int64_t ret, ret2;
3956 total_sectors = bdrv_nb_sectors(bs);
3957 if (total_sectors < 0) {
3958 return total_sectors;
3961 if (sector_num >= total_sectors) {
3962 *pnum = 0;
3963 return 0;
3966 n = total_sectors - sector_num;
3967 if (n < nb_sectors) {
3968 nb_sectors = n;
3971 if (!bs->drv->bdrv_co_get_block_status) {
3972 *pnum = nb_sectors;
3973 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3974 if (bs->drv->protocol_name) {
3975 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3977 return ret;
3980 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3981 if (ret < 0) {
3982 *pnum = 0;
3983 return ret;
3986 if (ret & BDRV_BLOCK_RAW) {
3987 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3988 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3989 *pnum, pnum);
3992 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
3993 ret |= BDRV_BLOCK_ALLOCATED;
3996 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3997 if (bdrv_unallocated_blocks_are_zero(bs)) {
3998 ret |= BDRV_BLOCK_ZERO;
3999 } else if (bs->backing_hd) {
4000 BlockDriverState *bs2 = bs->backing_hd;
4001 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4002 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4003 ret |= BDRV_BLOCK_ZERO;
4008 if (bs->file &&
4009 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4010 (ret & BDRV_BLOCK_OFFSET_VALID)) {
4011 int file_pnum;
4013 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4014 *pnum, &file_pnum);
4015 if (ret2 >= 0) {
4016 /* Ignore errors. This is just providing extra information, it
4017 * is useful but not necessary.
4019 if (!file_pnum) {
4020 /* !file_pnum indicates an offset at or beyond the EOF; it is
4021 * perfectly valid for the format block driver to point to such
4022 * offsets, so catch it and mark everything as zero */
4023 ret |= BDRV_BLOCK_ZERO;
4024 } else {
4025 /* Limit request to the range reported by the protocol driver */
4026 *pnum = file_pnum;
4027 ret |= (ret2 & BDRV_BLOCK_ZERO);
4032 return ret;
4035 /* Coroutine wrapper for bdrv_get_block_status() */
4036 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4038 BdrvCoGetBlockStatusData *data = opaque;
4039 BlockDriverState *bs = data->bs;
4041 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4042 data->pnum);
4043 data->done = true;
4047 * Synchronous wrapper around bdrv_co_get_block_status().
4049 * See bdrv_co_get_block_status() for details.
4051 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4052 int nb_sectors, int *pnum)
4054 Coroutine *co;
4055 BdrvCoGetBlockStatusData data = {
4056 .bs = bs,
4057 .sector_num = sector_num,
4058 .nb_sectors = nb_sectors,
4059 .pnum = pnum,
4060 .done = false,
4063 if (qemu_in_coroutine()) {
4064 /* Fast-path if already in coroutine context */
4065 bdrv_get_block_status_co_entry(&data);
4066 } else {
4067 AioContext *aio_context = bdrv_get_aio_context(bs);
4069 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4070 qemu_coroutine_enter(co, &data);
4071 while (!data.done) {
4072 aio_poll(aio_context, true);
4075 return data.ret;
4078 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4079 int nb_sectors, int *pnum)
4081 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4082 if (ret < 0) {
4083 return ret;
4085 return !!(ret & BDRV_BLOCK_ALLOCATED);
4089 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4091 * Return true if the given sector is allocated in any image between
4092 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4093 * sector is allocated in any image of the chain. Return false otherwise.
4095 * 'pnum' is set to the number of sectors (including and immediately following
4096 * the specified sector) that are known to be in the same
4097 * allocated/unallocated state.
4100 int bdrv_is_allocated_above(BlockDriverState *top,
4101 BlockDriverState *base,
4102 int64_t sector_num,
4103 int nb_sectors, int *pnum)
4105 BlockDriverState *intermediate;
4106 int ret, n = nb_sectors;
4108 intermediate = top;
4109 while (intermediate && intermediate != base) {
4110 int pnum_inter;
4111 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4112 &pnum_inter);
4113 if (ret < 0) {
4114 return ret;
4115 } else if (ret) {
4116 *pnum = pnum_inter;
4117 return 1;
4121 * [sector_num, nb_sectors] is unallocated on top but intermediate
4122 * might have
4124 * [sector_num+x, nr_sectors] allocated.
4126 if (n > pnum_inter &&
4127 (intermediate == top ||
4128 sector_num + pnum_inter < intermediate->total_sectors)) {
4129 n = pnum_inter;
4132 intermediate = intermediate->backing_hd;
4135 *pnum = n;
4136 return 0;
4139 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4141 if (bs->backing_hd && bs->backing_hd->encrypted)
4142 return bs->backing_file;
4143 else if (bs->encrypted)
4144 return bs->filename;
4145 else
4146 return NULL;
4149 void bdrv_get_backing_filename(BlockDriverState *bs,
4150 char *filename, int filename_size)
4152 pstrcpy(filename, filename_size, bs->backing_file);
4155 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4156 const uint8_t *buf, int nb_sectors)
4158 BlockDriver *drv = bs->drv;
4159 if (!drv)
4160 return -ENOMEDIUM;
4161 if (!drv->bdrv_write_compressed)
4162 return -ENOTSUP;
4163 if (bdrv_check_request(bs, sector_num, nb_sectors))
4164 return -EIO;
4166 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4168 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4171 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4173 BlockDriver *drv = bs->drv;
4174 if (!drv)
4175 return -ENOMEDIUM;
4176 if (!drv->bdrv_get_info)
4177 return -ENOTSUP;
4178 memset(bdi, 0, sizeof(*bdi));
4179 return drv->bdrv_get_info(bs, bdi);
4182 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4184 BlockDriver *drv = bs->drv;
4185 if (drv && drv->bdrv_get_specific_info) {
4186 return drv->bdrv_get_specific_info(bs);
4188 return NULL;
4191 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4192 int64_t pos, int size)
4194 QEMUIOVector qiov;
4195 struct iovec iov = {
4196 .iov_base = (void *) buf,
4197 .iov_len = size,
4200 qemu_iovec_init_external(&qiov, &iov, 1);
4201 return bdrv_writev_vmstate(bs, &qiov, pos);
4204 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4206 BlockDriver *drv = bs->drv;
4208 if (!drv) {
4209 return -ENOMEDIUM;
4210 } else if (drv->bdrv_save_vmstate) {
4211 return drv->bdrv_save_vmstate(bs, qiov, pos);
4212 } else if (bs->file) {
4213 return bdrv_writev_vmstate(bs->file, qiov, pos);
4216 return -ENOTSUP;
4219 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4220 int64_t pos, int size)
4222 BlockDriver *drv = bs->drv;
4223 if (!drv)
4224 return -ENOMEDIUM;
4225 if (drv->bdrv_load_vmstate)
4226 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4227 if (bs->file)
4228 return bdrv_load_vmstate(bs->file, buf, pos, size);
4229 return -ENOTSUP;
4232 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4234 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4235 return;
4238 bs->drv->bdrv_debug_event(bs, event);
4241 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4242 const char *tag)
4244 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4245 bs = bs->file;
4248 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4249 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4252 return -ENOTSUP;
4255 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4257 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4258 bs = bs->file;
4261 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4262 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4265 return -ENOTSUP;
4268 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4270 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4271 bs = bs->file;
4274 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4275 return bs->drv->bdrv_debug_resume(bs, tag);
4278 return -ENOTSUP;
4281 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4283 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4284 bs = bs->file;
4287 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4288 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4291 return false;
4294 int bdrv_is_snapshot(BlockDriverState *bs)
4296 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4299 /* backing_file can either be relative, or absolute, or a protocol. If it is
4300 * relative, it must be relative to the chain. So, passing in bs->filename
4301 * from a BDS as backing_file should not be done, as that may be relative to
4302 * the CWD rather than the chain. */
4303 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4304 const char *backing_file)
4306 char *filename_full = NULL;
4307 char *backing_file_full = NULL;
4308 char *filename_tmp = NULL;
4309 int is_protocol = 0;
4310 BlockDriverState *curr_bs = NULL;
4311 BlockDriverState *retval = NULL;
4313 if (!bs || !bs->drv || !backing_file) {
4314 return NULL;
4317 filename_full = g_malloc(PATH_MAX);
4318 backing_file_full = g_malloc(PATH_MAX);
4319 filename_tmp = g_malloc(PATH_MAX);
4321 is_protocol = path_has_protocol(backing_file);
4323 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4325 /* If either of the filename paths is actually a protocol, then
4326 * compare unmodified paths; otherwise make paths relative */
4327 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4328 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4329 retval = curr_bs->backing_hd;
4330 break;
4332 } else {
4333 /* If not an absolute filename path, make it relative to the current
4334 * image's filename path */
4335 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4336 backing_file);
4338 /* We are going to compare absolute pathnames */
4339 if (!realpath(filename_tmp, filename_full)) {
4340 continue;
4343 /* We need to make sure the backing filename we are comparing against
4344 * is relative to the current image filename (or absolute) */
4345 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4346 curr_bs->backing_file);
4348 if (!realpath(filename_tmp, backing_file_full)) {
4349 continue;
4352 if (strcmp(backing_file_full, filename_full) == 0) {
4353 retval = curr_bs->backing_hd;
4354 break;
4359 g_free(filename_full);
4360 g_free(backing_file_full);
4361 g_free(filename_tmp);
4362 return retval;
4365 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4367 if (!bs->drv) {
4368 return 0;
4371 if (!bs->backing_hd) {
4372 return 0;
4375 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4378 /**************************************************************/
4379 /* async I/Os */
4381 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4382 QEMUIOVector *qiov, int nb_sectors,
4383 BlockCompletionFunc *cb, void *opaque)
4385 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4387 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4388 cb, opaque, false);
4391 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4392 QEMUIOVector *qiov, int nb_sectors,
4393 BlockCompletionFunc *cb, void *opaque)
4395 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4397 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4398 cb, opaque, true);
4401 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4402 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4403 BlockCompletionFunc *cb, void *opaque)
4405 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4407 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4408 BDRV_REQ_ZERO_WRITE | flags,
4409 cb, opaque, true);
4413 typedef struct MultiwriteCB {
4414 int error;
4415 int num_requests;
4416 int num_callbacks;
4417 struct {
4418 BlockCompletionFunc *cb;
4419 void *opaque;
4420 QEMUIOVector *free_qiov;
4421 } callbacks[];
4422 } MultiwriteCB;
4424 static void multiwrite_user_cb(MultiwriteCB *mcb)
4426 int i;
4428 for (i = 0; i < mcb->num_callbacks; i++) {
4429 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4430 if (mcb->callbacks[i].free_qiov) {
4431 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4433 g_free(mcb->callbacks[i].free_qiov);
4437 static void multiwrite_cb(void *opaque, int ret)
4439 MultiwriteCB *mcb = opaque;
4441 trace_multiwrite_cb(mcb, ret);
4443 if (ret < 0 && !mcb->error) {
4444 mcb->error = ret;
4447 mcb->num_requests--;
4448 if (mcb->num_requests == 0) {
4449 multiwrite_user_cb(mcb);
4450 g_free(mcb);
4454 static int multiwrite_req_compare(const void *a, const void *b)
4456 const BlockRequest *req1 = a, *req2 = b;
4459 * Note that we can't simply subtract req2->sector from req1->sector
4460 * here as that could overflow the return value.
4462 if (req1->sector > req2->sector) {
4463 return 1;
4464 } else if (req1->sector < req2->sector) {
4465 return -1;
4466 } else {
4467 return 0;
4472 * Takes a bunch of requests and tries to merge them. Returns the number of
4473 * requests that remain after merging.
4475 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4476 int num_reqs, MultiwriteCB *mcb)
4478 int i, outidx;
4480 // Sort requests by start sector
4481 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4483 // Check if adjacent requests touch the same clusters. If so, combine them,
4484 // filling up gaps with zero sectors.
4485 outidx = 0;
4486 for (i = 1; i < num_reqs; i++) {
4487 int merge = 0;
4488 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4490 // Handle exactly sequential writes and overlapping writes.
4491 if (reqs[i].sector <= oldreq_last) {
4492 merge = 1;
4495 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4496 merge = 0;
4499 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4500 reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4501 merge = 0;
4504 if (merge) {
4505 size_t size;
4506 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4507 qemu_iovec_init(qiov,
4508 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4510 // Add the first request to the merged one. If the requests are
4511 // overlapping, drop the last sectors of the first request.
4512 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4513 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4515 // We should need to add any zeros between the two requests
4516 assert (reqs[i].sector <= oldreq_last);
4518 // Add the second request
4519 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4521 // Add tail of first request, if necessary
4522 if (qiov->size < reqs[outidx].qiov->size) {
4523 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4524 reqs[outidx].qiov->size - qiov->size);
4527 reqs[outidx].nb_sectors = qiov->size >> 9;
4528 reqs[outidx].qiov = qiov;
4530 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4531 } else {
4532 outidx++;
4533 reqs[outidx].sector = reqs[i].sector;
4534 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4535 reqs[outidx].qiov = reqs[i].qiov;
4539 return outidx + 1;
4543 * Submit multiple AIO write requests at once.
4545 * On success, the function returns 0 and all requests in the reqs array have
4546 * been submitted. In error case this function returns -1, and any of the
4547 * requests may or may not be submitted yet. In particular, this means that the
4548 * callback will be called for some of the requests, for others it won't. The
4549 * caller must check the error field of the BlockRequest to wait for the right
4550 * callbacks (if error != 0, no callback will be called).
4552 * The implementation may modify the contents of the reqs array, e.g. to merge
4553 * requests. However, the fields opaque and error are left unmodified as they
4554 * are used to signal failure for a single request to the caller.
4556 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4558 MultiwriteCB *mcb;
4559 int i;
4561 /* don't submit writes if we don't have a medium */
4562 if (bs->drv == NULL) {
4563 for (i = 0; i < num_reqs; i++) {
4564 reqs[i].error = -ENOMEDIUM;
4566 return -1;
4569 if (num_reqs == 0) {
4570 return 0;
4573 // Create MultiwriteCB structure
4574 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4575 mcb->num_requests = 0;
4576 mcb->num_callbacks = num_reqs;
4578 for (i = 0; i < num_reqs; i++) {
4579 mcb->callbacks[i].cb = reqs[i].cb;
4580 mcb->callbacks[i].opaque = reqs[i].opaque;
4583 // Check for mergable requests
4584 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4586 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4588 /* Run the aio requests. */
4589 mcb->num_requests = num_reqs;
4590 for (i = 0; i < num_reqs; i++) {
4591 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4592 reqs[i].nb_sectors, reqs[i].flags,
4593 multiwrite_cb, mcb,
4594 true);
4597 return 0;
4600 void bdrv_aio_cancel(BlockAIOCB *acb)
4602 qemu_aio_ref(acb);
4603 bdrv_aio_cancel_async(acb);
4604 while (acb->refcnt > 1) {
4605 if (acb->aiocb_info->get_aio_context) {
4606 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4607 } else if (acb->bs) {
4608 aio_poll(bdrv_get_aio_context(acb->bs), true);
4609 } else {
4610 abort();
4613 qemu_aio_unref(acb);
4616 /* Async version of aio cancel. The caller is not blocked if the acb implements
4617 * cancel_async, otherwise we do nothing and let the request normally complete.
4618 * In either case the completion callback must be called. */
4619 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4621 if (acb->aiocb_info->cancel_async) {
4622 acb->aiocb_info->cancel_async(acb);
4626 /**************************************************************/
4627 /* async block device emulation */
4629 typedef struct BlockAIOCBSync {
4630 BlockAIOCB common;
4631 QEMUBH *bh;
4632 int ret;
4633 /* vector translation state */
4634 QEMUIOVector *qiov;
4635 uint8_t *bounce;
4636 int is_write;
4637 } BlockAIOCBSync;
4639 static const AIOCBInfo bdrv_em_aiocb_info = {
4640 .aiocb_size = sizeof(BlockAIOCBSync),
4643 static void bdrv_aio_bh_cb(void *opaque)
4645 BlockAIOCBSync *acb = opaque;
4647 if (!acb->is_write && acb->ret >= 0) {
4648 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4650 qemu_vfree(acb->bounce);
4651 acb->common.cb(acb->common.opaque, acb->ret);
4652 qemu_bh_delete(acb->bh);
4653 acb->bh = NULL;
4654 qemu_aio_unref(acb);
4657 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4658 int64_t sector_num,
4659 QEMUIOVector *qiov,
4660 int nb_sectors,
4661 BlockCompletionFunc *cb,
4662 void *opaque,
4663 int is_write)
4666 BlockAIOCBSync *acb;
4668 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4669 acb->is_write = is_write;
4670 acb->qiov = qiov;
4671 acb->bounce = qemu_try_blockalign(bs, qiov->size);
4672 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4674 if (acb->bounce == NULL) {
4675 acb->ret = -ENOMEM;
4676 } else if (is_write) {
4677 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4678 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4679 } else {
4680 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4683 qemu_bh_schedule(acb->bh);
4685 return &acb->common;
4688 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4689 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4690 BlockCompletionFunc *cb, void *opaque)
4692 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4695 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4696 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4697 BlockCompletionFunc *cb, void *opaque)
4699 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4703 typedef struct BlockAIOCBCoroutine {
4704 BlockAIOCB common;
4705 BlockRequest req;
4706 bool is_write;
4707 bool *done;
4708 QEMUBH* bh;
4709 } BlockAIOCBCoroutine;
4711 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4712 .aiocb_size = sizeof(BlockAIOCBCoroutine),
4715 static void bdrv_co_em_bh(void *opaque)
4717 BlockAIOCBCoroutine *acb = opaque;
4719 acb->common.cb(acb->common.opaque, acb->req.error);
4721 qemu_bh_delete(acb->bh);
4722 qemu_aio_unref(acb);
4725 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4726 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4728 BlockAIOCBCoroutine *acb = opaque;
4729 BlockDriverState *bs = acb->common.bs;
4731 if (!acb->is_write) {
4732 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4733 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4734 } else {
4735 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4736 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4739 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4740 qemu_bh_schedule(acb->bh);
4743 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4744 int64_t sector_num,
4745 QEMUIOVector *qiov,
4746 int nb_sectors,
4747 BdrvRequestFlags flags,
4748 BlockCompletionFunc *cb,
4749 void *opaque,
4750 bool is_write)
4752 Coroutine *co;
4753 BlockAIOCBCoroutine *acb;
4755 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4756 acb->req.sector = sector_num;
4757 acb->req.nb_sectors = nb_sectors;
4758 acb->req.qiov = qiov;
4759 acb->req.flags = flags;
4760 acb->is_write = is_write;
4762 co = qemu_coroutine_create(bdrv_co_do_rw);
4763 qemu_coroutine_enter(co, acb);
4765 return &acb->common;
4768 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4770 BlockAIOCBCoroutine *acb = opaque;
4771 BlockDriverState *bs = acb->common.bs;
4773 acb->req.error = bdrv_co_flush(bs);
4774 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4775 qemu_bh_schedule(acb->bh);
4778 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4779 BlockCompletionFunc *cb, void *opaque)
4781 trace_bdrv_aio_flush(bs, opaque);
4783 Coroutine *co;
4784 BlockAIOCBCoroutine *acb;
4786 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4788 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4789 qemu_coroutine_enter(co, acb);
4791 return &acb->common;
4794 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4796 BlockAIOCBCoroutine *acb = opaque;
4797 BlockDriverState *bs = acb->common.bs;
4799 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4800 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4801 qemu_bh_schedule(acb->bh);
4804 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4805 int64_t sector_num, int nb_sectors,
4806 BlockCompletionFunc *cb, void *opaque)
4808 Coroutine *co;
4809 BlockAIOCBCoroutine *acb;
4811 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4813 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4814 acb->req.sector = sector_num;
4815 acb->req.nb_sectors = nb_sectors;
4816 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4817 qemu_coroutine_enter(co, acb);
4819 return &acb->common;
4822 void bdrv_init(void)
4824 module_call_init(MODULE_INIT_BLOCK);
4827 void bdrv_init_with_whitelist(void)
4829 use_bdrv_whitelist = 1;
4830 bdrv_init();
4833 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4834 BlockCompletionFunc *cb, void *opaque)
4836 BlockAIOCB *acb;
4838 acb = g_slice_alloc(aiocb_info->aiocb_size);
4839 acb->aiocb_info = aiocb_info;
4840 acb->bs = bs;
4841 acb->cb = cb;
4842 acb->opaque = opaque;
4843 acb->refcnt = 1;
4844 return acb;
4847 void qemu_aio_ref(void *p)
4849 BlockAIOCB *acb = p;
4850 acb->refcnt++;
4853 void qemu_aio_unref(void *p)
4855 BlockAIOCB *acb = p;
4856 assert(acb->refcnt > 0);
4857 if (--acb->refcnt == 0) {
4858 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4862 /**************************************************************/
4863 /* Coroutine block device emulation */
4865 typedef struct CoroutineIOCompletion {
4866 Coroutine *coroutine;
4867 int ret;
4868 } CoroutineIOCompletion;
4870 static void bdrv_co_io_em_complete(void *opaque, int ret)
4872 CoroutineIOCompletion *co = opaque;
4874 co->ret = ret;
4875 qemu_coroutine_enter(co->coroutine, NULL);
4878 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4879 int nb_sectors, QEMUIOVector *iov,
4880 bool is_write)
4882 CoroutineIOCompletion co = {
4883 .coroutine = qemu_coroutine_self(),
4885 BlockAIOCB *acb;
4887 if (is_write) {
4888 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4889 bdrv_co_io_em_complete, &co);
4890 } else {
4891 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4892 bdrv_co_io_em_complete, &co);
4895 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4896 if (!acb) {
4897 return -EIO;
4899 qemu_coroutine_yield();
4901 return co.ret;
4904 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4905 int64_t sector_num, int nb_sectors,
4906 QEMUIOVector *iov)
4908 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4911 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4912 int64_t sector_num, int nb_sectors,
4913 QEMUIOVector *iov)
4915 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4918 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4920 RwCo *rwco = opaque;
4922 rwco->ret = bdrv_co_flush(rwco->bs);
4925 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4927 int ret;
4929 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4930 return 0;
4933 /* Write back cached data to the OS even with cache=unsafe */
4934 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4935 if (bs->drv->bdrv_co_flush_to_os) {
4936 ret = bs->drv->bdrv_co_flush_to_os(bs);
4937 if (ret < 0) {
4938 return ret;
4942 /* But don't actually force it to the disk with cache=unsafe */
4943 if (bs->open_flags & BDRV_O_NO_FLUSH) {
4944 goto flush_parent;
4947 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4948 if (bs->drv->bdrv_co_flush_to_disk) {
4949 ret = bs->drv->bdrv_co_flush_to_disk(bs);
4950 } else if (bs->drv->bdrv_aio_flush) {
4951 BlockAIOCB *acb;
4952 CoroutineIOCompletion co = {
4953 .coroutine = qemu_coroutine_self(),
4956 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4957 if (acb == NULL) {
4958 ret = -EIO;
4959 } else {
4960 qemu_coroutine_yield();
4961 ret = co.ret;
4963 } else {
4965 * Some block drivers always operate in either writethrough or unsafe
4966 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
4967 * know how the server works (because the behaviour is hardcoded or
4968 * depends on server-side configuration), so we can't ensure that
4969 * everything is safe on disk. Returning an error doesn't work because
4970 * that would break guests even if the server operates in writethrough
4971 * mode.
4973 * Let's hope the user knows what he's doing.
4975 ret = 0;
4977 if (ret < 0) {
4978 return ret;
4981 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4982 * in the case of cache=unsafe, so there are no useless flushes.
4984 flush_parent:
4985 return bdrv_co_flush(bs->file);
4988 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4990 Error *local_err = NULL;
4991 int ret;
4993 if (!bs->drv) {
4994 return;
4997 if (!(bs->open_flags & BDRV_O_INCOMING)) {
4998 return;
5000 bs->open_flags &= ~BDRV_O_INCOMING;
5002 if (bs->drv->bdrv_invalidate_cache) {
5003 bs->drv->bdrv_invalidate_cache(bs, &local_err);
5004 } else if (bs->file) {
5005 bdrv_invalidate_cache(bs->file, &local_err);
5007 if (local_err) {
5008 error_propagate(errp, local_err);
5009 return;
5012 ret = refresh_total_sectors(bs, bs->total_sectors);
5013 if (ret < 0) {
5014 error_setg_errno(errp, -ret, "Could not refresh total sector count");
5015 return;
5019 void bdrv_invalidate_cache_all(Error **errp)
5021 BlockDriverState *bs;
5022 Error *local_err = NULL;
5024 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5025 AioContext *aio_context = bdrv_get_aio_context(bs);
5027 aio_context_acquire(aio_context);
5028 bdrv_invalidate_cache(bs, &local_err);
5029 aio_context_release(aio_context);
5030 if (local_err) {
5031 error_propagate(errp, local_err);
5032 return;
5037 int bdrv_flush(BlockDriverState *bs)
5039 Coroutine *co;
5040 RwCo rwco = {
5041 .bs = bs,
5042 .ret = NOT_DONE,
5045 if (qemu_in_coroutine()) {
5046 /* Fast-path if already in coroutine context */
5047 bdrv_flush_co_entry(&rwco);
5048 } else {
5049 AioContext *aio_context = bdrv_get_aio_context(bs);
5051 co = qemu_coroutine_create(bdrv_flush_co_entry);
5052 qemu_coroutine_enter(co, &rwco);
5053 while (rwco.ret == NOT_DONE) {
5054 aio_poll(aio_context, true);
5058 return rwco.ret;
5061 typedef struct DiscardCo {
5062 BlockDriverState *bs;
5063 int64_t sector_num;
5064 int nb_sectors;
5065 int ret;
5066 } DiscardCo;
5067 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5069 DiscardCo *rwco = opaque;
5071 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
/*
 * Per-request discard limit used when the BlockLimits do not specify one:
 * 32768 512-byte sectors (16 MiB).
 */
#define MAX_DISCARD_DEFAULT 32768
5079 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5080 int nb_sectors)
5082 int max_discard;
5084 if (!bs->drv) {
5085 return -ENOMEDIUM;
5086 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5087 return -EIO;
5088 } else if (bs->read_only) {
5089 return -EROFS;
5092 bdrv_reset_dirty(bs, sector_num, nb_sectors);
5094 /* Do nothing if disabled. */
5095 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5096 return 0;
5099 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5100 return 0;
5103 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5104 while (nb_sectors > 0) {
5105 int ret;
5106 int num = nb_sectors;
5108 /* align request */
5109 if (bs->bl.discard_alignment &&
5110 num >= bs->bl.discard_alignment &&
5111 sector_num % bs->bl.discard_alignment) {
5112 if (num > bs->bl.discard_alignment) {
5113 num = bs->bl.discard_alignment;
5115 num -= sector_num % bs->bl.discard_alignment;
5118 /* limit request size */
5119 if (num > max_discard) {
5120 num = max_discard;
5123 if (bs->drv->bdrv_co_discard) {
5124 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5125 } else {
5126 BlockAIOCB *acb;
5127 CoroutineIOCompletion co = {
5128 .coroutine = qemu_coroutine_self(),
5131 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5132 bdrv_co_io_em_complete, &co);
5133 if (acb == NULL) {
5134 return -EIO;
5135 } else {
5136 qemu_coroutine_yield();
5137 ret = co.ret;
5140 if (ret && ret != -ENOTSUP) {
5141 return ret;
5144 sector_num += num;
5145 nb_sectors -= num;
5147 return 0;
5150 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5152 Coroutine *co;
5153 DiscardCo rwco = {
5154 .bs = bs,
5155 .sector_num = sector_num,
5156 .nb_sectors = nb_sectors,
5157 .ret = NOT_DONE,
5160 if (qemu_in_coroutine()) {
5161 /* Fast-path if already in coroutine context */
5162 bdrv_discard_co_entry(&rwco);
5163 } else {
5164 AioContext *aio_context = bdrv_get_aio_context(bs);
5166 co = qemu_coroutine_create(bdrv_discard_co_entry);
5167 qemu_coroutine_enter(co, &rwco);
5168 while (rwco.ret == NOT_DONE) {
5169 aio_poll(aio_context, true);
5173 return rwco.ret;
5176 /**************************************************************/
5177 /* removable device support */
5180 * Return TRUE if the media is present
5182 int bdrv_is_inserted(BlockDriverState *bs)
5184 BlockDriver *drv = bs->drv;
5186 if (!drv)
5187 return 0;
5188 if (!drv->bdrv_is_inserted)
5189 return 1;
5190 return drv->bdrv_is_inserted(bs);
5194 * Return whether the media changed since the last call to this
5195 * function, or -ENOTSUP if we don't know. Most drivers don't know.
5197 int bdrv_media_changed(BlockDriverState *bs)
5199 BlockDriver *drv = bs->drv;
5201 if (drv && drv->bdrv_media_changed) {
5202 return drv->bdrv_media_changed(bs);
5204 return -ENOTSUP;
5208 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5210 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5212 BlockDriver *drv = bs->drv;
5213 const char *device_name;
5215 if (drv && drv->bdrv_eject) {
5216 drv->bdrv_eject(bs, eject_flag);
5219 device_name = bdrv_get_device_name(bs);
5220 if (device_name[0] != '\0') {
5221 qapi_event_send_device_tray_moved(device_name,
5222 eject_flag, &error_abort);
5227 * Lock or unlock the media (if it is locked, the user won't be able
5228 * to eject it manually).
5230 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5232 BlockDriver *drv = bs->drv;
5234 trace_bdrv_lock_medium(bs, locked);
5236 if (drv && drv->bdrv_lock_medium) {
5237 drv->bdrv_lock_medium(bs, locked);
5241 /* needed for generic scsi interface */
5243 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5245 BlockDriver *drv = bs->drv;
5247 if (drv && drv->bdrv_ioctl)
5248 return drv->bdrv_ioctl(bs, req, buf);
5249 return -ENOTSUP;
5252 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5253 unsigned long int req, void *buf,
5254 BlockCompletionFunc *cb, void *opaque)
5256 BlockDriver *drv = bs->drv;
5258 if (drv && drv->bdrv_aio_ioctl)
5259 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5260 return NULL;
5263 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5265 bs->guest_block_size = align;
5268 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5270 return qemu_memalign(bdrv_opt_mem_align(bs), size);
5273 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5275 return memset(qemu_blockalign(bs, size), 0, size);
5278 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5280 size_t align = bdrv_opt_mem_align(bs);
5282 /* Ensure that NULL is never returned on success */
5283 assert(align > 0);
5284 if (size == 0) {
5285 size = align;
5288 return qemu_try_memalign(align, size);
5291 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5293 void *mem = qemu_try_blockalign(bs, size);
5295 if (mem) {
5296 memset(mem, 0, size);
5299 return mem;
5303 * Check if all memory in this vector is sector aligned.
5305 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5307 int i;
5308 size_t alignment = bdrv_opt_mem_align(bs);
5310 for (i = 0; i < qiov->niov; i++) {
5311 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5312 return false;
5314 if (qiov->iov[i].iov_len % alignment) {
5315 return false;
5319 return true;
5322 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5323 Error **errp)
5325 int64_t bitmap_size;
5326 BdrvDirtyBitmap *bitmap;
5328 assert((granularity & (granularity - 1)) == 0);
5330 granularity >>= BDRV_SECTOR_BITS;
5331 assert(granularity);
5332 bitmap_size = bdrv_nb_sectors(bs);
5333 if (bitmap_size < 0) {
5334 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5335 errno = -bitmap_size;
5336 return NULL;
5338 bitmap = g_new0(BdrvDirtyBitmap, 1);
5339 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5340 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5341 return bitmap;
5344 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5346 BdrvDirtyBitmap *bm, *next;
5347 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5348 if (bm == bitmap) {
5349 QLIST_REMOVE(bitmap, list);
5350 hbitmap_free(bitmap->bitmap);
5351 g_free(bitmap);
5352 return;
5357 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5359 BdrvDirtyBitmap *bm;
5360 BlockDirtyInfoList *list = NULL;
5361 BlockDirtyInfoList **plist = &list;
5363 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5364 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5365 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5366 info->count = bdrv_get_dirty_count(bs, bm);
5367 info->granularity =
5368 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5369 entry->value = info;
5370 *plist = entry;
5371 plist = &entry->next;
5374 return list;
5377 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5379 if (bitmap) {
5380 return hbitmap_get(bitmap->bitmap, sector);
5381 } else {
5382 return 0;
5386 void bdrv_dirty_iter_init(BlockDriverState *bs,
5387 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5389 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5392 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5393 int nr_sectors)
5395 BdrvDirtyBitmap *bitmap;
5396 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5397 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5401 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5403 BdrvDirtyBitmap *bitmap;
5404 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5405 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5409 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5411 return hbitmap_count(bitmap->bitmap);
5414 /* Get a reference to bs */
5415 void bdrv_ref(BlockDriverState *bs)
5417 bs->refcnt++;
5420 /* Release a previously grabbed reference to bs.
5421 * If after releasing, reference count is zero, the BlockDriverState is
5422 * deleted. */
5423 void bdrv_unref(BlockDriverState *bs)
5425 if (!bs) {
5426 return;
5428 assert(bs->refcnt > 0);
5429 if (--bs->refcnt == 0) {
5430 bdrv_delete(bs);
5434 struct BdrvOpBlocker {
5435 Error *reason;
5436 QLIST_ENTRY(BdrvOpBlocker) list;
5439 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5441 BdrvOpBlocker *blocker;
5442 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5443 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5444 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5445 if (errp) {
5446 error_setg(errp, "Device '%s' is busy: %s",
5447 bdrv_get_device_name(bs),
5448 error_get_pretty(blocker->reason));
5450 return true;
5452 return false;
5455 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5457 BdrvOpBlocker *blocker;
5458 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5460 blocker = g_new0(BdrvOpBlocker, 1);
5461 blocker->reason = reason;
5462 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5465 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5467 BdrvOpBlocker *blocker, *next;
5468 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5469 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5470 if (blocker->reason == reason) {
5471 QLIST_REMOVE(blocker, list);
5472 g_free(blocker);
5477 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5479 int i;
5480 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5481 bdrv_op_block(bs, i, reason);
5485 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5487 int i;
5488 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5489 bdrv_op_unblock(bs, i, reason);
5493 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5495 int i;
5497 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5498 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5499 return false;
5502 return true;
5505 void bdrv_iostatus_enable(BlockDriverState *bs)
5507 bs->iostatus_enabled = true;
5508 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5511 /* The I/O status is only enabled if the drive explicitly
5512 * enables it _and_ the VM is configured to stop on errors */
5513 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5515 return (bs->iostatus_enabled &&
5516 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5517 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5518 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5521 void bdrv_iostatus_disable(BlockDriverState *bs)
5523 bs->iostatus_enabled = false;
5526 void bdrv_iostatus_reset(BlockDriverState *bs)
5528 if (bdrv_iostatus_is_enabled(bs)) {
5529 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5530 if (bs->job) {
5531 block_job_iostatus_reset(bs->job);
5536 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5538 assert(bdrv_iostatus_is_enabled(bs));
5539 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5540 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5541 BLOCK_DEVICE_IO_STATUS_FAILED;
5545 void bdrv_img_create(const char *filename, const char *fmt,
5546 const char *base_filename, const char *base_fmt,
5547 char *options, uint64_t img_size, int flags,
5548 Error **errp, bool quiet)
5550 QemuOptsList *create_opts = NULL;
5551 QemuOpts *opts = NULL;
5552 const char *backing_fmt, *backing_file;
5553 int64_t size;
5554 BlockDriver *drv, *proto_drv;
5555 BlockDriver *backing_drv = NULL;
5556 Error *local_err = NULL;
5557 int ret = 0;
5559 /* Find driver and parse its options */
5560 drv = bdrv_find_format(fmt);
5561 if (!drv) {
5562 error_setg(errp, "Unknown file format '%s'", fmt);
5563 return;
5566 proto_drv = bdrv_find_protocol(filename, true);
5567 if (!proto_drv) {
5568 error_setg(errp, "Unknown protocol '%s'", filename);
5569 return;
5572 if (!drv->create_opts) {
5573 error_setg(errp, "Format driver '%s' does not support image creation",
5574 drv->format_name);
5575 return;
5578 if (!proto_drv->create_opts) {
5579 error_setg(errp, "Protocol driver '%s' does not support image creation",
5580 proto_drv->format_name);
5581 return;
5584 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5585 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5587 /* Create parameter list with default values */
5588 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5589 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5591 /* Parse -o options */
5592 if (options) {
5593 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5594 error_setg(errp, "Invalid options for file format '%s'", fmt);
5595 goto out;
5599 if (base_filename) {
5600 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5601 error_setg(errp, "Backing file not supported for file format '%s'",
5602 fmt);
5603 goto out;
5607 if (base_fmt) {
5608 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5609 error_setg(errp, "Backing file format not supported for file "
5610 "format '%s'", fmt);
5611 goto out;
5615 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5616 if (backing_file) {
5617 if (!strcmp(filename, backing_file)) {
5618 error_setg(errp, "Error: Trying to create an image with the "
5619 "same filename as the backing file");
5620 goto out;
5624 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5625 if (backing_fmt) {
5626 backing_drv = bdrv_find_format(backing_fmt);
5627 if (!backing_drv) {
5628 error_setg(errp, "Unknown backing file format '%s'",
5629 backing_fmt);
5630 goto out;
5634 // The size for the image must always be specified, with one exception:
5635 // If we are using a backing file, we can obtain the size from there
5636 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5637 if (size == -1) {
5638 if (backing_file) {
5639 BlockDriverState *bs;
5640 int64_t size;
5641 int back_flags;
5643 /* backing files always opened read-only */
5644 back_flags =
5645 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5647 bs = NULL;
5648 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5649 backing_drv, &local_err);
5650 if (ret < 0) {
5651 goto out;
5653 size = bdrv_getlength(bs);
5654 if (size < 0) {
5655 error_setg_errno(errp, -size, "Could not get size of '%s'",
5656 backing_file);
5657 bdrv_unref(bs);
5658 goto out;
5661 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
5663 bdrv_unref(bs);
5664 } else {
5665 error_setg(errp, "Image creation needs a size parameter");
5666 goto out;
5670 if (!quiet) {
5671 printf("Formatting '%s', fmt=%s", filename, fmt);
5672 qemu_opts_print(opts, " ");
5673 puts("");
5676 ret = bdrv_create(drv, filename, opts, &local_err);
5678 if (ret == -EFBIG) {
5679 /* This is generally a better message than whatever the driver would
5680 * deliver (especially because of the cluster_size_hint), since that
5681 * is most probably not much different from "image too large". */
5682 const char *cluster_size_hint = "";
5683 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5684 cluster_size_hint = " (try using a larger cluster size)";
5686 error_setg(errp, "The image size is too large for file format '%s'"
5687 "%s", fmt, cluster_size_hint);
5688 error_free(local_err);
5689 local_err = NULL;
5692 out:
5693 qemu_opts_del(opts);
5694 qemu_opts_free(create_opts);
5695 if (local_err) {
5696 error_propagate(errp, local_err);
5700 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5702 return bs->aio_context;
5705 void bdrv_detach_aio_context(BlockDriverState *bs)
5707 BdrvAioNotifier *baf;
5709 if (!bs->drv) {
5710 return;
5713 QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5714 baf->detach_aio_context(baf->opaque);
5717 if (bs->io_limits_enabled) {
5718 throttle_detach_aio_context(&bs->throttle_state);
5720 if (bs->drv->bdrv_detach_aio_context) {
5721 bs->drv->bdrv_detach_aio_context(bs);
5723 if (bs->file) {
5724 bdrv_detach_aio_context(bs->file);
5726 if (bs->backing_hd) {
5727 bdrv_detach_aio_context(bs->backing_hd);
5730 bs->aio_context = NULL;
5733 void bdrv_attach_aio_context(BlockDriverState *bs,
5734 AioContext *new_context)
5736 BdrvAioNotifier *ban;
5738 if (!bs->drv) {
5739 return;
5742 bs->aio_context = new_context;
5744 if (bs->backing_hd) {
5745 bdrv_attach_aio_context(bs->backing_hd, new_context);
5747 if (bs->file) {
5748 bdrv_attach_aio_context(bs->file, new_context);
5750 if (bs->drv->bdrv_attach_aio_context) {
5751 bs->drv->bdrv_attach_aio_context(bs, new_context);
5753 if (bs->io_limits_enabled) {
5754 throttle_attach_aio_context(&bs->throttle_state, new_context);
5757 QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5758 ban->attached_aio_context(new_context, ban->opaque);
5762 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5764 bdrv_drain_all(); /* ensure there are no in-flight requests */
5766 bdrv_detach_aio_context(bs);
5768 /* This function executes in the old AioContext so acquire the new one in
5769 * case it runs in a different thread.
5771 aio_context_acquire(new_context);
5772 bdrv_attach_aio_context(bs, new_context);
5773 aio_context_release(new_context);
5776 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5777 void (*attached_aio_context)(AioContext *new_context, void *opaque),
5778 void (*detach_aio_context)(void *opaque), void *opaque)
5780 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5781 *ban = (BdrvAioNotifier){
5782 .attached_aio_context = attached_aio_context,
5783 .detach_aio_context = detach_aio_context,
5784 .opaque = opaque
5787 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5790 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5791 void (*attached_aio_context)(AioContext *,
5792 void *),
5793 void (*detach_aio_context)(void *),
5794 void *opaque)
5796 BdrvAioNotifier *ban, *ban_next;
5798 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5799 if (ban->attached_aio_context == attached_aio_context &&
5800 ban->detach_aio_context == detach_aio_context &&
5801 ban->opaque == opaque)
5803 QLIST_REMOVE(ban, list);
5804 g_free(ban);
5806 return;
5810 abort();
5813 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5814 NotifierWithReturn *notifier)
5816 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5819 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
5820 BlockDriverAmendStatusCB *status_cb)
5822 if (!bs->drv->bdrv_amend_options) {
5823 return -ENOTSUP;
5825 return bs->drv->bdrv_amend_options(bs, opts, status_cb);
5828 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5829 * of block filter and by bdrv_is_first_non_filter.
5830 * It is used to test if the given bs is the candidate or recurse more in the
5831 * node graph.
5833 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5834 BlockDriverState *candidate)
5836 /* return false if basic checks fails */
5837 if (!bs || !bs->drv) {
5838 return false;
5841 /* the code reached a non block filter driver -> check if the bs is
5842 * the same as the candidate. It's the recursion termination condition.
5844 if (!bs->drv->is_filter) {
5845 return bs == candidate;
5847 /* Down this path the driver is a block filter driver */
5849 /* If the block filter recursion method is defined use it to recurse down
5850 * the node graph.
5852 if (bs->drv->bdrv_recurse_is_first_non_filter) {
5853 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5856 /* the driver is a block filter but don't allow to recurse -> return false
5858 return false;
5861 /* This function checks if the candidate is the first non filter bs down it's
5862 * bs chain. Since we don't have pointers to parents it explore all bs chains
5863 * from the top. Some filters can choose not to pass down the recursion.
5865 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5867 BlockDriverState *bs;
5869 /* walk down the bs forest recursively */
5870 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5871 bool perm;
5873 /* try to recurse in this top level bs */
5874 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5876 /* candidate is the first non filter */
5877 if (perm) {
5878 return true;
5882 return false;
5885 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5887 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5888 AioContext *aio_context;
5890 if (!to_replace_bs) {
5891 error_setg(errp, "Node name '%s' not found", node_name);
5892 return NULL;
5895 aio_context = bdrv_get_aio_context(to_replace_bs);
5896 aio_context_acquire(aio_context);
5898 if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5899 to_replace_bs = NULL;
5900 goto out;
5903 /* We don't want arbitrary node of the BDS chain to be replaced only the top
5904 * most non filter in order to prevent data corruption.
5905 * Another benefit is that this tests exclude backing files which are
5906 * blocked by the backing blockers.
5908 if (!bdrv_is_first_non_filter(to_replace_bs)) {
5909 error_setg(errp, "Only top most non filter can be replaced");
5910 to_replace_bs = NULL;
5911 goto out;
5914 out:
5915 aio_context_release(aio_context);
5916 return to_replace_bs;
5919 void bdrv_io_plug(BlockDriverState *bs)
5921 BlockDriver *drv = bs->drv;
5922 if (drv && drv->bdrv_io_plug) {
5923 drv->bdrv_io_plug(bs);
5924 } else if (bs->file) {
5925 bdrv_io_plug(bs->file);
5929 void bdrv_io_unplug(BlockDriverState *bs)
5931 BlockDriver *drv = bs->drv;
5932 if (drv && drv->bdrv_io_unplug) {
5933 drv->bdrv_io_unplug(bs);
5934 } else if (bs->file) {
5935 bdrv_io_unplug(bs->file);
5939 void bdrv_flush_io_queue(BlockDriverState *bs)
5941 BlockDriver *drv = bs->drv;
5942 if (drv && drv->bdrv_flush_io_queue) {
5943 drv->bdrv_flush_io_queue(bs);
5944 } else if (bs->file) {
5945 bdrv_flush_io_queue(bs->file);
5949 static bool append_open_options(QDict *d, BlockDriverState *bs)
5951 const QDictEntry *entry;
5952 bool found_any = false;
5954 for (entry = qdict_first(bs->options); entry;
5955 entry = qdict_next(bs->options, entry))
5957 /* Only take options for this level and exclude all non-driver-specific
5958 * options */
5959 if (!strchr(qdict_entry_key(entry), '.') &&
5960 strcmp(qdict_entry_key(entry), "node-name"))
5962 qobject_incref(qdict_entry_value(entry));
5963 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5964 found_any = true;
5968 return found_any;
5971 /* Updates the following BDS fields:
5972 * - exact_filename: A filename which may be used for opening a block device
5973 * which (mostly) equals the given BDS (even without any
5974 * other options; so reading and writing must return the same
5975 * results, but caching etc. may be different)
5976 * - full_open_options: Options which, when given when opening a block device
5977 * (without a filename), result in a BDS (mostly)
5978 * equalling the given one
5979 * - filename: If exact_filename is set, it is copied here. Otherwise,
5980 * full_open_options is converted to a JSON object, prefixed with
5981 * "json:" (for use through the JSON pseudo protocol) and put here.
5983 void bdrv_refresh_filename(BlockDriverState *bs)
5985 BlockDriver *drv = bs->drv;
5986 QDict *opts;
5988 if (!drv) {
5989 return;
5992 /* This BDS's file name will most probably depend on its file's name, so
5993 * refresh that first */
5994 if (bs->file) {
5995 bdrv_refresh_filename(bs->file);
5998 if (drv->bdrv_refresh_filename) {
5999 /* Obsolete information is of no use here, so drop the old file name
6000 * information before refreshing it */
6001 bs->exact_filename[0] = '\0';
6002 if (bs->full_open_options) {
6003 QDECREF(bs->full_open_options);
6004 bs->full_open_options = NULL;
6007 drv->bdrv_refresh_filename(bs);
6008 } else if (bs->file) {
6009 /* Try to reconstruct valid information from the underlying file */
6010 bool has_open_options;
6012 bs->exact_filename[0] = '\0';
6013 if (bs->full_open_options) {
6014 QDECREF(bs->full_open_options);
6015 bs->full_open_options = NULL;
6018 opts = qdict_new();
6019 has_open_options = append_open_options(opts, bs);
6021 /* If no specific options have been given for this BDS, the filename of
6022 * the underlying file should suffice for this one as well */
6023 if (bs->file->exact_filename[0] && !has_open_options) {
6024 strcpy(bs->exact_filename, bs->file->exact_filename);
6026 /* Reconstructing the full options QDict is simple for most format block
6027 * drivers, as long as the full options are known for the underlying
6028 * file BDS. The full options QDict of that file BDS should somehow
6029 * contain a representation of the filename, therefore the following
6030 * suffices without querying the (exact_)filename of this BDS. */
6031 if (bs->file->full_open_options) {
6032 qdict_put_obj(opts, "driver",
6033 QOBJECT(qstring_from_str(drv->format_name)));
6034 QINCREF(bs->file->full_open_options);
6035 qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6037 bs->full_open_options = opts;
6038 } else {
6039 QDECREF(opts);
6041 } else if (!bs->full_open_options && qdict_size(bs->options)) {
6042 /* There is no underlying file BDS (at least referenced by BDS.file),
6043 * so the full options QDict should be equal to the options given
6044 * specifically for this block device when it was opened (plus the
6045 * driver specification).
6046 * Because those options don't change, there is no need to update
6047 * full_open_options when it's already set. */
6049 opts = qdict_new();
6050 append_open_options(opts, bs);
6051 qdict_put_obj(opts, "driver",
6052 QOBJECT(qstring_from_str(drv->format_name)));
6054 if (bs->exact_filename[0]) {
6055 /* This may not work for all block protocol drivers (some may
6056 * require this filename to be parsed), but we have to find some
6057 * default solution here, so just include it. If some block driver
6058 * does not support pure options without any filename at all or
6059 * needs some special format of the options QDict, it needs to
6060 * implement the driver-specific bdrv_refresh_filename() function.
6062 qdict_put_obj(opts, "filename",
6063 QOBJECT(qstring_from_str(bs->exact_filename)));
6066 bs->full_open_options = opts;
6069 if (bs->exact_filename[0]) {
6070 pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6071 } else if (bs->full_open_options) {
6072 QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6073 snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6074 qstring_get_str(json));
6075 QDECREF(json);
6079 /* This accessor function purpose is to allow the device models to access the
6080 * BlockAcctStats structure embedded inside a BlockDriverState without being
6081 * aware of the BlockDriverState structure layout.
6082 * It will go away when the BlockAcctStats structure will be moved inside
6083 * the device models.
6085 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6087 return &bs->stats;