block: replace g_new0 with g_new for bottom half allocation.
[qemu.git] / block.c
blob58f804228c04106b37385e7cba686614763afa63
1 /*
2 * QEMU System Emulator block driver
4 * Copyright (c) 2003 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "block/block_int.h"
28 #include "block/blockjob.h"
29 #include "qemu/module.h"
30 #include "qapi/qmp/qjson.h"
31 #include "sysemu/block-backend.h"
32 #include "sysemu/sysemu.h"
33 #include "qemu/notify.h"
34 #include "block/coroutine.h"
35 #include "block/qapi.h"
36 #include "qmp-commands.h"
37 #include "qemu/timer.h"
38 #include "qapi-event.h"
40 #ifdef CONFIG_BSD
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <sys/ioctl.h>
44 #include <sys/queue.h>
45 #ifndef __DragonFly__
46 #include <sys/disk.h>
47 #endif
48 #endif
50 #ifdef _WIN32
51 #include <windows.h>
52 #endif
/* A dirty bitmap records which regions of a BlockDriverState have been
 * written; instances live on the bs->dirty_bitmaps list (see bdrv_new()).
 * NOTE(review): the struct's closing brace was lost in extraction.
 */
54 struct BdrvDirtyBitmap {
55 HBitmap *bitmap;
56 QLIST_ENTRY(BdrvDirtyBitmap) list;
/* Sentinel return value used while an emulated synchronous operation is
 * still in progress (see bdrv_create()'s polling loop). */
59 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
61 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
63 BlockCompletionFunc *cb, void *opaque);
64 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
66 BlockCompletionFunc *cb, void *opaque);
67 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68 int64_t sector_num, int nb_sectors,
69 QEMUIOVector *iov);
70 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
73 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
75 BdrvRequestFlags flags);
76 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
78 BdrvRequestFlags flags);
79 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80 int64_t sector_num,
81 QEMUIOVector *qiov,
82 int nb_sectors,
83 BdrvRequestFlags flags,
84 BlockCompletionFunc *cb,
85 void *opaque,
86 bool is_write);
87 static void coroutine_fn bdrv_co_do_rw(void *opaque);
88 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
89 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
/* All BlockDriverStates attached as device roots (see bdrv_new_root()). */
91 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
/* BlockDriverStates that carry a node-name (see bdrv_assign_node_name()). */
94 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
/* All registered block drivers (populated by bdrv_register()). */
97 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98 QLIST_HEAD_INITIALIZER(bdrv_drivers);
100 /* If non-zero, use only whitelisted block drivers */
101 static int use_bdrv_whitelist;
103 #ifdef _WIN32
/* Return non-zero if @filename begins with a DOS drive designator,
 * i.e. an ASCII letter followed by ':' (e.g. "c:", "Z:\\foo").
 */
static int is_windows_drive_prefix(const char *filename)
{
    char c = filename[0];

    return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) &&
           filename[1] == ':';
}
111 int is_windows_drive(const char *filename)
113 if (is_windows_drive_prefix(filename) &&
114 filename[2] == '\0')
115 return 1;
116 if (strstart(filename, "\\\\.\\", NULL) ||
117 strstart(filename, "//./", NULL))
118 return 1;
119 return 0;
121 #endif
123 /* throttling disk I/O limits */
/* Apply throttle configuration @cfg to @bs and kick one queued request in
 * each direction (index 0 = reads, 1 = writes) so the new limits take
 * effect immediately.
 */
124 void bdrv_set_io_limits(BlockDriverState *bs,
125 ThrottleConfig *cfg)
127 int i;
129 throttle_config(&bs->throttle_state, cfg);
131 for (i = 0; i < 2; i++) {
132 qemu_co_enter_next(&bs->throttled_reqs[i]);
136 /* this function drain all the throttled IOs */
/* Temporarily disable the limits, re-enter every queued throttled request
 * in both directions, then restore the previous io_limits_enabled value.
 * Returns true if at least one request was drained.
 */
137 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
139 bool drained = false;
140 bool enabled = bs->io_limits_enabled;
141 int i;
143 bs->io_limits_enabled = false;
145 for (i = 0; i < 2; i++) {
146 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147 drained = true;
151 bs->io_limits_enabled = enabled;
153 return drained;
/* Turn I/O throttling off for @bs: flush any queued throttled requests,
 * then tear down the throttle state.
 */
156 void bdrv_io_limits_disable(BlockDriverState *bs)
158 bs->io_limits_enabled = false;
160 bdrv_start_throttled_reqs(bs);
162 throttle_destroy(&bs->throttle_state);
/* Throttle timer callback for reads: wake the next queued read request. */
165 static void bdrv_throttle_read_timer_cb(void *opaque)
167 BlockDriverState *bs = opaque;
168 qemu_co_enter_next(&bs->throttled_reqs[0]);
/* Throttle timer callback for writes: wake the next queued write request. */
171 static void bdrv_throttle_write_timer_cb(void *opaque)
173 BlockDriverState *bs = opaque;
174 qemu_co_enter_next(&bs->throttled_reqs[1]);
177 /* should be called before bdrv_set_io_limits if a limit is set */
/* Initialise throttle state for @bs on its AioContext's virtual clock and
 * mark throttling as enabled.  Must not already be enabled (asserted).
 */
178 void bdrv_io_limits_enable(BlockDriverState *bs)
180 assert(!bs->io_limits_enabled);
181 throttle_init(&bs->throttle_state,
182 bdrv_get_aio_context(bs),
183 QEMU_CLOCK_VIRTUAL,
184 bdrv_throttle_read_timer_cb,
185 bdrv_throttle_write_timer_cb,
186 bs);
187 bs->io_limits_enabled = true;
190 /* This function makes an IO wait if needed
192 * @nb_sectors: the number of sectors of the IO
193 * @is_write: is the IO a write
/* NOTE(review): the doc comment above says @nb_sectors but the parameter
 * is @bytes, a byte count passed straight to throttle_account(). */
195 static void bdrv_io_limits_intercept(BlockDriverState *bs,
196 unsigned int bytes,
197 bool is_write)
199 /* does this io must wait */
200 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
202 /* if must wait or any request of this type throttled queue the IO */
203 if (must_wait ||
204 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
205 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
208 /* the IO will be executed, do the accounting */
209 throttle_account(&bs->throttle_state, is_write, bytes);
212 /* if the next request must wait -> do nothing */
213 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
214 return;
217 /* else queue next request for execution */
218 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
221 size_t bdrv_opt_mem_align(BlockDriverState *bs)
223 if (!bs || !bs->drv) {
224 /* 4k should be on the safe side */
225 return 4096;
228 return bs->bl.opt_mem_alignment;
/* Return non-zero if @path starts with a "<protocol>:" prefix, i.e. a ':'
 * appears before any path separator.  On Windows, drive designators like
 * "c:" are not treated as protocols.
 */
int path_has_protocol(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    return path[strcspn(path, ":/\\")] == ':';
#else
    /* A ':' occurring before the first '/' marks a protocol prefix. */
    return path[strcspn(path, ":/")] == ':';
#endif
}
/* Return non-zero if @path is absolute.  On Windows this also accepts
 * backslash-rooted paths and drive designators ("\\.\d:", "c:").
 */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return *path == '/' || *path == '\\';
#else
    return *path == '/';
#endif
}
262 /* if filename is absolute, just copy it to dest. Otherwise, build a
263 path to it by considering it is relative to base_path. URL are
264 supported. */
/* Write the combined path into @dest (at most @dest_size bytes including
 * the terminator).  The directory part of @base_path is found by taking
 * whatever follows the last '/' (or '\\' on Windows), but never before a
 * protocol's ':' separator, then @filename is appended to it.
 */
265 void path_combine(char *dest, int dest_size,
266 const char *base_path,
267 const char *filename)
269 const char *p, *p1;
270 int len;
272 if (dest_size <= 0)
273 return;
274 if (path_is_absolute(filename)) {
275 pstrcpy(dest, dest_size, filename);
276 } else {
277 p = strchr(base_path, ':');
278 if (p)
279 p++;
280 else
281 p = base_path;
282 p1 = strrchr(base_path, '/');
283 #ifdef _WIN32
285 const char *p2;
286 p2 = strrchr(base_path, '\\');
287 if (!p1 || p2 > p1)
288 p1 = p2;
290 #endif
291 if (p1)
292 p1++;
293 else
294 p1 = base_path;
295 if (p1 > p)
296 p = p1;
297 len = p - base_path;
298 if (len > dest_size - 1)
299 len = dest_size - 1;
300 memcpy(dest, base_path, len);
301 dest[len] = '\0';
302 pstrcat(dest, dest_size, filename);
/* Copy the full path of bs->backing_file into @dest (@sz bytes).  The name
 * is used as-is when empty or when it carries a protocol prefix; otherwise
 * it is resolved relative to bs->filename via path_combine().
 */
306 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
308 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
309 pstrcpy(dest, sz, bs->backing_file);
310 } else {
311 path_combine(dest, sz, bs->filename, bs->backing_file);
/* Register @bdrv in the global bdrv_drivers list, installing emulation
 * shims for the coroutine and AIO entry points the driver doesn't provide.
 */
315 void bdrv_register(BlockDriver *bdrv)
317 /* Block drivers without coroutine functions need emulation */
318 if (!bdrv->bdrv_co_readv) {
319 bdrv->bdrv_co_readv = bdrv_co_readv_em;
320 bdrv->bdrv_co_writev = bdrv_co_writev_em;
322 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
323 * the block driver lacks aio we need to emulate that too.
325 if (!bdrv->bdrv_aio_readv) {
326 /* add AIO emulation layer */
327 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
328 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
332 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
/* Allocate a new BlockDriverState and insert it into the global
 * bdrv_states device list.  Returns the new state (refcount 1).
 */
335 BlockDriverState *bdrv_new_root(void)
337 BlockDriverState *bs = bdrv_new();
339 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
340 return bs;
/* Allocate and zero-initialise a BlockDriverState: empty dirty-bitmap and
 * op-blocker lists, notifier lists, both throttled-request queues, refcount
 * of 1 and the main-loop AioContext.  Not inserted into any global list.
 */
343 BlockDriverState *bdrv_new(void)
345 BlockDriverState *bs;
346 int i;
348 bs = g_new0(BlockDriverState, 1);
349 QLIST_INIT(&bs->dirty_bitmaps);
350 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
351 QLIST_INIT(&bs->op_blockers[i]);
353 bdrv_iostatus_disable(bs);
354 notifier_list_init(&bs->close_notifiers);
355 notifier_with_return_list_init(&bs->before_write_notifiers);
356 qemu_co_queue_init(&bs->throttled_reqs[0]);
357 qemu_co_queue_init(&bs->throttled_reqs[1]);
358 bs->refcnt = 1;
359 bs->aio_context = qemu_get_aio_context();
361 return bs;
/* Register @notify to be called when @bs is closed. */
364 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
366 notifier_list_add(&bs->close_notifiers, notify);
369 BlockDriver *bdrv_find_format(const char *format_name)
371 BlockDriver *drv1;
372 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
373 if (!strcmp(drv1->format_name, format_name)) {
374 return drv1;
377 return NULL;
/* Return 1 if @drv may be used: either no whitelist is configured at all,
 * or the driver appears in the read-write whitelist, or (when @read_only)
 * in the read-only whitelist.  Returns 0 otherwise.
 */
380 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
382 static const char *whitelist_rw[] = {
383 CONFIG_BDRV_RW_WHITELIST
385 static const char *whitelist_ro[] = {
386 CONFIG_BDRV_RO_WHITELIST
388 const char **p;
390 if (!whitelist_rw[0] && !whitelist_ro[0]) {
391 return 1; /* no whitelist, anything goes */
394 for (p = whitelist_rw; *p; p++) {
395 if (!strcmp(drv->format_name, *p)) {
396 return 1;
399 if (read_only) {
400 for (p = whitelist_ro; *p; p++) {
401 if (!strcmp(drv->format_name, *p)) {
402 return 1;
406 return 0;
409 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
410 bool read_only)
412 BlockDriver *drv = bdrv_find_format(format_name);
413 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
/* Argument/result bundle passed to bdrv_create_co_entry(); ret starts at
 * NOT_DONE and is overwritten when the coroutine finishes.
 */
416 typedef struct CreateCo {
417 BlockDriver *drv;
418 char *filename;
419 QemuOpts *opts;
420 int ret;
421 Error *err;
422 } CreateCo;
/* Coroutine entry point for image creation: invoke the driver's
 * bdrv_create callback and store result/error back into the CreateCo.
 */
424 static void coroutine_fn bdrv_create_co_entry(void *opaque)
426 Error *local_err = NULL;
427 int ret;
429 CreateCo *cco = opaque;
430 assert(cco->drv);
432 ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
433 if (local_err) {
434 error_propagate(&cco->err, local_err);
436 cco->ret = ret;
/* Create an image with @drv.  Runs bdrv_create_co_entry() directly when
 * already in coroutine context, otherwise spawns a coroutine and polls the
 * main AioContext until cco.ret leaves NOT_DONE.  Returns 0 or -errno;
 * errors are reported through @errp.
 */
439 int bdrv_create(BlockDriver *drv, const char* filename,
440 QemuOpts *opts, Error **errp)
442 int ret;
444 Coroutine *co;
445 CreateCo cco = {
446 .drv = drv,
447 .filename = g_strdup(filename),
448 .opts = opts,
449 .ret = NOT_DONE,
450 .err = NULL,
453 if (!drv->bdrv_create) {
454 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
455 ret = -ENOTSUP;
456 goto out;
459 if (qemu_in_coroutine()) {
460 /* Fast-path if already in coroutine context */
461 bdrv_create_co_entry(&cco);
462 } else {
463 co = qemu_coroutine_create(bdrv_create_co_entry);
464 qemu_coroutine_enter(co, &cco);
465 while (cco.ret == NOT_DONE) {
466 aio_poll(qemu_get_aio_context(), true);
470 ret = cco.ret;
471 if (ret < 0) {
472 if (cco.err) {
473 error_propagate(errp, cco.err);
474 } else {
475 error_setg_errno(errp, -ret, "Could not create image");
479 out:
480 g_free(cco.filename);
481 return ret;
/* Create a file at the protocol layer: resolve the protocol driver for
 * @filename and delegate to bdrv_create().  Returns 0 or -errno.
 */
484 int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
486 BlockDriver *drv;
487 Error *local_err = NULL;
488 int ret;
490 drv = bdrv_find_protocol(filename, true);
491 if (drv == NULL) {
492 error_setg(errp, "Could not find protocol for file '%s'", filename);
493 return -ENOENT;
496 ret = bdrv_create(drv, filename, opts, &local_err);
497 if (local_err) {
498 error_propagate(errp, local_err);
500 return ret;
/* Recompute bs->bl: start from zero, inherit limits from bs->file, merge
 * in limits from the backing file (max of opt values, min of non-zero max
 * values), then let the driver's bdrv_refresh_limits override the result.
 */
503 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
505 BlockDriver *drv = bs->drv;
506 Error *local_err = NULL;
508 memset(&bs->bl, 0, sizeof(bs->bl));
510 if (!drv) {
511 return;
514 /* Take some limits from the children as a default */
515 if (bs->file) {
516 bdrv_refresh_limits(bs->file, &local_err);
517 if (local_err) {
518 error_propagate(errp, local_err);
519 return;
521 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
522 bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
523 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
524 } else {
525 bs->bl.opt_mem_alignment = 512;
528 if (bs->backing_hd) {
529 bdrv_refresh_limits(bs->backing_hd, &local_err);
530 if (local_err) {
531 error_propagate(errp, local_err);
532 return;
534 bs->bl.opt_transfer_length =
535 MAX(bs->bl.opt_transfer_length,
536 bs->backing_hd->bl.opt_transfer_length);
537 bs->bl.max_transfer_length =
538 MIN_NON_ZERO(bs->bl.max_transfer_length,
539 bs->backing_hd->bl.max_transfer_length);
540 bs->bl.opt_mem_alignment =
541 MAX(bs->bl.opt_mem_alignment,
542 bs->backing_hd->bl.opt_mem_alignment);
545 /* Then let the driver override it */
546 if (drv->bdrv_refresh_limits) {
547 drv->bdrv_refresh_limits(bs, errp);
552 * Create a uniquely-named empty temporary file.
553 * Return 0 upon success, otherwise a negative errno value.
/* On POSIX the file lives under $TMPDIR (default /var/tmp) and is created
 * with mkstemp(); the fd is closed immediately, leaving the empty file in
 * place.  On Windows GetTempFileName() is used, which requires @size to be
 * at least MAX_PATH (asserted).
 */
555 int get_tmp_filename(char *filename, int size)
557 #ifdef _WIN32
558 char temp_dir[MAX_PATH];
559 /* GetTempFileName requires that its output buffer (4th param)
560 have length MAX_PATH or greater. */
561 assert(size >= MAX_PATH);
562 return (GetTempPath(MAX_PATH, temp_dir)
563 && GetTempFileName(temp_dir, "qem", 0, filename)
564 ? 0 : -GetLastError());
565 #else
566 int fd;
567 const char *tmpdir;
568 tmpdir = getenv("TMPDIR");
569 if (!tmpdir) {
570 tmpdir = "/var/tmp";
572 if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
573 return -EOVERFLOW;
575 fd = mkstemp(filename);
576 if (fd < 0) {
577 return -errno;
579 if (close(fd) != 0) {
580 unlink(filename);
581 return -errno;
583 return 0;
584 #endif
588 * Detect host devices. By convention, /dev/cdrom[N] is always
589 * recognized as a host CDROM.
/* Ask every registered driver's bdrv_probe_device() to score @filename and
 * return the driver with the highest positive score, or NULL if none match.
 */
591 static BlockDriver *find_hdev_driver(const char *filename)
593 int score_max = 0, score;
594 BlockDriver *drv = NULL, *d;
596 QLIST_FOREACH(d, &bdrv_drivers, list) {
597 if (d->bdrv_probe_device) {
598 score = d->bdrv_probe_device(filename);
599 if (score > score_max) {
600 score_max = score;
601 drv = d;
606 return drv;
/* Resolve the protocol driver for @filename.  Host-device probing runs
 * first (see the XXX below); otherwise a filename without a protocol
 * prefix (or when prefixes are disallowed) maps to the plain file driver,
 * and a "<proto>:" prefix is matched against each driver's protocol_name.
 * Returns NULL for an unknown protocol.
 */
609 BlockDriver *bdrv_find_protocol(const char *filename,
610 bool allow_protocol_prefix)
612 BlockDriver *drv1;
613 char protocol[128];
614 int len;
615 const char *p;
617 /* TODO Drivers without bdrv_file_open must be specified explicitly */
620 * XXX(hch): we really should not let host device detection
621 * override an explicit protocol specification, but moving this
622 * later breaks access to device names with colons in them.
623 * Thanks to the brain-dead persistent naming schemes on udev-
624 * based Linux systems those actually are quite common.
626 drv1 = find_hdev_driver(filename);
627 if (drv1) {
628 return drv1;
631 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
632 return &bdrv_file;
635 p = strchr(filename, ':');
636 assert(p != NULL);
637 len = p - filename;
638 if (len > sizeof(protocol) - 1)
639 len = sizeof(protocol) - 1;
640 memcpy(protocol, filename, len);
641 protocol[len] = '\0';
642 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
643 if (drv1->protocol_name &&
644 !strcmp(drv1->protocol_name, protocol)) {
645 return drv1;
648 return NULL;
652 * Guess image format by probing its contents.
653 * This is not a good idea when your image is raw (CVE-2008-2004), but
654 * we do it anyway for backward compatibility.
656 * @buf contains the image's first @buf_size bytes.
657 * @buf_size is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
658 * but can be smaller if the image file is smaller)
659 * @filename is its filename.
661 * For all block drivers, call the bdrv_probe() method to get its
662 * probing score.
663 * Return the first block driver with the highest probing score.
665 BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
666 const char *filename)
668 int score_max = 0, score;
669 BlockDriver *drv = NULL, *d;
671 QLIST_FOREACH(d, &bdrv_drivers, list) {
672 if (d->bdrv_probe) {
673 score = d->bdrv_probe(buf, buf_size, filename);
674 if (score > score_max) {
675 score_max = score;
676 drv = d;
681 return drv;
/* Determine the format driver for @bs by reading up to BLOCK_PROBE_BUF_SIZE
 * header bytes and probing all drivers.  scsi-generic and empty drives get
 * the raw driver unconditionally.  *pdrv receives the driver (or NULL on
 * failure); returns 0 or -errno.
 */
684 static int find_image_format(BlockDriverState *bs, const char *filename,
685 BlockDriver **pdrv, Error **errp)
687 BlockDriver *drv;
688 uint8_t buf[BLOCK_PROBE_BUF_SIZE];
689 int ret = 0;
691 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
692 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
693 *pdrv = &bdrv_raw;
694 return ret;
697 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
698 if (ret < 0) {
699 error_setg_errno(errp, -ret, "Could not read image for determining its "
700 "format");
701 *pdrv = NULL;
702 return ret;
705 drv = bdrv_probe_all(buf, ret, filename);
706 if (!drv) {
707 error_setg(errp, "Could not determine image format: No compatible "
708 "driver found");
709 ret = -ENOENT;
711 *pdrv = drv;
712 return ret;
716 * Set the current 'total_sectors' value
717 * Return 0 on success, -errno on error.
/* Query the driver for the device length when it implements
 * bdrv_getlength(); otherwise trust the caller-supplied @hint.
 */
719 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
721 BlockDriver *drv = bs->drv;
723 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
724 if (bs->sg)
725 return 0;
727 /* query actual device if possible, otherwise just trust the hint */
728 if (drv->bdrv_getlength) {
729 int64_t length = drv->bdrv_getlength(bs);
730 if (length < 0) {
731 return length;
733 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
736 bs->total_sectors = hint;
737 return 0;
741 * Set open flags for a given discard mode
743 * Return 0 on success, -1 if the discard mode was invalid.
745 int bdrv_parse_discard_flags(const char *mode, int *flags)
747 *flags &= ~BDRV_O_UNMAP;
749 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
750 /* do nothing */
751 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
752 *flags |= BDRV_O_UNMAP;
753 } else {
754 return -1;
757 return 0;
761 * Set open flags for a given cache mode
763 * Return 0 on success, -1 if the cache mode was invalid.
765 int bdrv_parse_cache_flags(const char *mode, int *flags)
767 *flags &= ~BDRV_O_CACHE_MASK;
769 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
770 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
771 } else if (!strcmp(mode, "directsync")) {
772 *flags |= BDRV_O_NOCACHE;
773 } else if (!strcmp(mode, "writeback")) {
774 *flags |= BDRV_O_CACHE_WB;
775 } else if (!strcmp(mode, "unsafe")) {
776 *flags |= BDRV_O_CACHE_WB;
777 *flags |= BDRV_O_NO_FLUSH;
778 } else if (!strcmp(mode, "writethrough")) {
779 /* this is the default */
780 } else {
781 return -1;
784 return 0;
788 * The copy-on-read flag is actually a reference count so multiple users may
789 * use the feature without worrying about clobbering its previous state.
790 * Copy-on-read stays enabled until all users have called to disable it.
/* Take one reference on the copy-on-read feature. */
792 void bdrv_enable_copy_on_read(BlockDriverState *bs)
794 bs->copy_on_read++;
/* Drop one copy-on-read reference; must be balanced with a prior enable
 * (asserted).
 */
797 void bdrv_disable_copy_on_read(BlockDriverState *bs)
799 assert(bs->copy_on_read > 0);
800 bs->copy_on_read--;
804 * Returns the flags that a temporary snapshot should get, based on the
805 * originally requested flags (the originally requested image will have flags
806 * like a backing file)
808 static int bdrv_temp_snapshot_flags(int flags)
810 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
814 * Returns the flags that bs->file should get, based on the given flags for
815 * the parent BDS
817 static int bdrv_inherited_flags(int flags)
819 /* Enable protocol handling, disable format probing for bs->file */
820 flags |= BDRV_O_PROTOCOL;
822 /* Our block drivers take care to send flushes and respect unmap policy,
823 * so we can enable both unconditionally on lower layers. */
824 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
826 /* Clear flags that only apply to the top layer */
827 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
829 return flags;
833 * Returns the flags that bs->backing_hd should get, based on the given flags
834 * for the parent BDS
836 static int bdrv_backing_flags(int flags)
838 /* backing files always opened read-only */
839 flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
841 /* snapshot=on is handled on the top layer */
842 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
844 return flags;
/* Compute the flags actually handed to the driver's open callback:
 * writeback caching forced on, block-layer-internal flags stripped, and
 * temporary snapshots made writable.
 */
847 static int bdrv_open_flags(BlockDriverState *bs, int flags)
849 int open_flags = flags | BDRV_O_CACHE_WB;
852 * Clear flags that are internal to the block layer before opening the
853 * image.
855 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
858 * Snapshots should be writable.
860 if (flags & BDRV_O_TEMPORARY) {
861 open_flags |= BDRV_O_RDWR;
864 return open_flags;
/* Validate @node_name (well-formed, no clash with a device id or an
 * existing node name), copy it into bs->node_name and add @bs to the
 * graph_bdrv_states list.  A NULL name is a silent no-op; all failures go
 * through @errp.
 */
867 static void bdrv_assign_node_name(BlockDriverState *bs,
868 const char *node_name,
869 Error **errp)
871 if (!node_name) {
872 return;
875 /* Check for empty string or invalid characters */
876 if (!id_wellformed(node_name)) {
877 error_setg(errp, "Invalid node name");
878 return;
881 /* takes care of avoiding namespaces collisions */
882 if (blk_by_name(node_name)) {
883 error_setg(errp, "node-name=%s is conflicting with a device id",
884 node_name);
885 return;
888 /* takes care of avoiding duplicates node names */
889 if (bdrv_find_node(node_name)) {
890 error_setg(errp, "Duplicate node name");
891 return;
894 /* copy node name into the bs and insert it into the graph list */
895 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
896 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
900 * Common part for opening disk images and files
902 * Removes all processed options from *options.
/* Shared open path: assigns the node name, applies open flags and the
 * whitelist policy, allocates driver-private state, then calls either
 * drv->bdrv_file_open (protocol level, @file must be NULL) or
 * drv->bdrv_open with @file installed as bs->file.  On any failure after
 * allocation, jumps to free_and_fail to undo the partial setup.
 */
904 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
905 QDict *options, int flags, BlockDriver *drv, Error **errp)
907 int ret, open_flags;
908 const char *filename;
909 const char *node_name = NULL;
910 Error *local_err = NULL;
912 assert(drv != NULL);
913 assert(bs->file == NULL);
914 assert(options != NULL && bs->options != options);
916 if (file != NULL) {
917 filename = file->filename;
918 } else {
919 filename = qdict_get_try_str(options, "filename");
922 if (drv->bdrv_needs_filename && !filename) {
923 error_setg(errp, "The '%s' block driver requires a file name",
924 drv->format_name);
925 return -EINVAL;
928 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
930 node_name = qdict_get_try_str(options, "node-name");
931 bdrv_assign_node_name(bs, node_name, &local_err);
932 if (local_err) {
933 error_propagate(errp, local_err);
934 return -EINVAL;
936 qdict_del(options, "node-name");
938 /* bdrv_open() with directly using a protocol as drv. This layer is already
939 * opened, so assign it to bs (while file becomes a closed BlockDriverState)
940 * and return immediately. */
941 if (file != NULL && drv->bdrv_file_open) {
942 bdrv_swap(file, bs);
943 return 0;
946 bs->open_flags = flags;
947 bs->guest_block_size = 512;
948 bs->request_alignment = 512;
949 bs->zero_beyond_eof = true;
950 open_flags = bdrv_open_flags(bs, flags);
951 bs->read_only = !(open_flags & BDRV_O_RDWR);
952 bs->growable = !!(flags & BDRV_O_PROTOCOL);
954 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
955 error_setg(errp,
956 !bs->read_only && bdrv_is_whitelisted(drv, true)
957 ? "Driver '%s' can only be used for read-only devices"
958 : "Driver '%s' is not whitelisted",
959 drv->format_name);
960 return -ENOTSUP;
963 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
964 if (flags & BDRV_O_COPY_ON_READ) {
965 if (!bs->read_only) {
966 bdrv_enable_copy_on_read(bs);
967 } else {
968 error_setg(errp, "Can't use copy-on-read on read-only device");
969 return -EINVAL;
973 if (filename != NULL) {
974 pstrcpy(bs->filename, sizeof(bs->filename), filename);
975 } else {
976 bs->filename[0] = '\0';
978 pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
980 bs->drv = drv;
981 bs->opaque = g_malloc0(drv->instance_size);
983 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
985 /* Open the image, either directly or using a protocol */
986 if (drv->bdrv_file_open) {
987 assert(file == NULL);
988 assert(!drv->bdrv_needs_filename || filename != NULL);
989 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
990 } else {
991 if (file == NULL) {
992 error_setg(errp, "Can't use '%s' as a block driver for the "
993 "protocol level", drv->format_name);
994 ret = -EINVAL;
995 goto free_and_fail;
997 bs->file = file;
998 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
1001 if (ret < 0) {
1002 if (local_err) {
1003 error_propagate(errp, local_err);
1004 } else if (bs->filename[0]) {
1005 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
1006 } else {
1007 error_setg_errno(errp, -ret, "Could not open image");
1009 goto free_and_fail;
1012 ret = refresh_total_sectors(bs, bs->total_sectors);
1013 if (ret < 0) {
1014 error_setg_errno(errp, -ret, "Could not refresh total sector count");
1015 goto free_and_fail;
1018 bdrv_refresh_limits(bs, &local_err);
1019 if (local_err) {
1020 error_propagate(errp, local_err);
1021 ret = -EINVAL;
1022 goto free_and_fail;
1025 assert(bdrv_opt_mem_align(bs) != 0);
1026 assert((bs->request_alignment != 0) || bs->sg);
1027 return 0;
1029 free_and_fail:
1030 bs->file = NULL;
1031 g_free(bs->opaque);
1032 bs->opaque = NULL;
1033 bs->drv = NULL;
1034 return ret;
/* Parse a "json:{...}" pseudo-protocol filename into a flattened options
 * QDict.  Returns NULL (with @errp set) for unparseable JSON or for a JSON
 * value that is not an object.
 */
1037 static QDict *parse_json_filename(const char *filename, Error **errp)
1039 QObject *options_obj;
1040 QDict *options;
1041 int ret;
1043 ret = strstart(filename, "json:", &filename);
1044 assert(ret);
1046 options_obj = qobject_from_json(filename);
1047 if (!options_obj) {
1048 error_setg(errp, "Could not parse the JSON options");
1049 return NULL;
1052 if (qobject_type(options_obj) != QTYPE_QDICT) {
1053 qobject_decref(options_obj);
1054 error_setg(errp, "Invalid JSON object given");
1055 return NULL;
1058 options = qobject_to_qdict(options_obj);
1059 qdict_flatten(options);
1061 return options;
1065 * Fills in default options for opening images and converts the legacy
1066 * filename/flags pair to option QDict entries.
/* Handles the "json:" pseudo-protocol, moves a legacy filename into the
 * options dict, resolves the driver (from @drv, the "driver" option, or
 * protocol detection on the filename), and lets the driver parse the
 * filename into options.  Returns 0 or a negative errno value.
 */
1068 static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
1069 BlockDriver *drv, Error **errp)
1071 const char *filename = *pfilename;
1072 const char *drvname;
1073 bool protocol = flags & BDRV_O_PROTOCOL;
1074 bool parse_filename = false;
1075 Error *local_err = NULL;
1077 /* Parse json: pseudo-protocol */
1078 if (filename && g_str_has_prefix(filename, "json:")) {
1079 QDict *json_options = parse_json_filename(filename, &local_err);
1080 if (local_err) {
1081 error_propagate(errp, local_err);
1082 return -EINVAL;
1085 /* Options given in the filename have lower priority than options
1086 * specified directly */
1087 qdict_join(*options, json_options, false);
1088 QDECREF(json_options);
1089 *pfilename = filename = NULL;
1092 /* Fetch the file name from the options QDict if necessary */
1093 if (protocol && filename) {
1094 if (!qdict_haskey(*options, "filename")) {
1095 qdict_put(*options, "filename", qstring_from_str(filename));
1096 parse_filename = true;
1097 } else {
1098 error_setg(errp, "Can't specify 'file' and 'filename' options at "
1099 "the same time");
1100 return -EINVAL;
1104 /* Find the right block driver */
1105 filename = qdict_get_try_str(*options, "filename");
1106 drvname = qdict_get_try_str(*options, "driver");
1108 if (drv) {
1109 if (drvname) {
1110 error_setg(errp, "Driver specified twice");
1111 return -EINVAL;
1113 drvname = drv->format_name;
1114 qdict_put(*options, "driver", qstring_from_str(drvname));
1115 } else {
1116 if (!drvname && protocol) {
1117 if (filename) {
1118 drv = bdrv_find_protocol(filename, parse_filename);
1119 if (!drv) {
1120 error_setg(errp, "Unknown protocol");
1121 return -EINVAL;
1124 drvname = drv->format_name;
1125 qdict_put(*options, "driver", qstring_from_str(drvname));
1126 } else {
1127 error_setg(errp, "Must specify either driver or file");
1128 return -EINVAL;
1130 } else if (drvname) {
1131 drv = bdrv_find_format(drvname);
1132 if (!drv) {
1133 error_setg(errp, "Unknown driver '%s'", drvname);
1134 return -ENOENT;
1139 assert(drv || !protocol);
1141 /* Driver-specific filename parsing */
1142 if (drv && drv->bdrv_parse_filename && parse_filename) {
1143 drv->bdrv_parse_filename(filename, *options, &local_err);
1144 if (local_err) {
1145 error_propagate(errp, local_err);
1146 return -EINVAL;
1149 if (!drv->bdrv_needs_filename) {
1150 qdict_del(*options, "filename");
1154 return 0;
/* Install @backing_hd as bs->backing_hd: release the op blockers on any
 * previous backing file, copy its filename/format into @bs, and block all
 * operations on the new backing file except COMMIT.  Passing NULL detaches
 * the backing file.  Limits are refreshed on exit.
 */
1157 void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1160 if (bs->backing_hd) {
1161 assert(bs->backing_blocker);
1162 bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1163 } else if (backing_hd) {
1164 error_setg(&bs->backing_blocker,
1165 "device is used as backing hd of '%s'",
1166 bdrv_get_device_name(bs));
1169 bs->backing_hd = backing_hd;
1170 if (!backing_hd) {
1171 error_free(bs->backing_blocker);
1172 bs->backing_blocker = NULL;
1173 goto out;
1175 bs->open_flags &= ~BDRV_O_NO_BACKING;
1176 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1177 pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1178 backing_hd->drv ? backing_hd->drv->format_name : "");
1180 bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1181 /* Otherwise we won't be able to commit due to check in bdrv_commit */
1182 bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
1183 bs->backing_blocker);
1184 out:
1185 bdrv_refresh_limits(bs, NULL);
1189 * Opens the backing file for a BlockDriverState if not yet open
1191 * options is a QDict of options to pass to the block drivers, or NULL for an
1192 * empty set of options. The reference to the QDict is transferred to this
1193 * function (even on failure), so if the caller intends to reuse the dictionary,
1194 * it needs to use QINCREF() before calling bdrv_file_open.
/* Returns 0 (also when there is nothing to open) or a negative errno.  On
 * failure the BDRV_O_NO_BACKING flag is set so the open is not retried.
 */
1196 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1198 char *backing_filename = g_malloc0(PATH_MAX);
1199 int ret = 0;
1200 BlockDriverState *backing_hd;
1201 Error *local_err = NULL;
1203 if (bs->backing_hd != NULL) {
1204 QDECREF(options);
1205 goto free_exit;
1208 /* NULL means an empty set of options */
1209 if (options == NULL) {
1210 options = qdict_new();
1213 bs->open_flags &= ~BDRV_O_NO_BACKING;
1214 if (qdict_haskey(options, "file.filename")) {
1215 backing_filename[0] = '\0';
1216 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1217 QDECREF(options);
1218 goto free_exit;
1219 } else {
1220 bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
1223 if (!bs->drv || !bs->drv->supports_backing) {
1224 ret = -EINVAL;
1225 error_setg(errp, "Driver doesn't support backing files");
1226 QDECREF(options);
1227 goto free_exit;
1230 backing_hd = bdrv_new();
1232 if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
1233 qdict_put(options, "driver", qstring_from_str(bs->backing_format));
1236 assert(bs->backing_hd == NULL);
1237 ret = bdrv_open(&backing_hd,
1238 *backing_filename ? backing_filename : NULL, NULL, options,
1239 bdrv_backing_flags(bs->open_flags), NULL, &local_err);
1240 if (ret < 0) {
1241 bdrv_unref(backing_hd);
1242 backing_hd = NULL;
1243 bs->open_flags |= BDRV_O_NO_BACKING;
1244 error_setg(errp, "Could not open backing file: %s",
1245 error_get_pretty(local_err));
1246 error_free(local_err);
1247 goto free_exit;
1249 bdrv_set_backing_hd(bs, backing_hd);
1251 free_exit:
1252 g_free(backing_filename);
1253 return ret;
/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is false and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    /* Pull every "${bdref_key}." option into its own sub-dict */
    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        /* bdrv_open() would have consumed this reference; drop it ourselves */
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
1308 int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
1310 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1311 char *tmp_filename = g_malloc0(PATH_MAX + 1);
1312 int64_t total_size;
1313 QemuOpts *opts = NULL;
1314 QDict *snapshot_options;
1315 BlockDriverState *bs_snapshot;
1316 Error *local_err;
1317 int ret;
1319 /* if snapshot, we create a temporary backing file and open it
1320 instead of opening 'filename' directly */
1322 /* Get the required size from the image */
1323 total_size = bdrv_getlength(bs);
1324 if (total_size < 0) {
1325 ret = total_size;
1326 error_setg_errno(errp, -total_size, "Could not get image size");
1327 goto out;
1330 /* Create the temporary image */
1331 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
1332 if (ret < 0) {
1333 error_setg_errno(errp, -ret, "Could not get temporary filename");
1334 goto out;
1337 opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
1338 &error_abort);
1339 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
1340 ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
1341 qemu_opts_del(opts);
1342 if (ret < 0) {
1343 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1344 "'%s': %s", tmp_filename,
1345 error_get_pretty(local_err));
1346 error_free(local_err);
1347 goto out;
1350 /* Prepare a new options QDict for the temporary file */
1351 snapshot_options = qdict_new();
1352 qdict_put(snapshot_options, "file.driver",
1353 qstring_from_str("file"));
1354 qdict_put(snapshot_options, "file.filename",
1355 qstring_from_str(tmp_filename));
1357 bs_snapshot = bdrv_new();
1359 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
1360 flags, &bdrv_qcow2, &local_err);
1361 if (ret < 0) {
1362 error_propagate(errp, local_err);
1363 goto out;
1366 bdrv_append(bs_snapshot, bs);
1368 out:
1369 g_free(tmp_filename);
1370 return ret;
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    /* Referencing an existing BDS: just take a reference and return it */
    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    /* bs->options keeps the full set; the clone is consumed piecemeal below */
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* The driver may have taken its own reference to file (bs->file) */
    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
/* One element of a transactional reopen queue (see bdrv_reopen_queue()). */
typedef struct BlockReopenQueueEntry {
    bool prepared;                 /* true once bdrv_reopen_prepare() succeeded */
    BDRVReopenState state;         /* staged reopen state for this BDS */
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;
1601 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1602 * reopen of multiple devices.
1604 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1605 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1606 * be created and initialized. This newly created BlockReopenQueue should be
1607 * passed back in for subsequent calls that are intended to be of the same
1608 * atomic 'set'.
1610 * bs is the BlockDriverState to add to the reopen queue.
1612 * flags contains the open flags for the associated bs
1614 * returns a pointer to bs_queue, which is either the newly allocated
1615 * bs_queue, or the existing bs_queue being used.
1618 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1619 BlockDriverState *bs, int flags)
1621 assert(bs != NULL);
1623 BlockReopenQueueEntry *bs_entry;
1624 if (bs_queue == NULL) {
1625 bs_queue = g_new0(BlockReopenQueue, 1);
1626 QSIMPLEQ_INIT(bs_queue);
1629 /* bdrv_open() masks this flag out */
1630 flags &= ~BDRV_O_PROTOCOL;
1632 if (bs->file) {
1633 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
1636 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1637 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1639 bs_entry->state.bs = bs;
1640 bs_entry->state.flags = flags;
1642 return bs_queue;
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 * Consumes bs_queue: every entry and the queue itself are freed before
 * returning, on both the success and the failure path.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    /* No I/O may be in flight while flags are switched underneath it */
    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        /* Only roll back entries that actually completed prepare() */
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
1699 /* Reopen a single BlockDriverState with the specified flags. */
1700 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1702 int ret = -1;
1703 Error *local_err = NULL;
1704 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1706 ret = bdrv_reopen_multiple(queue, &local_err);
1707 if (local_err != NULL) {
1708 error_propagate(errp, local_err);
1710 return ret;
/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  bdrv_get_device_name(reopen_state->bs));
        goto error;
    }

    /* Flush everything out before the flags change takes effect */
    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                /* Driver failed without an error message: supply a generic one */
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, bdrv_get_device_name(reopen_state->bs),
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}
1787 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1788 * makes them final by swapping the staging BlockDriverState contents into
1789 * the active BlockDriverState contents.
1791 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1793 BlockDriver *drv;
1795 assert(reopen_state != NULL);
1796 drv = reopen_state->bs->drv;
1797 assert(drv != NULL);
1799 /* If there are any driver level actions to take */
1800 if (drv->bdrv_reopen_commit) {
1801 drv->bdrv_reopen_commit(reopen_state);
1804 /* set BDS specific flags now */
1805 reopen_state->bs->open_flags = reopen_state->flags;
1806 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1807 BDRV_O_CACHE_WB);
1808 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1810 bdrv_refresh_limits(reopen_state->bs, NULL);
1814 * Abort the reopen, and delete and free the staged changes in
1815 * reopen_state
1817 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1819 BlockDriver *drv;
1821 assert(reopen_state != NULL);
1822 drv = reopen_state->bs->drv;
1823 assert(drv != NULL);
1825 if (drv->bdrv_reopen_abort) {
1826 drv->bdrv_reopen_abort(reopen_state);
/*
 * Close a BlockDriverState: cancel its job, quiesce and flush all I/O,
 * detach the backing chain, close the driver and reset the BDS fields.
 * The BDS itself is not freed (see bdrv_delete()/bdrv_unref()).
 */
void bdrv_close(BlockDriverState *bs)
{
    BdrvAioNotifier *ban, *ban_next;

    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            BlockDriverState *backing_hd = bs->backing_hd;
            /* Detach first, then drop the reference the chain held */
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;
        QDECREF(bs->full_open_options);
        bs->full_open_options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    if (bs->blk) {
        blk_dev_change_media_cb(bs->blk, false);
    }

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }

    /* Drop all registered AioContext change notifiers */
    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
    QLIST_INIT(&bs->aio_notifiers);
}
1888 void bdrv_close_all(void)
1890 BlockDriverState *bs;
1892 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1893 AioContext *aio_context = bdrv_get_aio_context(bs);
1895 aio_context_acquire(aio_context);
1896 bdrv_close(bs);
1897 aio_context_release(aio_context);
1901 /* Check if any requests are in-flight (including throttled requests) */
1902 static bool bdrv_requests_pending(BlockDriverState *bs)
1904 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1905 return true;
1907 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1908 return true;
1910 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1911 return true;
1913 if (bs->file && bdrv_requests_pending(bs->file)) {
1914 return true;
1916 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1917 return true;
1919 return false;
1922 static bool bdrv_drain_one(BlockDriverState *bs)
1924 bool bs_busy;
1926 bdrv_flush_io_queue(bs);
1927 bdrv_start_throttled_reqs(bs);
1928 bs_busy = bdrv_requests_pending(bs);
1929 bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
1930 return bs_busy;
1934 * Wait for pending requests to complete on a single BlockDriverState subtree
1936 * See the warning in bdrv_drain_all(). This function can only be called if
1937 * you are sure nothing can generate I/O because you have op blockers
1938 * installed.
1940 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
1941 * AioContext.
1943 void bdrv_drain(BlockDriverState *bs)
1945 while (bdrv_drain_one(bs)) {
1946 /* Keep iterating */
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        busy = false;

        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            AioContext *aio_context = bdrv_get_aio_context(bs);

            aio_context_acquire(aio_context);
            /* If any device made progress or is still busy, loop again */
            busy |= bdrv_drain_one(bs);
            aio_context_release(aio_context);
        }
    }
}
/* make a BlockDriverState anonymous by removing from bdrv_state and
 * graph_bdrv_state list.
   Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    /*
     * Take care to remove bs from bdrv_states only when it's actually
     * in it.  Note that bs->device_list.tqe_prev is initially null,
     * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
     * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
     * resetting it to null on remove.
     */
    if (bs->device_list.tqe_prev) {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
        bs->device_list.tqe_prev = NULL;
    }
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    /* Clear the node name unconditionally so a second call is a no-op */
    bs->node_name[0] = '\0';
}
2003 static void bdrv_rebind(BlockDriverState *bs)
2005 if (bs->drv && bs->drv->bdrv_rebind) {
2006 bs->drv->bdrv_rebind(bs);
/* Copy the fields that must stay attached to the guest device (rather
 * than follow the image contents) from bs_src into bs_dest.  Used by
 * bdrv_swap() to undo the effect of the wholesale struct swap for
 * exactly these fields. */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */

    /* dev info */
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    bs_dest->device_list = bs_src->device_list;
    bs_dest->blk = bs_src->blk;

    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
           sizeof(bs_dest->op_blockers));
}
/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* The code needs to swap the node_name but simply swapping node_list won't
     * work so first remove the nodes from the graph list, do the swap then
     * insert them back if needed.
     */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
    }

    /* bs_new must be unattached and shouldn't have anything fancy enabled */
    assert(!bs_new->blk);
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* Wholesale struct swap ... */
    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new must remain unattached */
    assert(!bs_new->blk);

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* insert the nodes back into the graph node list if needed */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
    }

    /* Let the drivers fix up any pointers into the swapped structs */
    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bdrv_set_backing_hd(bs_top, bs_new);
}
/* Final teardown of a BlockDriverState whose refcount dropped to zero:
 * close it, remove it from the global lists and free it. */
static void bdrv_delete(BlockDriverState *bs)
{
    /* Must not still be in use by a job, op blocker or reference holder */
    assert(!bs->job);
    assert(bdrv_op_blocker_is_empty(bs));
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}
2153 * Run consistency checks on an image
2155 * Returns 0 if the check could be completed (it doesn't mean that the image is
2156 * free of errors) or -errno when an internal error occurred. The results of the
2157 * check are stored in res.
2159 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2161 if (bs->drv == NULL) {
2162 return -ENOMEDIUM;
2164 if (bs->drv->bdrv_check == NULL) {
2165 return -ENOTSUP;
2168 memset(res, 0, sizeof(*res));
2169 return bs->drv->bdrv_check(bs, res, fix);
/* Chunk size used when copying data into the backing file */
#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
        bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags =  bs->backing_hd->open_flags;

    /* The backing file must be writable for the duration of the commit */
    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;

    /* qemu_try_blockalign() for bs will choose an alignment that works for
     * bs->backing_hd as well, so no need to compare the alignment manually. */
    buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
    if (buf == NULL) {
        ret = -ENOMEM;
        goto ro_cleanup;
    }

    /* Copy only the clusters actually allocated in the top image */
    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    qemu_vfree(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
2285 int bdrv_commit_all(void)
2287 BlockDriverState *bs;
2289 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2290 AioContext *aio_context = bdrv_get_aio_context(bs);
2292 aio_context_acquire(aio_context);
2293 if (bs->drv && bs->backing_hd) {
2294 int ret = bdrv_commit(bs);
2295 if (ret < 0) {
2296 aio_context_release(aio_context);
2297 return ret;
2300 aio_context_release(aio_context);
2302 return 0;
2306 * Remove an active request from the tracked requests list
2308 * This function should be called when a tracked request is completing.
2310 static void tracked_request_end(BdrvTrackedRequest *req)
2312 if (req->serialising) {
2313 req->bs->serialising_in_flight--;
2316 QLIST_REMOVE(req, list);
2317 qemu_co_queue_restart_all(&req->wait_queue);
/*
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    /* Initialize the whole request; overlap_* start out equal to the
     * request itself and may be widened by mark_request_serialising(). */
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}
/* Mark req as serialising and widen its overlap window to 'align'
 * boundaries so overlapping requests are detected conservatively.
 * 'align' must be a power of two (the & ~(align - 1) rounding relies
 * on it). */
static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    /* Only ever grow the overlap window, never shrink it */
    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
/**
 * Round a region to cluster boundaries
 *
 * If the driver reports no cluster size, the region is returned unchanged.
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
2380 static int bdrv_get_cluster_size(BlockDriverState *bs)
2382 BlockDriverInfo bdi;
2383 int ret;
2385 ret = bdrv_get_info(bs, &bdi);
2386 if (ret < 0 || bdi.cluster_size == 0) {
2387 return bs->request_alignment;
2388 } else {
2389 return bdi.cluster_size;
2393 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2394 int64_t offset, unsigned int bytes)
2396 /* aaaa bbbb */
2397 if (offset >= req->overlap_offset + req->overlap_bytes) {
2398 return false;
2400 /* bbbb aaaa */
2401 if (req->overlap_offset >= offset + bytes) {
2402 return false;
2404 return true;
/* Block the calling coroutine until no serialising request overlaps
 * 'self'.  Returns true if we actually had to wait at least once. */
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    /* Fast path: nothing serialising, nothing to wait for */
    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    /* The list may have changed while we slept: rescan */
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}
2452 * Return values:
2453 * 0 - success
2454 * -EINVAL - backing format specified, but no file
2455 * -ENOSPC - can't update the backing file because no space is left in the
2456 * image file header
2457 * -ENOTSUP - format driver doesn't support changing the backing file
2459 int bdrv_change_backing_file(BlockDriverState *bs,
2460 const char *backing_file, const char *backing_fmt)
2462 BlockDriver *drv = bs->drv;
2463 int ret;
2465 /* Backing file format doesn't make sense without a backing file */
2466 if (backing_fmt && !backing_file) {
2467 return -EINVAL;
2470 if (drv->bdrv_change_backing_file != NULL) {
2471 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2472 } else {
2473 ret = -ENOTSUP;
2476 if (ret == 0) {
2477 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2478 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2480 return ret;
2484 * Finds the image layer in the chain that has 'bs' as its backing file.
2486 * active is the current topmost image.
2488 * Returns NULL if bs is not found in active's image chain,
2489 * or if active == bs.
2491 * Returns the bottommost base image if bs == NULL.
2493 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2494 BlockDriverState *bs)
2496 while (active && bs != active->backing_hd) {
2497 active = active->backing_hd;
2500 return active;
/* Given a BDS, searches for the base layer. */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    /* With a NULL target, bdrv_find_overlay() walks all the way down and
     * returns the bottommost image of bs's backing chain. */
    return bdrv_find_overlay(bs, NULL);
}
/* Bookkeeping node used by bdrv_drop_intermediate() to remember each
 * intermediate image queued for deletion. */
typedef struct BlkIntermediateStates {
    BlockDriverState *bs;                        /* image to drop */
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry; /* deletion-queue linkage */
} BlkIntermediateStates;
2516 * Drops images above 'base' up to and including 'top', and sets the image
2517 * above 'top' to have base as its backing file.
2519 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2520 * information in 'bs' can be properly updated.
2522 * E.g., this will convert the following chain:
2523 * bottom <- base <- intermediate <- top <- active
2525 * to
2527 * bottom <- base <- active
2529 * It is allowed for bottom==base, in which case it converts:
2531 * base <- intermediate <- top <- active
2533 * to
2535 * base <- active
2537 * If backing_file_str is non-NULL, it will be used when modifying top's
2538 * overlay image metadata.
2540 * Error conditions:
2541 * if active == top, that is considered an error
2544 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2545 BlockDriverState *base, const char *backing_file_str)
2547 BlockDriverState *intermediate;
2548 BlockDriverState *base_bs = NULL;
2549 BlockDriverState *new_top_bs = NULL;
2550 BlkIntermediateStates *intermediate_state, *next;
2551 int ret = -EIO;
2553 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2554 QSIMPLEQ_INIT(&states_to_delete);
2556 if (!top->drv || !base->drv) {
2557 goto exit;
2560 new_top_bs = bdrv_find_overlay(active, top);
2562 if (new_top_bs == NULL) {
2563 /* we could not find the image above 'top', this is an error */
2564 goto exit;
2567 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2568 * to do, no intermediate images */
2569 if (new_top_bs->backing_hd == base) {
2570 ret = 0;
2571 goto exit;
2574 intermediate = top;
2576 /* now we will go down through the list, and add each BDS we find
2577 * into our deletion queue, until we hit the 'base'
2579 while (intermediate) {
2580 intermediate_state = g_new0(BlkIntermediateStates, 1);
2581 intermediate_state->bs = intermediate;
2582 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2584 if (intermediate->backing_hd == base) {
2585 base_bs = intermediate->backing_hd;
2586 break;
2588 intermediate = intermediate->backing_hd;
2590 if (base_bs == NULL) {
2591 /* something went wrong, we did not end at the base. safely
2592 * unravel everything, and exit with error */
2593 goto exit;
2596 /* success - we can delete the intermediate states, and link top->base */
2597 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2598 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2599 base_bs->drv ? base_bs->drv->format_name : "");
2600 if (ret) {
2601 goto exit;
2603 bdrv_set_backing_hd(new_top_bs, base_bs);
2605 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2606 /* so that bdrv_close() does not recursively close the chain */
2607 bdrv_set_backing_hd(intermediate_state->bs, NULL);
2608 bdrv_unref(intermediate_state->bs);
2610 ret = 0;
2612 exit:
2613 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2614 g_free(intermediate_state);
2616 return ret;
2620 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2621 size_t size)
2623 int64_t len;
2625 if (size > INT_MAX) {
2626 return -EIO;
2629 if (!bdrv_is_inserted(bs))
2630 return -ENOMEDIUM;
2632 if (bs->growable)
2633 return 0;
2635 len = bdrv_getlength(bs);
2637 if (offset < 0)
2638 return -EIO;
2640 if ((offset > len) || (len - offset < size))
2641 return -EIO;
2643 return 0;
2646 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2647 int nb_sectors)
2649 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2650 return -EIO;
2653 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2654 nb_sectors * BDRV_SECTOR_SIZE);
/* Packages one synchronous read/write request for hand-off into coroutine
 * context; one instance lives on the caller's stack in bdrv_prwv_co(). */
typedef struct RwCo {
    BlockDriverState *bs;   /* device to operate on */
    int64_t offset;         /* byte offset of the request */
    QEMUIOVector *qiov;     /* data buffers; qiov->size is the length */
    bool is_write;          /* write when true, read when false */
    int ret;                /* completion status; NOT_DONE while in flight */
    BdrvRequestFlags flags; /* request flags passed through unchanged */
} RwCo;
2666 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2668 RwCo *rwco = opaque;
2670 if (!rwco->is_write) {
2671 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2672 rwco->qiov->size, rwco->qiov,
2673 rwco->flags);
2674 } else {
2675 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2676 rwco->qiov->size, rwco->qiov,
2677 rwco->flags);
/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,       /* sentinel polled below until completion */
        .flags = flags,
    };

    /*
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        /* Spawn the coroutine, then pump the AioContext until it is done. */
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }
    return rwco.ret;
}
2725 * Process a synchronous request using coroutines
2727 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2728 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2730 QEMUIOVector qiov;
2731 struct iovec iov = {
2732 .iov_base = (void *)buf,
2733 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2736 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2737 return -EINVAL;
2740 qemu_iovec_init_external(&qiov, &iov, 1);
2741 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2742 &qiov, is_write, flags);
2745 /* return < 0 if error. See bdrv_write() for the return codes */
2746 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2747 uint8_t *buf, int nb_sectors)
2749 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2752 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2753 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2754 uint8_t *buf, int nb_sectors)
2756 bool enabled;
2757 int ret;
2759 enabled = bs->io_limits_enabled;
2760 bs->io_limits_enabled = false;
2761 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2762 bs->io_limits_enabled = enabled;
2763 return ret;
2766 /* Return < 0 if error. Important errors are:
2767 -EIO generic I/O error (may happen for all errors)
2768 -ENOMEDIUM No media inserted.
2769 -EINVAL Invalid sector number or nb_sectors
2770 -EACCES Trying to write a read-only device
2772 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2773 const uint8_t *buf, int nb_sectors)
2775 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2778 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2779 int nb_sectors, BdrvRequestFlags flags)
2781 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2782 BDRV_REQ_ZERO_WRITE | flags);
/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
    int n;

    target_sectors = bdrv_nb_sectors(bs);
    if (target_sectors < 0) {
        return target_sectors;
    }

    for (;;) {
        nb_sectors = target_sectors - sector_num;
        if (nb_sectors <= 0) {
            /* Reached the end of the device: everything reads as zero. */
            return 0;
        }
        if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
            /* Clamp so the per-iteration byte count fits in an int. */
            nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            /* This range already reads as zeroes; skip writing it. */
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}
2831 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2833 QEMUIOVector qiov;
2834 struct iovec iov = {
2835 .iov_base = (void *)buf,
2836 .iov_len = bytes,
2838 int ret;
2840 if (bytes < 0) {
2841 return -EINVAL;
2844 qemu_iovec_init_external(&qiov, &iov, 1);
2845 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2846 if (ret < 0) {
2847 return ret;
2850 return bytes;
2853 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2855 int ret;
2857 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2858 if (ret < 0) {
2859 return ret;
2862 return qiov->size;
2865 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2866 const void *buf, int bytes)
2868 QEMUIOVector qiov;
2869 struct iovec iov = {
2870 .iov_base = (void *) buf,
2871 .iov_len = bytes,
2874 if (bytes < 0) {
2875 return -EINVAL;
2878 qemu_iovec_init_external(&qiov, &iov, 1);
2879 return bdrv_pwritev(bs, offset, &qiov);
2883 * Writes to the file and ensures that no writes are reordered across this
2884 * request (acts as a barrier)
2886 * Returns 0 on success, -errno in error cases.
2888 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2889 const void *buf, int count)
2891 int ret;
2893 ret = bdrv_pwrite(bs, offset, buf, count);
2894 if (ret < 0) {
2895 return ret;
2898 /* No flush needed for cache modes that already do it */
2899 if (bs->enable_write_cache) {
2900 bdrv_flush(bs);
2903 return 0;
/* Copy-on-read slow path: read a whole cluster through a bounce buffer,
 * write it into the top image, then copy the requested sub-range out to the
 * caller's qiov.  Returns 0 on success, -errno on failure. */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    /* Read the whole cluster (from the backing chain) into the bounce
     * buffer. */
    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        /* All-zero cluster: store it via the efficient zero-write path. */
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    /* Hand the caller only the sub-range it actually asked for. */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    /* Caller guarantees sector alignment of offset and bytes. */
    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    wait_serialising_requests(req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            /* Not fully allocated in the top layer: take the copy-on-read
             * bounce-buffer path instead of a plain driver read. */
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!(bs->zero_beyond_eof && bs->growable)) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF of growable BDSes */
        int64_t total_sectors, max_nb_sectors;

        total_sectors = bdrv_nb_sectors(bs);
        if (total_sectors < 0) {
            ret = total_sectors;
            goto out;
        }

        /* Largest aligned read that stays within the image. */
        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                  align >> BDRV_SECTOR_BITS);
        if (nb_sectors < max_nb_sectors) {
            ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
        } else if (max_nb_sectors > 0) {
            QEMUIOVector local_qiov;

            /* Clip the driver read to the part that exists in the image. */
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, 0,
                              max_nb_sectors * BDRV_SECTOR_SIZE);

            ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
                                     &local_qiov);

            qemu_iovec_destroy(&local_qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    return ret;
}
/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        /* Unaligned head: prepend a scratch buffer covering the pad. */
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        /* Unaligned tail: append a scratch buffer for the trailing pad. */
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, false);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}
3141 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3142 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3143 BdrvRequestFlags flags)
3145 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3146 return -EINVAL;
3149 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3150 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3153 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3154 int nb_sectors, QEMUIOVector *qiov)
3156 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3158 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3161 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3162 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3164 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3166 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3167 BDRV_REQ_COPY_ON_READ);
3170 /* if no limit is specified in the BlockLimits use a default
3171 * of 32768 512-byte sectors (16 MiB) per request.
3173 #define MAX_WRITE_ZEROES_DEFAULT 32768
/* Write zeroes to [sector_num, sector_num + nb_sectors), preferring the
 * driver's efficient zero-write callback and falling back to writing a
 * zero-filled bounce buffer.  Requests are split to honour the driver's
 * alignment and size limits. */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    /* Per-request cap: driver limit, or 32768 sectors (16 MiB) default. */
    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector. */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                /* Allocate (and zero) the buffer lazily on first use. */
                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep bounce buffer around if it is big enough for all
             * all future requests.
             */
            if (num < max_write_zeroes) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

fail:
    qemu_vfree(iov.iov_base);
    return ret;
}
/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    /* Caller guarantees sector alignment of offset and bytes. */
    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    /* If we waited, our serialising marker must have been set before the
     * wait, otherwise another overlapping write could sneak in. */
    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        /* Zero detection: convert an all-zero payload into a zero write. */
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);

    if (ret == 0 && !bs->enable_write_cache) {
        /* Writethrough semantics: flush after each successful write. */
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);

    if (bs->growable && ret >= 0) {
        /* A successful write may have extended a growable device. */
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}
/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, true);

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        /* RMW head: read the full first block, then splice our data after
         * the preserved prefix. */
        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        /* RMW tail: read the full last block so the preserved suffix can be
         * appended after our data. */
        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
                                  align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);

    return ret;
}
3432 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3433 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3434 BdrvRequestFlags flags)
3436 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3437 return -EINVAL;
3440 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3441 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3444 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3445 int nb_sectors, QEMUIOVector *qiov)
3447 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3449 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3452 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3453 int64_t sector_num, int nb_sectors,
3454 BdrvRequestFlags flags)
3456 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3458 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3459 flags &= ~BDRV_REQ_MAY_UNMAP;
3462 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3463 BDRV_REQ_ZERO_WRITE | flags);
3467 * Truncate file to 'offset' bytes (needed only for file protocols)
3469 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3471 BlockDriver *drv = bs->drv;
3472 int ret;
3473 if (!drv)
3474 return -ENOMEDIUM;
3475 if (!drv->bdrv_truncate)
3476 return -ENOTSUP;
3477 if (bs->read_only)
3478 return -EACCES;
3480 ret = drv->bdrv_truncate(bs, offset);
3481 if (ret == 0) {
3482 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3483 if (bs->blk) {
3484 blk_dev_resize_cb(bs->blk);
3487 return ret;
3491 * Length of a allocated file in bytes. Sparse files are counted by actual
3492 * allocated space. Return < 0 if error or unknown.
3494 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3496 BlockDriver *drv = bs->drv;
3497 if (!drv) {
3498 return -ENOMEDIUM;
3500 if (drv->bdrv_get_allocated_file_size) {
3501 return drv->bdrv_get_allocated_file_size(bs);
3503 if (bs->file) {
3504 return bdrv_get_allocated_file_size(bs->file);
3506 return -ENOTSUP;
3510 * Return number of sectors on success, -errno on error.
3512 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3514 BlockDriver *drv = bs->drv;
3516 if (!drv)
3517 return -ENOMEDIUM;
3519 if (drv->has_variable_length) {
3520 int ret = refresh_total_sectors(bs, bs->total_sectors);
3521 if (ret < 0) {
3522 return ret;
3525 return bs->total_sectors;
3529 * Return length in bytes on success, -errno on error.
3530 * The length is always a multiple of BDRV_SECTOR_SIZE.
3532 int64_t bdrv_getlength(BlockDriverState *bs)
3534 int64_t ret = bdrv_nb_sectors(bs);
3536 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3539 /* return 0 as number of sectors if no device present or error */
3540 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3542 int64_t nb_sectors = bdrv_nb_sectors(bs);
3544 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
/* Record the error policies a device model chose for read and write errors;
 * they are consulted later by bdrv_get_error_action(). */
void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}
3554 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3556 return is_read ? bs->on_read_error : bs->on_write_error;
3559 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3561 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3563 switch (on_err) {
3564 case BLOCKDEV_ON_ERROR_ENOSPC:
3565 return (error == ENOSPC) ?
3566 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3567 case BLOCKDEV_ON_ERROR_STOP:
3568 return BLOCK_ERROR_ACTION_STOP;
3569 case BLOCKDEV_ON_ERROR_REPORT:
3570 return BLOCK_ERROR_ACTION_REPORT;
3571 case BLOCKDEV_ON_ERROR_IGNORE:
3572 return BLOCK_ERROR_ACTION_IGNORE;
3573 default:
3574 abort();
3578 static void send_qmp_error_event(BlockDriverState *bs,
3579 BlockErrorAction action,
3580 bool is_read, int error)
3582 IoOperationType optype;
3584 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3585 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3586 bdrv_iostatus_is_enabled(bs),
3587 error == ENOSPC, strerror(error),
3588 &error_abort);
3591 /* This is done by device models because, while the block layer knows
3592 * about the error, it does not know whether an operation comes from
3593 * the device or the block layer (from a job, for example).
3595 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3596 bool is_read, int error)
3598 assert(error >= 0);
3600 if (action == BLOCK_ERROR_ACTION_STOP) {
3601 /* First set the iostatus, so that "info block" returns an iostatus
3602 * that matches the events raised so far (an additional error iostatus
3603 * is fine, but not a lost one).
3605 bdrv_iostatus_set_err(bs, error);
3607 /* Then raise the request to stop the VM and the event.
3608 * qemu_system_vmstop_request_prepare has two effects. First,
3609 * it ensures that the STOP event always comes after the
3610 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3611 * can observe the STOP event and do a "cont" before the STOP
3612 * event is issued, the VM will not stop. In this case, vm_start()
3613 * also ensures that the STOP/RESUME pair of events is emitted.
3615 qemu_system_vmstop_request_prepare();
3616 send_qmp_error_event(bs, action, is_read, error);
3617 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3618 } else {
3619 send_qmp_error_event(bs, action, is_read, error);
3623 int bdrv_is_read_only(BlockDriverState *bs)
3625 return bs->read_only;
3628 int bdrv_is_sg(BlockDriverState *bs)
3630 return bs->sg;
3633 int bdrv_enable_write_cache(BlockDriverState *bs)
3635 return bs->enable_write_cache;
3638 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3640 bs->enable_write_cache = wce;
3642 /* so a reopen() will preserve wce */
3643 if (wce) {
3644 bs->open_flags |= BDRV_O_CACHE_WB;
3645 } else {
3646 bs->open_flags &= ~BDRV_O_CACHE_WB;
3650 int bdrv_is_encrypted(BlockDriverState *bs)
3652 if (bs->backing_hd && bs->backing_hd->encrypted)
3653 return 1;
3654 return bs->encrypted;
3657 int bdrv_key_required(BlockDriverState *bs)
3659 BlockDriverState *backing_hd = bs->backing_hd;
3661 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3662 return 1;
3663 return (bs->encrypted && !bs->valid_key);
3666 int bdrv_set_key(BlockDriverState *bs, const char *key)
3668 int ret;
3669 if (bs->backing_hd && bs->backing_hd->encrypted) {
3670 ret = bdrv_set_key(bs->backing_hd, key);
3671 if (ret < 0)
3672 return ret;
3673 if (!bs->encrypted)
3674 return 0;
3676 if (!bs->encrypted) {
3677 return -EINVAL;
3678 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3679 return -ENOMEDIUM;
3681 ret = bs->drv->bdrv_set_key(bs, key);
3682 if (ret < 0) {
3683 bs->valid_key = 0;
3684 } else if (!bs->valid_key) {
3685 bs->valid_key = 1;
3686 if (bs->blk) {
3687 /* call the change callback now, we skipped it on open */
3688 blk_dev_change_media_cb(bs->blk, true);
3691 return ret;
3694 const char *bdrv_get_format_name(BlockDriverState *bs)
3696 return bs->drv ? bs->drv->format_name : NULL;
/*
 * qsort(3) comparison callback for an array of 'const char *'.
 *
 * qsort hands us pointers to the array *slots*, not the strings, so the
 * slots must be dereferenced before comparing.  The previous code passed
 * the slot pointers straight to strcmp(), comparing the raw pointer bytes
 * instead of the format names and producing an arbitrary order.
 */
static int qsort_strcmp(const void *a, const void *b)
{
    return strcmp(*(const char *const *)a, *(const char *const *)b);
}
3704 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3705 void *opaque)
3707 BlockDriver *drv;
3708 int count = 0;
3709 int i;
3710 const char **formats = NULL;
3712 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3713 if (drv->format_name) {
3714 bool found = false;
3715 int i = count;
3716 while (formats && i && !found) {
3717 found = !strcmp(formats[--i], drv->format_name);
3720 if (!found) {
3721 formats = g_renew(const char *, formats, count + 1);
3722 formats[count++] = drv->format_name;
3727 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3729 for (i = 0; i < count; i++) {
3730 it(opaque, formats[i]);
3733 g_free(formats);
3736 /* This function is to find block backend bs */
3737 /* TODO convert callers to blk_by_name(), then remove */
3738 BlockDriverState *bdrv_find(const char *name)
3740 BlockBackend *blk = blk_by_name(name);
3742 return blk ? blk_bs(blk) : NULL;
3745 /* This function is to find a node in the bs graph */
3746 BlockDriverState *bdrv_find_node(const char *node_name)
3748 BlockDriverState *bs;
3750 assert(node_name);
3752 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3753 if (!strcmp(node_name, bs->node_name)) {
3754 return bs;
3757 return NULL;
3760 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3761 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3763 BlockDeviceInfoList *list, *entry;
3764 BlockDriverState *bs;
3766 list = NULL;
3767 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3768 entry = g_malloc0(sizeof(*entry));
3769 entry->value = bdrv_block_device_info(bs);
3770 entry->next = list;
3771 list = entry;
3774 return list;
3777 BlockDriverState *bdrv_lookup_bs(const char *device,
3778 const char *node_name,
3779 Error **errp)
3781 BlockBackend *blk;
3782 BlockDriverState *bs;
3784 if (device) {
3785 blk = blk_by_name(device);
3787 if (blk) {
3788 return blk_bs(blk);
3792 if (node_name) {
3793 bs = bdrv_find_node(node_name);
3795 if (bs) {
3796 return bs;
3800 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3801 device ? device : "",
3802 node_name ? node_name : "");
3803 return NULL;
3806 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3807 * return false. If either argument is NULL, return false. */
3808 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3810 while (top && top != base) {
3811 top = top->backing_hd;
3814 return top != NULL;
3817 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3819 if (!bs) {
3820 return QTAILQ_FIRST(&graph_bdrv_states);
3822 return QTAILQ_NEXT(bs, node_list);
3825 BlockDriverState *bdrv_next(BlockDriverState *bs)
3827 if (!bs) {
3828 return QTAILQ_FIRST(&bdrv_states);
3830 return QTAILQ_NEXT(bs, device_list);
3833 const char *bdrv_get_node_name(const BlockDriverState *bs)
3835 return bs->node_name;
3838 /* TODO check what callers really want: bs->node_name or blk_name() */
3839 const char *bdrv_get_device_name(const BlockDriverState *bs)
3841 return bs->blk ? blk_name(bs->blk) : "";
3844 int bdrv_get_flags(BlockDriverState *bs)
3846 return bs->open_flags;
3849 int bdrv_flush_all(void)
3851 BlockDriverState *bs;
3852 int result = 0;
3854 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3855 AioContext *aio_context = bdrv_get_aio_context(bs);
3856 int ret;
3858 aio_context_acquire(aio_context);
3859 ret = bdrv_flush(bs);
3860 if (ret < 0 && !result) {
3861 result = ret;
3863 aio_context_release(aio_context);
3866 return result;
3869 int bdrv_has_zero_init_1(BlockDriverState *bs)
3871 return 1;
3874 int bdrv_has_zero_init(BlockDriverState *bs)
3876 assert(bs->drv);
3878 /* If BS is a copy on write image, it is initialized to
3879 the contents of the base image, which may not be zeroes. */
3880 if (bs->backing_hd) {
3881 return 0;
3883 if (bs->drv->bdrv_has_zero_init) {
3884 return bs->drv->bdrv_has_zero_init(bs);
3887 /* safe default */
3888 return 0;
3891 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3893 BlockDriverInfo bdi;
3895 if (bs->backing_hd) {
3896 return false;
3899 if (bdrv_get_info(bs, &bdi) == 0) {
3900 return bdi.unallocated_blocks_are_zero;
3903 return false;
3906 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3908 BlockDriverInfo bdi;
3910 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3911 return false;
3914 if (bdrv_get_info(bs, &bdi) == 0) {
3915 return bdi.can_write_zeroes_with_unmap;
3918 return false;
3921 typedef struct BdrvCoGetBlockStatusData {
3922 BlockDriverState *bs;
3923 BlockDriverState *base;
3924 int64_t sector_num;
3925 int nb_sectors;
3926 int *pnum;
3927 int64_t ret;
3928 bool done;
3929 } BdrvCoGetBlockStatusData;
3932 * Returns the allocation status of the specified sectors.
3933 * Drivers not implementing the functionality are assumed to not support
3934 * backing files, hence all their sectors are reported as allocated.
3936 * If 'sector_num' is beyond the end of the disk image the return value is 0
3937 * and 'pnum' is set to 0.
3939 * 'pnum' is set to the number of sectors (including and immediately following
3940 * the specified sector) that are known to be in the same
3941 * allocated/unallocated state.
3943 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3944 * beyond the end of the disk image it will be clamped.
3946 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3947 int64_t sector_num,
3948 int nb_sectors, int *pnum)
3950 int64_t total_sectors;
3951 int64_t n;
3952 int64_t ret, ret2;
3954 total_sectors = bdrv_nb_sectors(bs);
3955 if (total_sectors < 0) {
3956 return total_sectors;
3959 if (sector_num >= total_sectors) {
3960 *pnum = 0;
3961 return 0;
3964 n = total_sectors - sector_num;
3965 if (n < nb_sectors) {
3966 nb_sectors = n;
3969 if (!bs->drv->bdrv_co_get_block_status) {
3970 *pnum = nb_sectors;
3971 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3972 if (bs->drv->protocol_name) {
3973 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3975 return ret;
3978 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3979 if (ret < 0) {
3980 *pnum = 0;
3981 return ret;
3984 if (ret & BDRV_BLOCK_RAW) {
3985 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3986 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3987 *pnum, pnum);
3990 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
3991 ret |= BDRV_BLOCK_ALLOCATED;
3994 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3995 if (bdrv_unallocated_blocks_are_zero(bs)) {
3996 ret |= BDRV_BLOCK_ZERO;
3997 } else if (bs->backing_hd) {
3998 BlockDriverState *bs2 = bs->backing_hd;
3999 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4000 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4001 ret |= BDRV_BLOCK_ZERO;
4006 if (bs->file &&
4007 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4008 (ret & BDRV_BLOCK_OFFSET_VALID)) {
4009 int file_pnum;
4011 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4012 *pnum, &file_pnum);
4013 if (ret2 >= 0) {
4014 /* Ignore errors. This is just providing extra information, it
4015 * is useful but not necessary.
4017 if (!file_pnum) {
4018 /* !file_pnum indicates an offset at or beyond the EOF; it is
4019 * perfectly valid for the format block driver to point to such
4020 * offsets, so catch it and mark everything as zero */
4021 ret |= BDRV_BLOCK_ZERO;
4022 } else {
4023 /* Limit request to the range reported by the protocol driver */
4024 *pnum = file_pnum;
4025 ret |= (ret2 & BDRV_BLOCK_ZERO);
4030 return ret;
4033 /* Coroutine wrapper for bdrv_get_block_status() */
4034 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4036 BdrvCoGetBlockStatusData *data = opaque;
4037 BlockDriverState *bs = data->bs;
4039 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4040 data->pnum);
4041 data->done = true;
4045 * Synchronous wrapper around bdrv_co_get_block_status().
4047 * See bdrv_co_get_block_status() for details.
4049 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4050 int nb_sectors, int *pnum)
4052 Coroutine *co;
4053 BdrvCoGetBlockStatusData data = {
4054 .bs = bs,
4055 .sector_num = sector_num,
4056 .nb_sectors = nb_sectors,
4057 .pnum = pnum,
4058 .done = false,
4061 if (qemu_in_coroutine()) {
4062 /* Fast-path if already in coroutine context */
4063 bdrv_get_block_status_co_entry(&data);
4064 } else {
4065 AioContext *aio_context = bdrv_get_aio_context(bs);
4067 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4068 qemu_coroutine_enter(co, &data);
4069 while (!data.done) {
4070 aio_poll(aio_context, true);
4073 return data.ret;
4076 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4077 int nb_sectors, int *pnum)
4079 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4080 if (ret < 0) {
4081 return ret;
4083 return !!(ret & BDRV_BLOCK_ALLOCATED);
4087 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4089 * Return true if the given sector is allocated in any image between
4090 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4091 * sector is allocated in any image of the chain. Return false otherwise.
4093 * 'pnum' is set to the number of sectors (including and immediately following
4094 * the specified sector) that are known to be in the same
4095 * allocated/unallocated state.
4098 int bdrv_is_allocated_above(BlockDriverState *top,
4099 BlockDriverState *base,
4100 int64_t sector_num,
4101 int nb_sectors, int *pnum)
4103 BlockDriverState *intermediate;
4104 int ret, n = nb_sectors;
4106 intermediate = top;
4107 while (intermediate && intermediate != base) {
4108 int pnum_inter;
4109 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4110 &pnum_inter);
4111 if (ret < 0) {
4112 return ret;
4113 } else if (ret) {
4114 *pnum = pnum_inter;
4115 return 1;
4119 * [sector_num, nb_sectors] is unallocated on top but intermediate
4120 * might have
4122 * [sector_num+x, nr_sectors] allocated.
4124 if (n > pnum_inter &&
4125 (intermediate == top ||
4126 sector_num + pnum_inter < intermediate->total_sectors)) {
4127 n = pnum_inter;
4130 intermediate = intermediate->backing_hd;
4133 *pnum = n;
4134 return 0;
4137 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4139 if (bs->backing_hd && bs->backing_hd->encrypted)
4140 return bs->backing_file;
4141 else if (bs->encrypted)
4142 return bs->filename;
4143 else
4144 return NULL;
4147 void bdrv_get_backing_filename(BlockDriverState *bs,
4148 char *filename, int filename_size)
4150 pstrcpy(filename, filename_size, bs->backing_file);
4153 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4154 const uint8_t *buf, int nb_sectors)
4156 BlockDriver *drv = bs->drv;
4157 if (!drv)
4158 return -ENOMEDIUM;
4159 if (!drv->bdrv_write_compressed)
4160 return -ENOTSUP;
4161 if (bdrv_check_request(bs, sector_num, nb_sectors))
4162 return -EIO;
4164 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4166 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4169 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4171 BlockDriver *drv = bs->drv;
4172 if (!drv)
4173 return -ENOMEDIUM;
4174 if (!drv->bdrv_get_info)
4175 return -ENOTSUP;
4176 memset(bdi, 0, sizeof(*bdi));
4177 return drv->bdrv_get_info(bs, bdi);
4180 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4182 BlockDriver *drv = bs->drv;
4183 if (drv && drv->bdrv_get_specific_info) {
4184 return drv->bdrv_get_specific_info(bs);
4186 return NULL;
4189 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4190 int64_t pos, int size)
4192 QEMUIOVector qiov;
4193 struct iovec iov = {
4194 .iov_base = (void *) buf,
4195 .iov_len = size,
4198 qemu_iovec_init_external(&qiov, &iov, 1);
4199 return bdrv_writev_vmstate(bs, &qiov, pos);
4202 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4204 BlockDriver *drv = bs->drv;
4206 if (!drv) {
4207 return -ENOMEDIUM;
4208 } else if (drv->bdrv_save_vmstate) {
4209 return drv->bdrv_save_vmstate(bs, qiov, pos);
4210 } else if (bs->file) {
4211 return bdrv_writev_vmstate(bs->file, qiov, pos);
4214 return -ENOTSUP;
4217 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4218 int64_t pos, int size)
4220 BlockDriver *drv = bs->drv;
4221 if (!drv)
4222 return -ENOMEDIUM;
4223 if (drv->bdrv_load_vmstate)
4224 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4225 if (bs->file)
4226 return bdrv_load_vmstate(bs->file, buf, pos, size);
4227 return -ENOTSUP;
4230 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4232 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4233 return;
4236 bs->drv->bdrv_debug_event(bs, event);
4239 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4240 const char *tag)
4242 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4243 bs = bs->file;
4246 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4247 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4250 return -ENOTSUP;
4253 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4255 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4256 bs = bs->file;
4259 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4260 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4263 return -ENOTSUP;
4266 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4268 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4269 bs = bs->file;
4272 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4273 return bs->drv->bdrv_debug_resume(bs, tag);
4276 return -ENOTSUP;
4279 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4281 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4282 bs = bs->file;
4285 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4286 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4289 return false;
4292 int bdrv_is_snapshot(BlockDriverState *bs)
4294 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4297 /* backing_file can either be relative, or absolute, or a protocol. If it is
4298 * relative, it must be relative to the chain. So, passing in bs->filename
4299 * from a BDS as backing_file should not be done, as that may be relative to
4300 * the CWD rather than the chain. */
4301 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4302 const char *backing_file)
4304 char *filename_full = NULL;
4305 char *backing_file_full = NULL;
4306 char *filename_tmp = NULL;
4307 int is_protocol = 0;
4308 BlockDriverState *curr_bs = NULL;
4309 BlockDriverState *retval = NULL;
4311 if (!bs || !bs->drv || !backing_file) {
4312 return NULL;
4315 filename_full = g_malloc(PATH_MAX);
4316 backing_file_full = g_malloc(PATH_MAX);
4317 filename_tmp = g_malloc(PATH_MAX);
4319 is_protocol = path_has_protocol(backing_file);
4321 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4323 /* If either of the filename paths is actually a protocol, then
4324 * compare unmodified paths; otherwise make paths relative */
4325 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4326 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4327 retval = curr_bs->backing_hd;
4328 break;
4330 } else {
4331 /* If not an absolute filename path, make it relative to the current
4332 * image's filename path */
4333 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4334 backing_file);
4336 /* We are going to compare absolute pathnames */
4337 if (!realpath(filename_tmp, filename_full)) {
4338 continue;
4341 /* We need to make sure the backing filename we are comparing against
4342 * is relative to the current image filename (or absolute) */
4343 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4344 curr_bs->backing_file);
4346 if (!realpath(filename_tmp, backing_file_full)) {
4347 continue;
4350 if (strcmp(backing_file_full, filename_full) == 0) {
4351 retval = curr_bs->backing_hd;
4352 break;
4357 g_free(filename_full);
4358 g_free(backing_file_full);
4359 g_free(filename_tmp);
4360 return retval;
4363 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4365 if (!bs->drv) {
4366 return 0;
4369 if (!bs->backing_hd) {
4370 return 0;
4373 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4376 /**************************************************************/
4377 /* async I/Os */
4379 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4380 QEMUIOVector *qiov, int nb_sectors,
4381 BlockCompletionFunc *cb, void *opaque)
4383 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4385 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4386 cb, opaque, false);
4389 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4390 QEMUIOVector *qiov, int nb_sectors,
4391 BlockCompletionFunc *cb, void *opaque)
4393 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4395 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4396 cb, opaque, true);
4399 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4400 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4401 BlockCompletionFunc *cb, void *opaque)
4403 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4405 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4406 BDRV_REQ_ZERO_WRITE | flags,
4407 cb, opaque, true);
4411 typedef struct MultiwriteCB {
4412 int error;
4413 int num_requests;
4414 int num_callbacks;
4415 struct {
4416 BlockCompletionFunc *cb;
4417 void *opaque;
4418 QEMUIOVector *free_qiov;
4419 } callbacks[];
4420 } MultiwriteCB;
4422 static void multiwrite_user_cb(MultiwriteCB *mcb)
4424 int i;
4426 for (i = 0; i < mcb->num_callbacks; i++) {
4427 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4428 if (mcb->callbacks[i].free_qiov) {
4429 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4431 g_free(mcb->callbacks[i].free_qiov);
4435 static void multiwrite_cb(void *opaque, int ret)
4437 MultiwriteCB *mcb = opaque;
4439 trace_multiwrite_cb(mcb, ret);
4441 if (ret < 0 && !mcb->error) {
4442 mcb->error = ret;
4445 mcb->num_requests--;
4446 if (mcb->num_requests == 0) {
4447 multiwrite_user_cb(mcb);
4448 g_free(mcb);
4452 static int multiwrite_req_compare(const void *a, const void *b)
4454 const BlockRequest *req1 = a, *req2 = b;
4457 * Note that we can't simply subtract req2->sector from req1->sector
4458 * here as that could overflow the return value.
4460 if (req1->sector > req2->sector) {
4461 return 1;
4462 } else if (req1->sector < req2->sector) {
4463 return -1;
4464 } else {
4465 return 0;
4470 * Takes a bunch of requests and tries to merge them. Returns the number of
4471 * requests that remain after merging.
4473 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4474 int num_reqs, MultiwriteCB *mcb)
4476 int i, outidx;
4478 // Sort requests by start sector
4479 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4481 // Check if adjacent requests touch the same clusters. If so, combine them,
4482 // filling up gaps with zero sectors.
4483 outidx = 0;
4484 for (i = 1; i < num_reqs; i++) {
4485 int merge = 0;
4486 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4488 // Handle exactly sequential writes and overlapping writes.
4489 if (reqs[i].sector <= oldreq_last) {
4490 merge = 1;
4493 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4494 merge = 0;
4497 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4498 reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4499 merge = 0;
4502 if (merge) {
4503 size_t size;
4504 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4505 qemu_iovec_init(qiov,
4506 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4508 // Add the first request to the merged one. If the requests are
4509 // overlapping, drop the last sectors of the first request.
4510 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4511 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4513 // We should need to add any zeros between the two requests
4514 assert (reqs[i].sector <= oldreq_last);
4516 // Add the second request
4517 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4519 // Add tail of first request, if necessary
4520 if (qiov->size < reqs[outidx].qiov->size) {
4521 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4522 reqs[outidx].qiov->size - qiov->size);
4525 reqs[outidx].nb_sectors = qiov->size >> 9;
4526 reqs[outidx].qiov = qiov;
4528 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4529 } else {
4530 outidx++;
4531 reqs[outidx].sector = reqs[i].sector;
4532 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4533 reqs[outidx].qiov = reqs[i].qiov;
4537 return outidx + 1;
4541 * Submit multiple AIO write requests at once.
4543 * On success, the function returns 0 and all requests in the reqs array have
4544 * been submitted. In error case this function returns -1, and any of the
4545 * requests may or may not be submitted yet. In particular, this means that the
4546 * callback will be called for some of the requests, for others it won't. The
4547 * caller must check the error field of the BlockRequest to wait for the right
4548 * callbacks (if error != 0, no callback will be called).
4550 * The implementation may modify the contents of the reqs array, e.g. to merge
4551 * requests. However, the fields opaque and error are left unmodified as they
4552 * are used to signal failure for a single request to the caller.
4554 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4556 MultiwriteCB *mcb;
4557 int i;
4559 /* don't submit writes if we don't have a medium */
4560 if (bs->drv == NULL) {
4561 for (i = 0; i < num_reqs; i++) {
4562 reqs[i].error = -ENOMEDIUM;
4564 return -1;
4567 if (num_reqs == 0) {
4568 return 0;
4571 // Create MultiwriteCB structure
4572 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4573 mcb->num_requests = 0;
4574 mcb->num_callbacks = num_reqs;
4576 for (i = 0; i < num_reqs; i++) {
4577 mcb->callbacks[i].cb = reqs[i].cb;
4578 mcb->callbacks[i].opaque = reqs[i].opaque;
4581 // Check for mergable requests
4582 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4584 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4586 /* Run the aio requests. */
4587 mcb->num_requests = num_reqs;
4588 for (i = 0; i < num_reqs; i++) {
4589 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4590 reqs[i].nb_sectors, reqs[i].flags,
4591 multiwrite_cb, mcb,
4592 true);
4595 return 0;
4598 void bdrv_aio_cancel(BlockAIOCB *acb)
4600 qemu_aio_ref(acb);
4601 bdrv_aio_cancel_async(acb);
4602 while (acb->refcnt > 1) {
4603 if (acb->aiocb_info->get_aio_context) {
4604 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4605 } else if (acb->bs) {
4606 aio_poll(bdrv_get_aio_context(acb->bs), true);
4607 } else {
4608 abort();
4611 qemu_aio_unref(acb);
4614 /* Async version of aio cancel. The caller is not blocked if the acb implements
4615 * cancel_async, otherwise we do nothing and let the request normally complete.
4616 * In either case the completion callback must be called. */
4617 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4619 if (acb->aiocb_info->cancel_async) {
4620 acb->aiocb_info->cancel_async(acb);
4624 /**************************************************************/
4625 /* async block device emulation */
4627 typedef struct BlockAIOCBSync {
4628 BlockAIOCB common;
4629 QEMUBH *bh;
4630 int ret;
4631 /* vector translation state */
4632 QEMUIOVector *qiov;
4633 uint8_t *bounce;
4634 int is_write;
4635 } BlockAIOCBSync;
4637 static const AIOCBInfo bdrv_em_aiocb_info = {
4638 .aiocb_size = sizeof(BlockAIOCBSync),
4641 static void bdrv_aio_bh_cb(void *opaque)
4643 BlockAIOCBSync *acb = opaque;
4645 if (!acb->is_write && acb->ret >= 0) {
4646 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4648 qemu_vfree(acb->bounce);
4649 acb->common.cb(acb->common.opaque, acb->ret);
4650 qemu_bh_delete(acb->bh);
4651 acb->bh = NULL;
4652 qemu_aio_unref(acb);
4655 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4656 int64_t sector_num,
4657 QEMUIOVector *qiov,
4658 int nb_sectors,
4659 BlockCompletionFunc *cb,
4660 void *opaque,
4661 int is_write)
4664 BlockAIOCBSync *acb;
4666 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4667 acb->is_write = is_write;
4668 acb->qiov = qiov;
4669 acb->bounce = qemu_try_blockalign(bs, qiov->size);
4670 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4672 if (acb->bounce == NULL) {
4673 acb->ret = -ENOMEM;
4674 } else if (is_write) {
4675 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4676 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4677 } else {
4678 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4681 qemu_bh_schedule(acb->bh);
4683 return &acb->common;
4686 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4687 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4688 BlockCompletionFunc *cb, void *opaque)
4690 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4693 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4694 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4695 BlockCompletionFunc *cb, void *opaque)
4697 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4701 typedef struct BlockAIOCBCoroutine {
4702 BlockAIOCB common;
4703 BlockRequest req;
4704 bool is_write;
4705 bool *done;
4706 QEMUBH* bh;
4707 } BlockAIOCBCoroutine;
4709 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4710 .aiocb_size = sizeof(BlockAIOCBCoroutine),
4713 static void bdrv_co_em_bh(void *opaque)
4715 BlockAIOCBCoroutine *acb = opaque;
4717 acb->common.cb(acb->common.opaque, acb->req.error);
4719 qemu_bh_delete(acb->bh);
4720 qemu_aio_unref(acb);
4723 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4724 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4726 BlockAIOCBCoroutine *acb = opaque;
4727 BlockDriverState *bs = acb->common.bs;
4729 if (!acb->is_write) {
4730 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4731 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4732 } else {
4733 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4734 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4737 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4738 qemu_bh_schedule(acb->bh);
4741 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4742 int64_t sector_num,
4743 QEMUIOVector *qiov,
4744 int nb_sectors,
4745 BdrvRequestFlags flags,
4746 BlockCompletionFunc *cb,
4747 void *opaque,
4748 bool is_write)
4750 Coroutine *co;
4751 BlockAIOCBCoroutine *acb;
4753 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4754 acb->req.sector = sector_num;
4755 acb->req.nb_sectors = nb_sectors;
4756 acb->req.qiov = qiov;
4757 acb->req.flags = flags;
4758 acb->is_write = is_write;
4760 co = qemu_coroutine_create(bdrv_co_do_rw);
4761 qemu_coroutine_enter(co, acb);
4763 return &acb->common;
4766 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4768 BlockAIOCBCoroutine *acb = opaque;
4769 BlockDriverState *bs = acb->common.bs;
4771 acb->req.error = bdrv_co_flush(bs);
4772 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4773 qemu_bh_schedule(acb->bh);
4776 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4777 BlockCompletionFunc *cb, void *opaque)
4779 trace_bdrv_aio_flush(bs, opaque);
4781 Coroutine *co;
4782 BlockAIOCBCoroutine *acb;
4784 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4786 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4787 qemu_coroutine_enter(co, acb);
4789 return &acb->common;
4792 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4794 BlockAIOCBCoroutine *acb = opaque;
4795 BlockDriverState *bs = acb->common.bs;
4797 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4798 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4799 qemu_bh_schedule(acb->bh);
4802 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4803 int64_t sector_num, int nb_sectors,
4804 BlockCompletionFunc *cb, void *opaque)
4806 Coroutine *co;
4807 BlockAIOCBCoroutine *acb;
4809 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4811 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4812 acb->req.sector = sector_num;
4813 acb->req.nb_sectors = nb_sectors;
4814 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4815 qemu_coroutine_enter(co, acb);
4817 return &acb->common;
4820 void bdrv_init(void)
4822 module_call_init(MODULE_INIT_BLOCK);
4825 void bdrv_init_with_whitelist(void)
4827 use_bdrv_whitelist = 1;
4828 bdrv_init();
4831 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4832 BlockCompletionFunc *cb, void *opaque)
4834 BlockAIOCB *acb;
4836 acb = g_slice_alloc(aiocb_info->aiocb_size);
4837 acb->aiocb_info = aiocb_info;
4838 acb->bs = bs;
4839 acb->cb = cb;
4840 acb->opaque = opaque;
4841 acb->refcnt = 1;
4842 return acb;
4845 void qemu_aio_ref(void *p)
4847 BlockAIOCB *acb = p;
4848 acb->refcnt++;
4851 void qemu_aio_unref(void *p)
4853 BlockAIOCB *acb = p;
4854 assert(acb->refcnt > 0);
4855 if (--acb->refcnt == 0) {
4856 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4860 /**************************************************************/
4861 /* Coroutine block device emulation */
4863 typedef struct CoroutineIOCompletion {
4864 Coroutine *coroutine;
4865 int ret;
4866 } CoroutineIOCompletion;
4868 static void bdrv_co_io_em_complete(void *opaque, int ret)
4870 CoroutineIOCompletion *co = opaque;
4872 co->ret = ret;
4873 qemu_coroutine_enter(co->coroutine, NULL);
4876 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4877 int nb_sectors, QEMUIOVector *iov,
4878 bool is_write)
4880 CoroutineIOCompletion co = {
4881 .coroutine = qemu_coroutine_self(),
4883 BlockAIOCB *acb;
4885 if (is_write) {
4886 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4887 bdrv_co_io_em_complete, &co);
4888 } else {
4889 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4890 bdrv_co_io_em_complete, &co);
4893 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4894 if (!acb) {
4895 return -EIO;
4897 qemu_coroutine_yield();
4899 return co.ret;
4902 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4903 int64_t sector_num, int nb_sectors,
4904 QEMUIOVector *iov)
4906 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4909 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4910 int64_t sector_num, int nb_sectors,
4911 QEMUIOVector *iov)
4913 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4916 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4918 RwCo *rwco = opaque;
4920 rwco->ret = bdrv_co_flush(rwco->bs);
4923 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4925 int ret;
4927 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4928 return 0;
4931 /* Write back cached data to the OS even with cache=unsafe */
4932 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4933 if (bs->drv->bdrv_co_flush_to_os) {
4934 ret = bs->drv->bdrv_co_flush_to_os(bs);
4935 if (ret < 0) {
4936 return ret;
4940 /* But don't actually force it to the disk with cache=unsafe */
4941 if (bs->open_flags & BDRV_O_NO_FLUSH) {
4942 goto flush_parent;
4945 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4946 if (bs->drv->bdrv_co_flush_to_disk) {
4947 ret = bs->drv->bdrv_co_flush_to_disk(bs);
4948 } else if (bs->drv->bdrv_aio_flush) {
4949 BlockAIOCB *acb;
4950 CoroutineIOCompletion co = {
4951 .coroutine = qemu_coroutine_self(),
4954 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4955 if (acb == NULL) {
4956 ret = -EIO;
4957 } else {
4958 qemu_coroutine_yield();
4959 ret = co.ret;
4961 } else {
4963 * Some block drivers always operate in either writethrough or unsafe
4964 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
4965 * know how the server works (because the behaviour is hardcoded or
4966 * depends on server-side configuration), so we can't ensure that
4967 * everything is safe on disk. Returning an error doesn't work because
4968 * that would break guests even if the server operates in writethrough
4969 * mode.
4971 * Let's hope the user knows what he's doing.
4973 ret = 0;
4975 if (ret < 0) {
4976 return ret;
4979 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4980 * in the case of cache=unsafe, so there are no useless flushes.
4982 flush_parent:
4983 return bdrv_co_flush(bs->file);
4986 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4988 Error *local_err = NULL;
4989 int ret;
4991 if (!bs->drv) {
4992 return;
4995 if (!(bs->open_flags & BDRV_O_INCOMING)) {
4996 return;
4998 bs->open_flags &= ~BDRV_O_INCOMING;
5000 if (bs->drv->bdrv_invalidate_cache) {
5001 bs->drv->bdrv_invalidate_cache(bs, &local_err);
5002 } else if (bs->file) {
5003 bdrv_invalidate_cache(bs->file, &local_err);
5005 if (local_err) {
5006 error_propagate(errp, local_err);
5007 return;
5010 ret = refresh_total_sectors(bs, bs->total_sectors);
5011 if (ret < 0) {
5012 error_setg_errno(errp, -ret, "Could not refresh total sector count");
5013 return;
5017 void bdrv_invalidate_cache_all(Error **errp)
5019 BlockDriverState *bs;
5020 Error *local_err = NULL;
5022 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5023 AioContext *aio_context = bdrv_get_aio_context(bs);
5025 aio_context_acquire(aio_context);
5026 bdrv_invalidate_cache(bs, &local_err);
5027 aio_context_release(aio_context);
5028 if (local_err) {
5029 error_propagate(errp, local_err);
5030 return;
5035 int bdrv_flush(BlockDriverState *bs)
5037 Coroutine *co;
5038 RwCo rwco = {
5039 .bs = bs,
5040 .ret = NOT_DONE,
5043 if (qemu_in_coroutine()) {
5044 /* Fast-path if already in coroutine context */
5045 bdrv_flush_co_entry(&rwco);
5046 } else {
5047 AioContext *aio_context = bdrv_get_aio_context(bs);
5049 co = qemu_coroutine_create(bdrv_flush_co_entry);
5050 qemu_coroutine_enter(co, &rwco);
5051 while (rwco.ret == NOT_DONE) {
5052 aio_poll(aio_context, true);
5056 return rwco.ret;
5059 typedef struct DiscardCo {
5060 BlockDriverState *bs;
5061 int64_t sector_num;
5062 int nb_sectors;
5063 int ret;
5064 } DiscardCo;
5065 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5067 DiscardCo *rwco = opaque;
5069 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768
5077 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5078 int nb_sectors)
5080 int max_discard;
5082 if (!bs->drv) {
5083 return -ENOMEDIUM;
5084 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5085 return -EIO;
5086 } else if (bs->read_only) {
5087 return -EROFS;
5090 bdrv_reset_dirty(bs, sector_num, nb_sectors);
5092 /* Do nothing if disabled. */
5093 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5094 return 0;
5097 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5098 return 0;
5101 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5102 while (nb_sectors > 0) {
5103 int ret;
5104 int num = nb_sectors;
5106 /* align request */
5107 if (bs->bl.discard_alignment &&
5108 num >= bs->bl.discard_alignment &&
5109 sector_num % bs->bl.discard_alignment) {
5110 if (num > bs->bl.discard_alignment) {
5111 num = bs->bl.discard_alignment;
5113 num -= sector_num % bs->bl.discard_alignment;
5116 /* limit request size */
5117 if (num > max_discard) {
5118 num = max_discard;
5121 if (bs->drv->bdrv_co_discard) {
5122 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5123 } else {
5124 BlockAIOCB *acb;
5125 CoroutineIOCompletion co = {
5126 .coroutine = qemu_coroutine_self(),
5129 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5130 bdrv_co_io_em_complete, &co);
5131 if (acb == NULL) {
5132 return -EIO;
5133 } else {
5134 qemu_coroutine_yield();
5135 ret = co.ret;
5138 if (ret && ret != -ENOTSUP) {
5139 return ret;
5142 sector_num += num;
5143 nb_sectors -= num;
5145 return 0;
5148 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5150 Coroutine *co;
5151 DiscardCo rwco = {
5152 .bs = bs,
5153 .sector_num = sector_num,
5154 .nb_sectors = nb_sectors,
5155 .ret = NOT_DONE,
5158 if (qemu_in_coroutine()) {
5159 /* Fast-path if already in coroutine context */
5160 bdrv_discard_co_entry(&rwco);
5161 } else {
5162 AioContext *aio_context = bdrv_get_aio_context(bs);
5164 co = qemu_coroutine_create(bdrv_discard_co_entry);
5165 qemu_coroutine_enter(co, &rwco);
5166 while (rwco.ret == NOT_DONE) {
5167 aio_poll(aio_context, true);
5171 return rwco.ret;
5174 /**************************************************************/
5175 /* removable device support */
5178 * Return TRUE if the media is present
5180 int bdrv_is_inserted(BlockDriverState *bs)
5182 BlockDriver *drv = bs->drv;
5184 if (!drv)
5185 return 0;
5186 if (!drv->bdrv_is_inserted)
5187 return 1;
5188 return drv->bdrv_is_inserted(bs);
5192 * Return whether the media changed since the last call to this
5193 * function, or -ENOTSUP if we don't know. Most drivers don't know.
5195 int bdrv_media_changed(BlockDriverState *bs)
5197 BlockDriver *drv = bs->drv;
5199 if (drv && drv->bdrv_media_changed) {
5200 return drv->bdrv_media_changed(bs);
5202 return -ENOTSUP;
5206 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5208 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5210 BlockDriver *drv = bs->drv;
5211 const char *device_name;
5213 if (drv && drv->bdrv_eject) {
5214 drv->bdrv_eject(bs, eject_flag);
5217 device_name = bdrv_get_device_name(bs);
5218 if (device_name[0] != '\0') {
5219 qapi_event_send_device_tray_moved(device_name,
5220 eject_flag, &error_abort);
5225 * Lock or unlock the media (if it is locked, the user won't be able
5226 * to eject it manually).
5228 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5230 BlockDriver *drv = bs->drv;
5232 trace_bdrv_lock_medium(bs, locked);
5234 if (drv && drv->bdrv_lock_medium) {
5235 drv->bdrv_lock_medium(bs, locked);
5239 /* needed for generic scsi interface */
5241 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5243 BlockDriver *drv = bs->drv;
5245 if (drv && drv->bdrv_ioctl)
5246 return drv->bdrv_ioctl(bs, req, buf);
5247 return -ENOTSUP;
5250 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5251 unsigned long int req, void *buf,
5252 BlockCompletionFunc *cb, void *opaque)
5254 BlockDriver *drv = bs->drv;
5256 if (drv && drv->bdrv_aio_ioctl)
5257 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5258 return NULL;
5261 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5263 bs->guest_block_size = align;
5266 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5268 return qemu_memalign(bdrv_opt_mem_align(bs), size);
5271 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5273 return memset(qemu_blockalign(bs, size), 0, size);
5276 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5278 size_t align = bdrv_opt_mem_align(bs);
5280 /* Ensure that NULL is never returned on success */
5281 assert(align > 0);
5282 if (size == 0) {
5283 size = align;
5286 return qemu_try_memalign(align, size);
5289 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5291 void *mem = qemu_try_blockalign(bs, size);
5293 if (mem) {
5294 memset(mem, 0, size);
5297 return mem;
5301 * Check if all memory in this vector is sector aligned.
5303 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5305 int i;
5306 size_t alignment = bdrv_opt_mem_align(bs);
5308 for (i = 0; i < qiov->niov; i++) {
5309 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5310 return false;
5312 if (qiov->iov[i].iov_len % alignment) {
5313 return false;
5317 return true;
5320 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5321 Error **errp)
5323 int64_t bitmap_size;
5324 BdrvDirtyBitmap *bitmap;
5326 assert((granularity & (granularity - 1)) == 0);
5328 granularity >>= BDRV_SECTOR_BITS;
5329 assert(granularity);
5330 bitmap_size = bdrv_nb_sectors(bs);
5331 if (bitmap_size < 0) {
5332 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5333 errno = -bitmap_size;
5334 return NULL;
5336 bitmap = g_new0(BdrvDirtyBitmap, 1);
5337 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5338 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5339 return bitmap;
5342 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5344 BdrvDirtyBitmap *bm, *next;
5345 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5346 if (bm == bitmap) {
5347 QLIST_REMOVE(bitmap, list);
5348 hbitmap_free(bitmap->bitmap);
5349 g_free(bitmap);
5350 return;
5355 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5357 BdrvDirtyBitmap *bm;
5358 BlockDirtyInfoList *list = NULL;
5359 BlockDirtyInfoList **plist = &list;
5361 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5362 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5363 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5364 info->count = bdrv_get_dirty_count(bs, bm);
5365 info->granularity =
5366 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5367 entry->value = info;
5368 *plist = entry;
5369 plist = &entry->next;
5372 return list;
5375 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5377 if (bitmap) {
5378 return hbitmap_get(bitmap->bitmap, sector);
5379 } else {
5380 return 0;
5384 void bdrv_dirty_iter_init(BlockDriverState *bs,
5385 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5387 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5390 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5391 int nr_sectors)
5393 BdrvDirtyBitmap *bitmap;
5394 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5395 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5399 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5401 BdrvDirtyBitmap *bitmap;
5402 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5403 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5407 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5409 return hbitmap_count(bitmap->bitmap);
5412 /* Get a reference to bs */
5413 void bdrv_ref(BlockDriverState *bs)
5415 bs->refcnt++;
5418 /* Release a previously grabbed reference to bs.
5419 * If after releasing, reference count is zero, the BlockDriverState is
5420 * deleted. */
5421 void bdrv_unref(BlockDriverState *bs)
5423 if (!bs) {
5424 return;
5426 assert(bs->refcnt > 0);
5427 if (--bs->refcnt == 0) {
5428 bdrv_delete(bs);
5432 struct BdrvOpBlocker {
5433 Error *reason;
5434 QLIST_ENTRY(BdrvOpBlocker) list;
5437 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5439 BdrvOpBlocker *blocker;
5440 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5441 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5442 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5443 if (errp) {
5444 error_setg(errp, "Device '%s' is busy: %s",
5445 bdrv_get_device_name(bs),
5446 error_get_pretty(blocker->reason));
5448 return true;
5450 return false;
5453 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5455 BdrvOpBlocker *blocker;
5456 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5458 blocker = g_new0(BdrvOpBlocker, 1);
5459 blocker->reason = reason;
5460 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5463 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5465 BdrvOpBlocker *blocker, *next;
5466 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5467 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5468 if (blocker->reason == reason) {
5469 QLIST_REMOVE(blocker, list);
5470 g_free(blocker);
5475 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5477 int i;
5478 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5479 bdrv_op_block(bs, i, reason);
5483 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5485 int i;
5486 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5487 bdrv_op_unblock(bs, i, reason);
5491 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5493 int i;
5495 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5496 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5497 return false;
5500 return true;
5503 void bdrv_iostatus_enable(BlockDriverState *bs)
5505 bs->iostatus_enabled = true;
5506 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5509 /* The I/O status is only enabled if the drive explicitly
5510 * enables it _and_ the VM is configured to stop on errors */
5511 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5513 return (bs->iostatus_enabled &&
5514 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5515 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5516 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5519 void bdrv_iostatus_disable(BlockDriverState *bs)
5521 bs->iostatus_enabled = false;
5524 void bdrv_iostatus_reset(BlockDriverState *bs)
5526 if (bdrv_iostatus_is_enabled(bs)) {
5527 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5528 if (bs->job) {
5529 block_job_iostatus_reset(bs->job);
5534 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5536 assert(bdrv_iostatus_is_enabled(bs));
5537 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5538 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5539 BLOCK_DEVICE_IO_STATUS_FAILED;
5543 void bdrv_img_create(const char *filename, const char *fmt,
5544 const char *base_filename, const char *base_fmt,
5545 char *options, uint64_t img_size, int flags,
5546 Error **errp, bool quiet)
5548 QemuOptsList *create_opts = NULL;
5549 QemuOpts *opts = NULL;
5550 const char *backing_fmt, *backing_file;
5551 int64_t size;
5552 BlockDriver *drv, *proto_drv;
5553 BlockDriver *backing_drv = NULL;
5554 Error *local_err = NULL;
5555 int ret = 0;
5557 /* Find driver and parse its options */
5558 drv = bdrv_find_format(fmt);
5559 if (!drv) {
5560 error_setg(errp, "Unknown file format '%s'", fmt);
5561 return;
5564 proto_drv = bdrv_find_protocol(filename, true);
5565 if (!proto_drv) {
5566 error_setg(errp, "Unknown protocol '%s'", filename);
5567 return;
5570 if (!drv->create_opts) {
5571 error_setg(errp, "Format driver '%s' does not support image creation",
5572 drv->format_name);
5573 return;
5576 if (!proto_drv->create_opts) {
5577 error_setg(errp, "Protocol driver '%s' does not support image creation",
5578 proto_drv->format_name);
5579 return;
5582 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5583 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5585 /* Create parameter list with default values */
5586 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5587 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5589 /* Parse -o options */
5590 if (options) {
5591 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5592 error_setg(errp, "Invalid options for file format '%s'", fmt);
5593 goto out;
5597 if (base_filename) {
5598 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5599 error_setg(errp, "Backing file not supported for file format '%s'",
5600 fmt);
5601 goto out;
5605 if (base_fmt) {
5606 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5607 error_setg(errp, "Backing file format not supported for file "
5608 "format '%s'", fmt);
5609 goto out;
5613 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5614 if (backing_file) {
5615 if (!strcmp(filename, backing_file)) {
5616 error_setg(errp, "Error: Trying to create an image with the "
5617 "same filename as the backing file");
5618 goto out;
5622 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5623 if (backing_fmt) {
5624 backing_drv = bdrv_find_format(backing_fmt);
5625 if (!backing_drv) {
5626 error_setg(errp, "Unknown backing file format '%s'",
5627 backing_fmt);
5628 goto out;
5632 // The size for the image must always be specified, with one exception:
5633 // If we are using a backing file, we can obtain the size from there
5634 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5635 if (size == -1) {
5636 if (backing_file) {
5637 BlockDriverState *bs;
5638 int64_t size;
5639 int back_flags;
5641 /* backing files always opened read-only */
5642 back_flags =
5643 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5645 bs = NULL;
5646 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5647 backing_drv, &local_err);
5648 if (ret < 0) {
5649 goto out;
5651 size = bdrv_getlength(bs);
5652 if (size < 0) {
5653 error_setg_errno(errp, -size, "Could not get size of '%s'",
5654 backing_file);
5655 bdrv_unref(bs);
5656 goto out;
5659 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
5661 bdrv_unref(bs);
5662 } else {
5663 error_setg(errp, "Image creation needs a size parameter");
5664 goto out;
5668 if (!quiet) {
5669 printf("Formatting '%s', fmt=%s", filename, fmt);
5670 qemu_opts_print(opts, " ");
5671 puts("");
5674 ret = bdrv_create(drv, filename, opts, &local_err);
5676 if (ret == -EFBIG) {
5677 /* This is generally a better message than whatever the driver would
5678 * deliver (especially because of the cluster_size_hint), since that
5679 * is most probably not much different from "image too large". */
5680 const char *cluster_size_hint = "";
5681 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5682 cluster_size_hint = " (try using a larger cluster size)";
5684 error_setg(errp, "The image size is too large for file format '%s'"
5685 "%s", fmt, cluster_size_hint);
5686 error_free(local_err);
5687 local_err = NULL;
5690 out:
5691 qemu_opts_del(opts);
5692 qemu_opts_free(create_opts);
5693 if (local_err) {
5694 error_propagate(errp, local_err);
5698 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5700 return bs->aio_context;
5703 void bdrv_detach_aio_context(BlockDriverState *bs)
5705 BdrvAioNotifier *baf;
5707 if (!bs->drv) {
5708 return;
5711 QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5712 baf->detach_aio_context(baf->opaque);
5715 if (bs->io_limits_enabled) {
5716 throttle_detach_aio_context(&bs->throttle_state);
5718 if (bs->drv->bdrv_detach_aio_context) {
5719 bs->drv->bdrv_detach_aio_context(bs);
5721 if (bs->file) {
5722 bdrv_detach_aio_context(bs->file);
5724 if (bs->backing_hd) {
5725 bdrv_detach_aio_context(bs->backing_hd);
5728 bs->aio_context = NULL;
5731 void bdrv_attach_aio_context(BlockDriverState *bs,
5732 AioContext *new_context)
5734 BdrvAioNotifier *ban;
5736 if (!bs->drv) {
5737 return;
5740 bs->aio_context = new_context;
5742 if (bs->backing_hd) {
5743 bdrv_attach_aio_context(bs->backing_hd, new_context);
5745 if (bs->file) {
5746 bdrv_attach_aio_context(bs->file, new_context);
5748 if (bs->drv->bdrv_attach_aio_context) {
5749 bs->drv->bdrv_attach_aio_context(bs, new_context);
5751 if (bs->io_limits_enabled) {
5752 throttle_attach_aio_context(&bs->throttle_state, new_context);
5755 QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5756 ban->attached_aio_context(new_context, ban->opaque);
5760 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5762 bdrv_drain_all(); /* ensure there are no in-flight requests */
5764 bdrv_detach_aio_context(bs);
5766 /* This function executes in the old AioContext so acquire the new one in
5767 * case it runs in a different thread.
5769 aio_context_acquire(new_context);
5770 bdrv_attach_aio_context(bs, new_context);
5771 aio_context_release(new_context);
5774 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5775 void (*attached_aio_context)(AioContext *new_context, void *opaque),
5776 void (*detach_aio_context)(void *opaque), void *opaque)
5778 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5779 *ban = (BdrvAioNotifier){
5780 .attached_aio_context = attached_aio_context,
5781 .detach_aio_context = detach_aio_context,
5782 .opaque = opaque
5785 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5788 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5789 void (*attached_aio_context)(AioContext *,
5790 void *),
5791 void (*detach_aio_context)(void *),
5792 void *opaque)
5794 BdrvAioNotifier *ban, *ban_next;
5796 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5797 if (ban->attached_aio_context == attached_aio_context &&
5798 ban->detach_aio_context == detach_aio_context &&
5799 ban->opaque == opaque)
5801 QLIST_REMOVE(ban, list);
5802 g_free(ban);
5804 return;
5808 abort();
5811 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5812 NotifierWithReturn *notifier)
5814 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5817 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
5818 BlockDriverAmendStatusCB *status_cb)
5820 if (!bs->drv->bdrv_amend_options) {
5821 return -ENOTSUP;
5823 return bs->drv->bdrv_amend_options(bs, opts, status_cb);
5826 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5827 * of block filter and by bdrv_is_first_non_filter.
5828 * It is used to test if the given bs is the candidate or recurse more in the
5829 * node graph.
5831 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5832 BlockDriverState *candidate)
5834 /* return false if basic checks fails */
5835 if (!bs || !bs->drv) {
5836 return false;
5839 /* the code reached a non block filter driver -> check if the bs is
5840 * the same as the candidate. It's the recursion termination condition.
5842 if (!bs->drv->is_filter) {
5843 return bs == candidate;
5845 /* Down this path the driver is a block filter driver */
5847 /* If the block filter recursion method is defined use it to recurse down
5848 * the node graph.
5850 if (bs->drv->bdrv_recurse_is_first_non_filter) {
5851 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5854 /* the driver is a block filter but don't allow to recurse -> return false
5856 return false;
5859 /* This function checks if the candidate is the first non filter bs down it's
5860 * bs chain. Since we don't have pointers to parents it explore all bs chains
5861 * from the top. Some filters can choose not to pass down the recursion.
5863 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5865 BlockDriverState *bs;
5867 /* walk down the bs forest recursively */
5868 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5869 bool perm;
5871 /* try to recurse in this top level bs */
5872 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5874 /* candidate is the first non filter */
5875 if (perm) {
5876 return true;
5880 return false;
5883 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5885 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5886 AioContext *aio_context;
5888 if (!to_replace_bs) {
5889 error_setg(errp, "Node name '%s' not found", node_name);
5890 return NULL;
5893 aio_context = bdrv_get_aio_context(to_replace_bs);
5894 aio_context_acquire(aio_context);
5896 if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5897 to_replace_bs = NULL;
5898 goto out;
5901 /* We don't want arbitrary node of the BDS chain to be replaced only the top
5902 * most non filter in order to prevent data corruption.
5903 * Another benefit is that this tests exclude backing files which are
5904 * blocked by the backing blockers.
5906 if (!bdrv_is_first_non_filter(to_replace_bs)) {
5907 error_setg(errp, "Only top most non filter can be replaced");
5908 to_replace_bs = NULL;
5909 goto out;
5912 out:
5913 aio_context_release(aio_context);
5914 return to_replace_bs;
5917 void bdrv_io_plug(BlockDriverState *bs)
5919 BlockDriver *drv = bs->drv;
5920 if (drv && drv->bdrv_io_plug) {
5921 drv->bdrv_io_plug(bs);
5922 } else if (bs->file) {
5923 bdrv_io_plug(bs->file);
5927 void bdrv_io_unplug(BlockDriverState *bs)
5929 BlockDriver *drv = bs->drv;
5930 if (drv && drv->bdrv_io_unplug) {
5931 drv->bdrv_io_unplug(bs);
5932 } else if (bs->file) {
5933 bdrv_io_unplug(bs->file);
5937 void bdrv_flush_io_queue(BlockDriverState *bs)
5939 BlockDriver *drv = bs->drv;
5940 if (drv && drv->bdrv_flush_io_queue) {
5941 drv->bdrv_flush_io_queue(bs);
5942 } else if (bs->file) {
5943 bdrv_flush_io_queue(bs->file);
5947 static bool append_open_options(QDict *d, BlockDriverState *bs)
5949 const QDictEntry *entry;
5950 bool found_any = false;
5952 for (entry = qdict_first(bs->options); entry;
5953 entry = qdict_next(bs->options, entry))
5955 /* Only take options for this level and exclude all non-driver-specific
5956 * options */
5957 if (!strchr(qdict_entry_key(entry), '.') &&
5958 strcmp(qdict_entry_key(entry), "node-name"))
5960 qobject_incref(qdict_entry_value(entry));
5961 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5962 found_any = true;
5966 return found_any;
5969 /* Updates the following BDS fields:
5970 * - exact_filename: A filename which may be used for opening a block device
5971 * which (mostly) equals the given BDS (even without any
5972 * other options; so reading and writing must return the same
5973 * results, but caching etc. may be different)
5974 * - full_open_options: Options which, when given when opening a block device
5975 * (without a filename), result in a BDS (mostly)
5976 * equalling the given one
5977 * - filename: If exact_filename is set, it is copied here. Otherwise,
5978 * full_open_options is converted to a JSON object, prefixed with
5979 * "json:" (for use through the JSON pseudo protocol) and put here.
5981 void bdrv_refresh_filename(BlockDriverState *bs)
5983 BlockDriver *drv = bs->drv;
5984 QDict *opts;
5986 if (!drv) {
5987 return;
5990 /* This BDS's file name will most probably depend on its file's name, so
5991 * refresh that first */
5992 if (bs->file) {
5993 bdrv_refresh_filename(bs->file);
5996 if (drv->bdrv_refresh_filename) {
5997 /* Obsolete information is of no use here, so drop the old file name
5998 * information before refreshing it */
5999 bs->exact_filename[0] = '\0';
6000 if (bs->full_open_options) {
6001 QDECREF(bs->full_open_options);
6002 bs->full_open_options = NULL;
6005 drv->bdrv_refresh_filename(bs);
6006 } else if (bs->file) {
6007 /* Try to reconstruct valid information from the underlying file */
6008 bool has_open_options;
6010 bs->exact_filename[0] = '\0';
6011 if (bs->full_open_options) {
6012 QDECREF(bs->full_open_options);
6013 bs->full_open_options = NULL;
6016 opts = qdict_new();
6017 has_open_options = append_open_options(opts, bs);
6019 /* If no specific options have been given for this BDS, the filename of
6020 * the underlying file should suffice for this one as well */
6021 if (bs->file->exact_filename[0] && !has_open_options) {
6022 strcpy(bs->exact_filename, bs->file->exact_filename);
6024 /* Reconstructing the full options QDict is simple for most format block
6025 * drivers, as long as the full options are known for the underlying
6026 * file BDS. The full options QDict of that file BDS should somehow
6027 * contain a representation of the filename, therefore the following
6028 * suffices without querying the (exact_)filename of this BDS. */
6029 if (bs->file->full_open_options) {
6030 qdict_put_obj(opts, "driver",
6031 QOBJECT(qstring_from_str(drv->format_name)));
6032 QINCREF(bs->file->full_open_options);
6033 qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6035 bs->full_open_options = opts;
6036 } else {
6037 QDECREF(opts);
6039 } else if (!bs->full_open_options && qdict_size(bs->options)) {
6040 /* There is no underlying file BDS (at least referenced by BDS.file),
6041 * so the full options QDict should be equal to the options given
6042 * specifically for this block device when it was opened (plus the
6043 * driver specification).
6044 * Because those options don't change, there is no need to update
6045 * full_open_options when it's already set. */
6047 opts = qdict_new();
6048 append_open_options(opts, bs);
6049 qdict_put_obj(opts, "driver",
6050 QOBJECT(qstring_from_str(drv->format_name)));
6052 if (bs->exact_filename[0]) {
6053 /* This may not work for all block protocol drivers (some may
6054 * require this filename to be parsed), but we have to find some
6055 * default solution here, so just include it. If some block driver
6056 * does not support pure options without any filename at all or
6057 * needs some special format of the options QDict, it needs to
6058 * implement the driver-specific bdrv_refresh_filename() function.
6060 qdict_put_obj(opts, "filename",
6061 QOBJECT(qstring_from_str(bs->exact_filename)));
6064 bs->full_open_options = opts;
6067 if (bs->exact_filename[0]) {
6068 pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6069 } else if (bs->full_open_options) {
6070 QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6071 snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6072 qstring_get_str(json));
6073 QDECREF(json);
6077 /* This accessor function purpose is to allow the device models to access the
6078 * BlockAcctStats structure embedded inside a BlockDriverState without being
6079 * aware of the BlockDriverState structure layout.
6080 * It will go away when the BlockAcctStats structure will be moved inside
6081 * the device models.
6083 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6085 return &bs->stats;