[qemu.git] / block.c (blob at commit "vdi.c: replace QEMUOptionParameter with QemuOpts")
/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif
/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or if any request of this type is already throttled,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
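/*
 * Illustrative call site (a sketch; the real callers are
 * bdrv_co_do_preadv()/bdrv_co_do_pwritev() further down in this file):
 *
 *     if (bs->io_limits_enabled) {
 *         bdrv_io_limits_intercept(bs, bytes, false);   // read path
 *     }
 *     ... issue the actual I/O ...
 */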
size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it as relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
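/*
 * Illustrative behaviour of path_combine() (a sketch; paths are hypothetical
 * examples):
 *
 *     path_combine(dest, sz, "/img/base.qcow2", "backing.qcow2")
 *         -> "/img/backing.qcow2"
 *     path_combine(dest, sz, "http://host/dir/a.qcow2", "b.qcow2")
 *         -> "http://host/dir/b.qcow2"
 *     path_combine(dest, sz, "/img/base.qcow2", "/abs/backing.qcow2")
 *         -> "/abs/backing.qcow2"   (absolute filenames are copied verbatim)
 */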
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    if (bdrv->bdrv_create) {
        assert(!bdrv->bdrv_create2 && !bdrv->create_opts);
        assert(!bdrv->bdrv_amend_options2);
    } else if (bdrv->bdrv_create2) {
        assert(!bdrv->bdrv_create && !bdrv->create_options);
        assert(!bdrv->bdrv_amend_options);
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name, Error **errp)
{
    BlockDriverState *bs;
    int i;

    if (bdrv_find(device_name)) {
        error_setg(errp, "Device with id '%s' already exists",
                   device_name);
        return NULL;
    }
    if (bdrv_find_node(device_name)) {
        error_setg(errp, "Device with node-name '%s' already exists",
                   device_name);
        return NULL;
    }

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);
    assert(!(cco->options && cco->opts));

    if (cco->drv->bdrv_create2) {
        QemuOptsList *opts_list = NULL;
        if (cco->options) {
            opts_list = params_to_opts(cco->options);
            cco->opts = qemu_opts_create(opts_list, NULL, 0, &error_abort);
        }
        ret = cco->drv->bdrv_create2(cco->filename, cco->opts, &local_err);
        if (cco->options) {
            qemu_opts_del(cco->opts);
            qemu_opts_free(opts_list);
        }
    } else {
        if (cco->opts) {
            cco->options = opts_to_params(cco->opts);
        }
        ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
        if (cco->opts) {
            free_option_parameters(cco->options);
        }
    }
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QEMUOptionParameter *options,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create && !drv->bdrv_create2) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
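/*
 * Illustrative caller (a sketch, not from this file): during the
 * QEMUOptionParameter -> QemuOpts transition, callers pass exactly one of the
 * two option representations and NULL for the other, e.g.:
 *
 *     QemuOpts *opts = qemu_opts_create(drv->create_opts, NULL, 0,
 *                                       &error_abort);
 *     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 64 * 1024 * 1024);
 *     ret = bdrv_create(drv, "test.qcow2", NULL, opts, &local_err);
 *     qemu_opts_del(opts);
 */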
int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
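/*
 * Illustrative use (a sketch; the PATH_MAX-sized buffer mirrors the callers
 * below, e.g. bdrv_append_temp_snapshot()):
 *
 *     char *tmp = g_malloc0(PATH_MAX + 1);
 *     if (get_tmp_filename(tmp, PATH_MAX + 1) == 0) {
 *         ... create a temporary image at 'tmp', e.g. for snapshot=on ...
 *     }
 *     g_free(tmp);
 */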
/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}
/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
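/*
 * Summary of the mapping above (informational comment, derived from the
 * function body):
 *
 *     cache mode    | BDRV_O_CACHE_WB | BDRV_O_NOCACHE | BDRV_O_NO_FLUSH
 *     --------------+-----------------+----------------+----------------
 *     none / off    |       yes       |      yes       |       no
 *     directsync    |       no        |      yes       |       no
 *     writeback     |       yes       |      no        |       no
 *     unsafe        |       yes       |      no        |       yes
 *     writethrough  |       no        |      no        |       no
 */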
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is an indirect pointer to a QDict of options to pass to the block
 * drivers, or a pointer to NULL for an empty set of options. If this function
 * takes ownership of the QDict reference, it will set *options to NULL;
 * otherwise, it will contain unused/unrecognized options after this function
 * returns. Then, the caller is responsible for freeing it. If it intends to
 * reuse the QDict, QINCREF() should be called beforehand.
 */
static int bdrv_file_open(BlockDriverState *bs, const char *filename,
                          QDict **options, int flags, Error **errp)
{
    BlockDriver *drv;
    const char *drvname;
    bool parse_filename = false;
    Error *local_err = NULL;
    int ret;

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(*options, "filename");
    } else if (filename && !qdict_haskey(*options, "filename")) {
        qdict_put(*options, "filename", qstring_from_str(filename));
        parse_filename = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(*options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(*options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, parse_filename);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        } else {
            filename = qdict_get_str(*options, "filename");
        }
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
        *options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    bs->growable = 1;
    return 0;

fail:
    return ret;
}
void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bs->device_name);
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs);
}
/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    backing_hd = bdrv_new("", errp);

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}
/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
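/*
 * Illustrative example (a sketch with hypothetical option values): for
 * bdref_key "file", a flattened options QDict such as
 *
 *     { "file.driver": "file", "file.filename": "/tmp/disk.img" }
 *
 * is extracted into the sub-dictionary
 *
 *     { "driver": "file", "filename": "/tmp/disk.img" }
 *
 * and passed on to bdrv_open(); alternatively, { "file": "node0" } would be
 * treated as a reference to an existing block device named "node0".
 */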
void bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QemuOptsList *create_opts = NULL;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }
    total_size &= BDRV_SECTOR_MASK;

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");

    assert(!(bdrv_qcow2->create_options && bdrv_qcow2->create_opts));
    if (bdrv_qcow2->create_options) {
        create_opts = params_to_opts(bdrv_qcow2->create_options);
    } else {
        create_opts = bdrv_qcow2->create_opts;
    }
    opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(bdrv_qcow2, tmp_filename, NULL, opts, &local_err);
    qemu_opts_del(opts);
    if (bdrv_qcow2->create_options) {
        qemu_opts_free(create_opts);
    }
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new("", &error_abort);

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
}
static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}
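/*
 * Illustrative pseudo-filename (a sketch with hypothetical values):
 *
 *     json:{"driver": "qcow2",
 *           "file": {"driver": "file", "filename": "/tmp/disk.qcow2"}}
 *
 * parses into a flattened QDict equivalent to
 *
 *     { "driver": "qcow2", "file.driver": "file",
 *       "file.filename": "/tmp/disk.qcow2" }
 */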
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("", &error_abort);
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            ret = -EINVAL;
            goto fail;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(options, json_options, false);
        QDECREF(json_options);
        filename = NULL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    if (flags & BDRV_O_PROTOCOL) {
        assert(!drv);
        ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
                             &local_err);
        if (!ret) {
            drv = bs->drv;
            goto done;
        } else if (bs->drv) {
            goto close_and_fail;
        } else {
            goto fail;
        }
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }
    if (flags & BDRV_O_SNAPSHOT) {
        snapshot_flags = bdrv_temp_snapshot_flags(flags);
        flags = bdrv_backing_flags(flags);
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_inherited_flags(flags),
                          true, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto fail;
        }
    }

    if (!drv) {
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            goto close_and_fail;
        }
    }

done:
    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bs->device_name, entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or alternatively may be NULL, in which case
 * a new BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags. All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
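/*
 * Illustrative use of the reopen machinery (a sketch; bs, other_bs and the
 * flag values are hypothetical):
 *
 *     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, flags);
 *     queue = bdrv_reopen_queue(queue, other_bs, other_flags);
 *     if (bdrv_reopen_multiple(queue, &err) < 0) {
 *         // all prepared entries have been aborted; nothing was committed
 *     }
 */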
/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}
void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            BlockDriverState *backing_hd = bs->backing_hd;
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_close(bs);
        aio_context_release(aio_context);
    }
}
/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        busy = false;

        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            AioContext *aio_context = bdrv_get_aio_context(bs);
            bool bs_busy;

            aio_context_acquire(aio_context);
            bdrv_start_throttled_reqs(bs);
            bs_busy = bdrv_requests_pending(bs);
            bs_busy |= aio_poll(aio_context, bs_busy);
            aio_context_release(aio_context);

            busy |= bs_busy;
        }
    }
}
1964 /* make a BlockDriverState anonymous by removing from bdrv_state and
1965 * graph_bdrv_state list.
1966 Also, NULL terminate the device_name to prevent double remove */
1967 void bdrv_make_anon(BlockDriverState *bs)
1969 if (bs->device_name[0] != '\0') {
1970 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1972 bs->device_name[0] = '\0';
1973 if (bs->node_name[0] != '\0') {
1974 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1976 bs->node_name[0] = '\0';
1979 static void bdrv_rebind(BlockDriverState *bs)
1981 if (bs->drv && bs->drv->bdrv_rebind) {
1982 bs->drv->bdrv_rebind(bs);
1986 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1987 BlockDriverState *bs_src)
1989 /* move some fields that need to stay attached to the device */
1991 /* dev info */
1992 bs_dest->dev_ops = bs_src->dev_ops;
1993 bs_dest->dev_opaque = bs_src->dev_opaque;
1994 bs_dest->dev = bs_src->dev;
1995 bs_dest->guest_block_size = bs_src->guest_block_size;
1996 bs_dest->copy_on_read = bs_src->copy_on_read;
1998 bs_dest->enable_write_cache = bs_src->enable_write_cache;
2000 /* i/o throttled req */
2001 memcpy(&bs_dest->throttle_state,
2002 &bs_src->throttle_state,
2003 sizeof(ThrottleState));
2004 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
2005 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
2006 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
2008 /* r/w error */
2009 bs_dest->on_read_error = bs_src->on_read_error;
2010 bs_dest->on_write_error = bs_src->on_write_error;
2012 /* i/o status */
2013 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
2014 bs_dest->iostatus = bs_src->iostatus;
2016 /* dirty bitmap */
2017 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
2019 /* reference count */
2020 bs_dest->refcnt = bs_src->refcnt;
2022 /* job */
2023 bs_dest->job = bs_src->job;
2025 /* keep the same entry in bdrv_states */
2026 pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
2027 bs_src->device_name);
2028 bs_dest->device_list = bs_src->device_list;
2029 memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2030 sizeof(bs_dest->op_blockers));
2034 * Swap bs contents for two image chains while they are live,
2035 * while keeping required fields on the BlockDriverState that is
2036 * actually attached to a device.
2038 * This will modify the BlockDriverState fields, and swap contents
2039 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2041 * bs_new is required to be anonymous.
2043 * This function does not create any image files.
2045 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2047 BlockDriverState tmp;
2049 /* The code needs to swap the node_name but simply swapping node_list won't
2050 * work so first remove the nodes from the graph list, do the swap then
2051 * insert them back if needed.
2053 if (bs_new->node_name[0] != '\0') {
2054 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2056 if (bs_old->node_name[0] != '\0') {
2057 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2060 /* bs_new must be anonymous and shouldn't have anything fancy enabled */
2061 assert(bs_new->device_name[0] == '\0');
2062 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2063 assert(bs_new->job == NULL);
2064 assert(bs_new->dev == NULL);
2065 assert(bs_new->io_limits_enabled == false);
2066 assert(!throttle_have_timer(&bs_new->throttle_state));
2068 tmp = *bs_new;
2069 *bs_new = *bs_old;
2070 *bs_old = tmp;
2072 /* there are some fields that should not be swapped, move them back */
2073 bdrv_move_feature_fields(&tmp, bs_old);
2074 bdrv_move_feature_fields(bs_old, bs_new);
2075 bdrv_move_feature_fields(bs_new, &tmp);
2077 /* bs_new shouldn't be in bdrv_states even after the swap! */
2078 assert(bs_new->device_name[0] == '\0');
2080 /* Check a few fields that should remain attached to the device */
2081 assert(bs_new->dev == NULL);
2082 assert(bs_new->job == NULL);
2083 assert(bs_new->io_limits_enabled == false);
2084 assert(!throttle_have_timer(&bs_new->throttle_state));
2086 /* insert the nodes back into the graph node list if needed */
2087 if (bs_new->node_name[0] != '\0') {
2088 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2090 if (bs_old->node_name[0] != '\0') {
2091 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2094 bdrv_rebind(bs_new);
2095 bdrv_rebind(bs_old);
2099 * Add new bs contents at the top of an image chain while the chain is
2100 * live, while keeping required fields on the top layer.
2102 * This will modify the BlockDriverState fields, and swap contents
2103 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2105 * bs_new is required to be anonymous.
2107 * This function does not create any image files.
2109 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2111 bdrv_swap(bs_new, bs_top);
2113 /* After the swap, bs_new holds the old top image contents, so it
2114 * becomes the backing file of the new top. */
2115 bdrv_set_backing_hd(bs_top, bs_new);
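/* Illustrative sketch (not part of the original file): a live snapshot
 * installs a freshly opened, still-anonymous overlay on top of a device's
 * current image chain. 'overlay' and its setup are hypothetical, and the
 * bdrv_new("") call assumes the anonymous-BDS convention of this era:
 *
 *     BlockDriverState *overlay = bdrv_new("");
 *     ...open the snapshot file into 'overlay'...
 *     bdrv_append(overlay, bs_top);
 *     // bs_top now presents the overlay; its old contents live on in
 *     // 'overlay', which has become the backing file
 */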
2118 static void bdrv_delete(BlockDriverState *bs)
2120 assert(!bs->dev);
2121 assert(!bs->job);
2122 assert(bdrv_op_blocker_is_empty(bs));
2123 assert(!bs->refcnt);
2124 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2126 bdrv_close(bs);
2128 /* remove from list, if necessary */
2129 bdrv_make_anon(bs);
2131 g_free(bs);
2134 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2135 /* TODO change to DeviceState *dev when all users are qdevified */
2137 if (bs->dev) {
2138 return -EBUSY;
2140 bs->dev = dev;
2141 bdrv_iostatus_reset(bs);
2142 return 0;
2145 /* TODO qdevified devices don't use this, remove when devices are qdevified */
2146 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
2148 if (bdrv_attach_dev(bs, dev) < 0) {
2149 abort();
2153 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2154 /* TODO change to DeviceState *dev when all users are qdevified */
2156 assert(bs->dev == dev);
2157 bs->dev = NULL;
2158 bs->dev_ops = NULL;
2159 bs->dev_opaque = NULL;
2160 bs->guest_block_size = 512;
2163 /* TODO change to return DeviceState * when all users are qdevified */
2164 void *bdrv_get_attached_dev(BlockDriverState *bs)
2166 return bs->dev;
2169 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2170 void *opaque)
2172 bs->dev_ops = ops;
2173 bs->dev_opaque = opaque;
2176 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2177 enum MonitorEvent ev,
2178 BlockErrorAction action, bool is_read)
2180 QObject *data;
2181 const char *action_str;
2183 switch (action) {
2184 case BDRV_ACTION_REPORT:
2185 action_str = "report";
2186 break;
2187 case BDRV_ACTION_IGNORE:
2188 action_str = "ignore";
2189 break;
2190 case BDRV_ACTION_STOP:
2191 action_str = "stop";
2192 break;
2193 default:
2194 abort();
2197 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2198 bdrv->device_name,
2199 action_str,
2200 is_read ? "read" : "write");
2201 monitor_protocol_event(ev, data);
2203 qobject_decref(data);
2206 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2208 QObject *data;
2210 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2211 bdrv_get_device_name(bs), ejected);
2212 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2214 qobject_decref(data);
2217 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2219 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2220 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2221 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2222 if (tray_was_closed) {
2223 /* tray open */
2224 bdrv_emit_qmp_eject_event(bs, true);
2226 if (load) {
2227 /* tray close */
2228 bdrv_emit_qmp_eject_event(bs, false);
2233 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2235 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2238 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2240 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2241 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2245 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2247 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2248 return bs->dev_ops->is_tray_open(bs->dev_opaque);
2250 return false;
2253 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2255 if (bs->dev_ops && bs->dev_ops->resize_cb) {
2256 bs->dev_ops->resize_cb(bs->dev_opaque);
2260 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2262 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2263 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2265 return false;
2269 * Run consistency checks on an image
2271 * Returns 0 if the check could be completed (it doesn't mean that the image is
2272 * free of errors) or -errno when an internal error occurred. The results of the
2273 * check are stored in res.
2275 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2277 if (bs->drv->bdrv_check == NULL) {
2278 return -ENOTSUP;
2281 memset(res, 0, sizeof(*res));
2282 return bs->drv->bdrv_check(bs, res, fix);
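/* Illustrative caller sketch (assumption, not from this file); BDRV_FIX_ERRORS
 * and the BdrvCheckResult fields are as declared in block.h of this era:
 *
 *     BdrvCheckResult res;
 *     int ret = bdrv_check(bs, &res, BDRV_FIX_ERRORS);
 *     if (ret < 0) {
 *         // the check itself could not run (e.g. -ENOTSUP)
 *     } else if (res.corruptions || res.leaks) {
 *         // the image was checked and problems were found
 *     }
 */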
2285 #define COMMIT_BUF_SECTORS 2048
2287 /* commit COW file into the raw image */
2288 int bdrv_commit(BlockDriverState *bs)
2290 BlockDriver *drv = bs->drv;
2291 int64_t sector, total_sectors, length, backing_length;
2292 int n, ro, open_flags;
2293 int ret = 0;
2294 uint8_t *buf = NULL;
2295 char filename[PATH_MAX];
2297 if (!drv)
2298 return -ENOMEDIUM;
2300 if (!bs->backing_hd) {
2301 return -ENOTSUP;
2304 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2305 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2306 return -EBUSY;
2309 ro = bs->backing_hd->read_only;
2310 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2311 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2312 open_flags = bs->backing_hd->open_flags;
2314 if (ro) {
2315 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2316 return -EACCES;
2320 length = bdrv_getlength(bs);
2321 if (length < 0) {
2322 ret = length;
2323 goto ro_cleanup;
2326 backing_length = bdrv_getlength(bs->backing_hd);
2327 if (backing_length < 0) {
2328 ret = backing_length;
2329 goto ro_cleanup;
2332 /* If our top snapshot is larger than the backing file image,
2333 * grow the backing file image if possible. If not possible,
2334 * we must return an error */
2335 if (length > backing_length) {
2336 ret = bdrv_truncate(bs->backing_hd, length);
2337 if (ret < 0) {
2338 goto ro_cleanup;
2342 total_sectors = length >> BDRV_SECTOR_BITS;
2343 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2345 for (sector = 0; sector < total_sectors; sector += n) {
2346 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2347 if (ret < 0) {
2348 goto ro_cleanup;
2350 if (ret) {
2351 ret = bdrv_read(bs, sector, buf, n);
2352 if (ret < 0) {
2353 goto ro_cleanup;
2356 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2357 if (ret < 0) {
2358 goto ro_cleanup;
2363 if (drv->bdrv_make_empty) {
2364 ret = drv->bdrv_make_empty(bs);
2365 if (ret < 0) {
2366 goto ro_cleanup;
2368 bdrv_flush(bs);
2372 * Make sure all data we wrote to the backing device is actually
2373 * stable on disk.
2375 if (bs->backing_hd) {
2376 bdrv_flush(bs->backing_hd);
2379 ret = 0;
2380 ro_cleanup:
2381 g_free(buf);
2383 if (ro) {
2384 /* ignoring error return here */
2385 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2388 return ret;
2391 int bdrv_commit_all(void)
2393 BlockDriverState *bs;
2395 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2396 AioContext *aio_context = bdrv_get_aio_context(bs);
2398 aio_context_acquire(aio_context);
2399 if (bs->drv && bs->backing_hd) {
2400 int ret = bdrv_commit(bs);
2401 if (ret < 0) {
2402 aio_context_release(aio_context);
2403 return ret;
2406 aio_context_release(aio_context);
2408 return 0;
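/* Illustrative sketch (assumption): committing a single drive by name,
 * roughly what a monitor "commit <device>" command would do; "ide0-hd0"
 * is a hypothetical device name:
 *
 *     BlockDriverState *bs = bdrv_find("ide0-hd0");
 *     if (bs && bdrv_commit(bs) < 0) {
 *         // report the failure to the user
 *     }
 */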
2412 * Remove an active request from the tracked requests list
2414 * This function should be called when a tracked request is completing.
2416 static void tracked_request_end(BdrvTrackedRequest *req)
2418 if (req->serialising) {
2419 req->bs->serialising_in_flight--;
2422 QLIST_REMOVE(req, list);
2423 qemu_co_queue_restart_all(&req->wait_queue);
2427 * Add an active request to the tracked requests list
2429 static void tracked_request_begin(BdrvTrackedRequest *req,
2430 BlockDriverState *bs,
2431 int64_t offset,
2432 unsigned int bytes, bool is_write)
2434 *req = (BdrvTrackedRequest){
2435 .bs = bs,
2436 .offset = offset,
2437 .bytes = bytes,
2438 .is_write = is_write,
2439 .co = qemu_coroutine_self(),
2440 .serialising = false,
2441 .overlap_offset = offset,
2442 .overlap_bytes = bytes,
2445 qemu_co_queue_init(&req->wait_queue);
2447 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2450 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2452 int64_t overlap_offset = req->offset & ~(align - 1);
2453 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2454 - overlap_offset;
2456 if (!req->serialising) {
2457 req->bs->serialising_in_flight++;
2458 req->serialising = true;
2461 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2462 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
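/* Worked example (illustrative): with req->offset == 1536, req->bytes == 1024
 * and align == 4096, overlap_offset becomes 0 and overlap_bytes becomes 4096,
 * i.e. the request is treated as covering the whole first 4 KiB block for
 * overlap checks.
 */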
2466 * Round a region to cluster boundaries
2468 void bdrv_round_to_clusters(BlockDriverState *bs,
2469 int64_t sector_num, int nb_sectors,
2470 int64_t *cluster_sector_num,
2471 int *cluster_nb_sectors)
2473 BlockDriverInfo bdi;
2475 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2476 *cluster_sector_num = sector_num;
2477 *cluster_nb_sectors = nb_sectors;
2478 } else {
2479 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2480 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2481 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2482 nb_sectors, c);
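/* Worked example (illustrative): with a 64 KiB cluster size, c == 128
 * sectors, so a request for sectors [100, 150) is widened to the cluster
 * range [0, 256): QEMU_ALIGN_DOWN(100, 128) == 0 and
 * QEMU_ALIGN_UP(100 - 0 + 50, 128) == 256.
 */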
2486 static int bdrv_get_cluster_size(BlockDriverState *bs)
2488 BlockDriverInfo bdi;
2489 int ret;
2491 ret = bdrv_get_info(bs, &bdi);
2492 if (ret < 0 || bdi.cluster_size == 0) {
2493 return bs->request_alignment;
2494 } else {
2495 return bdi.cluster_size;
2499 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2500 int64_t offset, unsigned int bytes)
2502 /* aaaa bbbb */
2503 if (offset >= req->overlap_offset + req->overlap_bytes) {
2504 return false;
2506 /* bbbb aaaa */
2507 if (req->overlap_offset >= offset + bytes) {
2508 return false;
2510 return true;
2513 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2515 BlockDriverState *bs = self->bs;
2516 BdrvTrackedRequest *req;
2517 bool retry;
2518 bool waited = false;
2520 if (!bs->serialising_in_flight) {
2521 return false;
2524 do {
2525 retry = false;
2526 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2527 if (req == self || (!req->serialising && !self->serialising)) {
2528 continue;
2530 if (tracked_request_overlaps(req, self->overlap_offset,
2531 self->overlap_bytes))
2533 /* Hitting this means there was a reentrant request, for
2534 * example, a block driver issuing nested requests. This must
2535 * never happen since it means deadlock.
2537 assert(qemu_coroutine_self() != req->co);
2539 /* If the request is already (indirectly) waiting for us, or
2540 * will wait for us as soon as it wakes up, then just go on
2541 * (instead of producing a deadlock in the former case). */
2542 if (!req->waiting_for) {
2543 self->waiting_for = req;
2544 qemu_co_queue_wait(&req->wait_queue);
2545 self->waiting_for = NULL;
2546 retry = true;
2547 waited = true;
2548 break;
2552 } while (retry);
2554 return waited;
2558 * Return values:
2559 * 0 - success
2560 * -EINVAL - backing format specified, but no file
2561 * -ENOSPC - can't update the backing file because no space is left in the
2562 * image file header
2563 * -ENOTSUP - format driver doesn't support changing the backing file
2565 int bdrv_change_backing_file(BlockDriverState *bs,
2566 const char *backing_file, const char *backing_fmt)
2568 BlockDriver *drv = bs->drv;
2569 int ret;
2571 /* Backing file format doesn't make sense without a backing file */
2572 if (backing_fmt && !backing_file) {
2573 return -EINVAL;
2576 if (drv->bdrv_change_backing_file != NULL) {
2577 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2578 } else {
2579 ret = -ENOTSUP;
2582 if (ret == 0) {
2583 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2584 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2586 return ret;
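/* Illustrative caller sketch (assumption; the file and format names are
 * made up):
 *
 *     ret = bdrv_change_backing_file(bs, "base.qcow2", "qcow2");
 *     if (ret == -ENOTSUP) {
 *         // this format driver cannot rewrite the backing file entry
 *     }
 */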
2590 * Finds the image layer in the chain that has 'bs' as its backing file.
2592 * active is the current topmost image.
2594 * Returns NULL if bs is not found in active's image chain,
2595 * or if active == bs.
2597 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2598 BlockDriverState *bs)
2600 BlockDriverState *overlay = NULL;
2601 BlockDriverState *intermediate;
2603 assert(active != NULL);
2604 assert(bs != NULL);
2606 /* if bs is the same as active, then by definition it has no overlay
2608 if (active == bs) {
2609 return NULL;
2612 intermediate = active;
2613 while (intermediate->backing_hd) {
2614 if (intermediate->backing_hd == bs) {
2615 overlay = intermediate;
2616 break;
2618 intermediate = intermediate->backing_hd;
2621 return overlay;
2624 typedef struct BlkIntermediateStates {
2625 BlockDriverState *bs;
2626 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2627 } BlkIntermediateStates;
2631 * Drops images above 'base' up to and including 'top', and sets the image
2632 * above 'top' to have base as its backing file.
2634 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2635 * information in 'bs' can be properly updated.
2637 * E.g., this will convert the following chain:
2638 * bottom <- base <- intermediate <- top <- active
2640 * to
2642 * bottom <- base <- active
2644 * It is allowed for bottom==base, in which case it converts:
2646 * base <- intermediate <- top <- active
2648 * to
2650 * base <- active
2652 * Error conditions:
2653 * if active == top, that is considered an error
2656 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2657 BlockDriverState *base)
2659 BlockDriverState *intermediate;
2660 BlockDriverState *base_bs = NULL;
2661 BlockDriverState *new_top_bs = NULL;
2662 BlkIntermediateStates *intermediate_state, *next;
2663 int ret = -EIO;
2665 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2666 QSIMPLEQ_INIT(&states_to_delete);
2668 if (!top->drv || !base->drv) {
2669 goto exit;
2672 new_top_bs = bdrv_find_overlay(active, top);
2674 if (new_top_bs == NULL) {
2675 /* we could not find the image above 'top', this is an error */
2676 goto exit;
2679 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2680 * to do, no intermediate images */
2681 if (new_top_bs->backing_hd == base) {
2682 ret = 0;
2683 goto exit;
2686 intermediate = top;
2688 /* now we will go down through the list, and add each BDS we find
2689 * into our deletion queue, until we hit the 'base'
2691 while (intermediate) {
2692 intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2693 intermediate_state->bs = intermediate;
2694 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2696 if (intermediate->backing_hd == base) {
2697 base_bs = intermediate->backing_hd;
2698 break;
2700 intermediate = intermediate->backing_hd;
2702 if (base_bs == NULL) {
2703 /* something went wrong, we did not end at the base. Safely
2704 * unravel everything, and exit with error */
2705 goto exit;
2708 /* success - we can delete the intermediate states, and link top->base */
2709 ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2710 base_bs->drv ? base_bs->drv->format_name : "");
2711 if (ret) {
2712 goto exit;
2714 bdrv_set_backing_hd(new_top_bs, base_bs);
2716 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2717 /* so that bdrv_close() does not recursively close the chain */
2718 bdrv_set_backing_hd(intermediate_state->bs, NULL);
2719 bdrv_unref(intermediate_state->bs);
2721 ret = 0;
2723 exit:
2724 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2725 g_free(intermediate_state);
2727 return ret;
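/* Illustrative sketch (assumption): collapsing the chain
 * "base <- sn1 <- sn2 <- active" down to "base <- active":
 *
 *     ret = bdrv_drop_intermediate(active, sn2, base);
 *     // on success, active's backing file points at base, and sn1/sn2
 *     // have been unreferenced
 */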
2731 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2732 size_t size)
2734 int64_t len;
2736 if (size > INT_MAX) {
2737 return -EIO;
2740 if (!bdrv_is_inserted(bs))
2741 return -ENOMEDIUM;
2743 if (bs->growable)
2744 return 0;
2746 len = bdrv_getlength(bs);
2748 if (offset < 0)
2749 return -EIO;
2751 if ((offset > len) || (len - offset < size))
2752 return -EIO;
2754 return 0;
2757 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2758 int nb_sectors)
2760 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2761 return -EIO;
2764 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2765 nb_sectors * BDRV_SECTOR_SIZE);
2768 typedef struct RwCo {
2769 BlockDriverState *bs;
2770 int64_t offset;
2771 QEMUIOVector *qiov;
2772 bool is_write;
2773 int ret;
2774 BdrvRequestFlags flags;
2775 } RwCo;
2777 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2779 RwCo *rwco = opaque;
2781 if (!rwco->is_write) {
2782 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2783 rwco->qiov->size, rwco->qiov,
2784 rwco->flags);
2785 } else {
2786 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2787 rwco->qiov->size, rwco->qiov,
2788 rwco->flags);
2793 * Process a vectored synchronous request using coroutines
2795 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2796 QEMUIOVector *qiov, bool is_write,
2797 BdrvRequestFlags flags)
2799 Coroutine *co;
2800 RwCo rwco = {
2801 .bs = bs,
2802 .offset = offset,
2803 .qiov = qiov,
2804 .is_write = is_write,
2805 .ret = NOT_DONE,
2806 .flags = flags,
2810 * In sync call context, when the vcpu is blocked, this throttling timer
2811 * will not fire, so the I/O throttling function has to be disabled here
2812 * if it has been enabled.
2814 if (bs->io_limits_enabled) {
2815 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2816 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2817 bdrv_io_limits_disable(bs);
2820 if (qemu_in_coroutine()) {
2821 /* Fast-path if already in coroutine context */
2822 bdrv_rw_co_entry(&rwco);
2823 } else {
2824 AioContext *aio_context = bdrv_get_aio_context(bs);
2826 co = qemu_coroutine_create(bdrv_rw_co_entry);
2827 qemu_coroutine_enter(co, &rwco);
2828 while (rwco.ret == NOT_DONE) {
2829 aio_poll(aio_context, true);
2832 return rwco.ret;
2836 * Process a synchronous request using coroutines
2838 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2839 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2841 QEMUIOVector qiov;
2842 struct iovec iov = {
2843 .iov_base = (void *)buf,
2844 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2847 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2848 return -EINVAL;
2851 qemu_iovec_init_external(&qiov, &iov, 1);
2852 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2853 &qiov, is_write, flags);
2856 /* return < 0 if error. See bdrv_write() for the return codes */
2857 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2858 uint8_t *buf, int nb_sectors)
2860 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2863 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2864 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2865 uint8_t *buf, int nb_sectors)
2867 bool enabled;
2868 int ret;
2870 enabled = bs->io_limits_enabled;
2871 bs->io_limits_enabled = false;
2872 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2873 bs->io_limits_enabled = enabled;
2874 return ret;
2877 /* Return < 0 if error. Important errors are:
2878 -EIO generic I/O error (may happen for all errors)
2879 -ENOMEDIUM No media inserted.
2880 -EINVAL Invalid sector number or nb_sectors
2881 -EACCES Trying to write a read-only device
2883 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2884 const uint8_t *buf, int nb_sectors)
2886 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
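/* Illustrative sketch (assumption): synchronous sector I/O through these
 * wrappers, with error handling shortened:
 *
 *     uint8_t sector[BDRV_SECTOR_SIZE];
 *     if (bdrv_read(bs, 0, sector, 1) == 0) {
 *         sector[0] ^= 0xff;
 *         (void) bdrv_write(bs, 0, sector, 1);
 *     }
 */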
2889 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2890 int nb_sectors, BdrvRequestFlags flags)
2892 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2893 BDRV_REQ_ZERO_WRITE | flags);
2897 * Completely zero out a block device with the help of bdrv_write_zeroes.
2898 * The operation is sped up by checking the block status and only writing
2899 * zeroes to sectors that do not already read back as zeroes. Optional
2900 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2902 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2904 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2906 int64_t target_size;
2907 int64_t ret, nb_sectors, sector_num = 0;
2908 int n;
2910 target_size = bdrv_getlength(bs);
2911 if (target_size < 0) {
2912 return target_size;
2914 target_size /= BDRV_SECTOR_SIZE;
2916 for (;;) {
2917 nb_sectors = target_size - sector_num;
2918 if (nb_sectors <= 0) {
2919 return 0;
2921 if (nb_sectors > INT_MAX) {
2922 nb_sectors = INT_MAX;
2924 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2925 if (ret < 0) {
2926 error_report("error getting block status at sector %" PRId64 ": %s",
2927 sector_num, strerror(-ret));
2928 return ret;
2930 if (ret & BDRV_BLOCK_ZERO) {
2931 sector_num += n;
2932 continue;
2934 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2935 if (ret < 0) {
2936 error_report("error writing zeroes at sector %" PRId64 ": %s",
2937 sector_num, strerror(-ret));
2938 return ret;
2940 sector_num += n;
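/* Illustrative sketch (assumption): a caller that is happy for zeroed
 * ranges to be discarded rather than written passes the flag through:
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 */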
2944 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2946 QEMUIOVector qiov;
2947 struct iovec iov = {
2948 .iov_base = (void *)buf,
2949 .iov_len = bytes,
2951 int ret;
2953 if (bytes < 0) {
2954 return -EINVAL;
2957 qemu_iovec_init_external(&qiov, &iov, 1);
2958 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2959 if (ret < 0) {
2960 return ret;
2963 return bytes;
2966 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2968 int ret;
2970 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2971 if (ret < 0) {
2972 return ret;
2975 return qiov->size;
2978 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2979 const void *buf, int bytes)
2981 QEMUIOVector qiov;
2982 struct iovec iov = {
2983 .iov_base = (void *) buf,
2984 .iov_len = bytes,
2987 if (bytes < 0) {
2988 return -EINVAL;
2991 qemu_iovec_init_external(&qiov, &iov, 1);
2992 return bdrv_pwritev(bs, offset, &qiov);
2996 * Writes to the file and ensures that no writes are reordered across this
2997 * request (acts as a barrier)
2999 * Returns 0 on success, -errno in error cases.
3001 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
3002 const void *buf, int count)
3004 int ret;
3006 ret = bdrv_pwrite(bs, offset, buf, count);
3007 if (ret < 0) {
3008 return ret;
3011 /* No flush needed for cache modes that already do it */
3012 if (bs->enable_write_cache) {
3013 bdrv_flush(bs);
3016 return 0;
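/* Illustrative sketch (assumption): format drivers typically use the
 * barrier variant for metadata that later writes depend on; 'header' is
 * a hypothetical on-disk structure:
 *
 *     ret = bdrv_pwrite_sync(bs->file, 0, &header, sizeof(header));
 */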
3019 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
3020 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3022 /* Perform I/O through a temporary buffer so that users who scribble over
3023 * their read buffer while the operation is in progress do not end up
3024 * modifying the image file. This is critical for zero-copy guest I/O
3025 * where anything might happen inside guest memory.
3027 void *bounce_buffer;
3029 BlockDriver *drv = bs->drv;
3030 struct iovec iov;
3031 QEMUIOVector bounce_qiov;
3032 int64_t cluster_sector_num;
3033 int cluster_nb_sectors;
3034 size_t skip_bytes;
3035 int ret;
3037 /* Cover entire cluster so no additional backing file I/O is required when
3038 * allocating a cluster in the image file.
3040 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
3041 &cluster_sector_num, &cluster_nb_sectors);
3043 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
3044 cluster_sector_num, cluster_nb_sectors);
3046 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
3047 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
3048 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
3050 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
3051 &bounce_qiov);
3052 if (ret < 0) {
3053 goto err;
3056 if (drv->bdrv_co_write_zeroes &&
3057 buffer_is_zero(bounce_buffer, iov.iov_len)) {
3058 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
3059 cluster_nb_sectors, 0);
3060 } else {
3061 /* This does not change the data on the disk, so it is not necessary
3062 * to flush even in cache=writethrough mode.
3064 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
3065 &bounce_qiov);
3068 if (ret < 0) {
3069 /* It might be okay to ignore write errors for guest requests. If this
3070 * is a deliberate copy-on-read then we don't want to ignore the error.
3071 * Simply report it in all cases.
3073 goto err;
3076 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
3077 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3078 nb_sectors * BDRV_SECTOR_SIZE);
3080 err:
3081 qemu_vfree(bounce_buffer);
3082 return ret;
3086 * Forwards an already correctly aligned request to the BlockDriver. This
3087 * handles copy on read and zeroing after EOF; any other features must be
3088 * implemented by the caller.
3090 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
3091 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3092 int64_t align, QEMUIOVector *qiov, int flags)
3094 BlockDriver *drv = bs->drv;
3095 int ret;
3097 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3098 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3100 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3101 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3103 /* Handle Copy on Read and associated serialisation */
3104 if (flags & BDRV_REQ_COPY_ON_READ) {
3105 /* If we touch the same cluster it counts as an overlap. This
3106 * guarantees that allocating writes will be serialized and not race
3107 * with each other for the same cluster. For example, in copy-on-read
3108 * it ensures that the CoR read and write operations are atomic and
3109 * guest writes cannot interleave between them. */
3110 mark_request_serialising(req, bdrv_get_cluster_size(bs));
3113 wait_serialising_requests(req);
3115 if (flags & BDRV_REQ_COPY_ON_READ) {
3116 int pnum;
3118 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3119 if (ret < 0) {
3120 goto out;
3123 if (!ret || pnum != nb_sectors) {
3124 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3125 goto out;
3129 /* Forward the request to the BlockDriver */
3130 if (!(bs->zero_beyond_eof && bs->growable)) {
3131 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3132 } else {
3133 /* Read zeros after EOF of growable BDSes */
3134 int64_t len, total_sectors, max_nb_sectors;
3136 len = bdrv_getlength(bs);
3137 if (len < 0) {
3138 ret = len;
3139 goto out;
3142 total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
3143 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3144 align >> BDRV_SECTOR_BITS);
3145 if (max_nb_sectors > 0) {
3146 ret = drv->bdrv_co_readv(bs, sector_num,
3147 MIN(nb_sectors, max_nb_sectors), qiov);
3148 } else {
3149 ret = 0;
3152 /* Reading beyond end of file is supposed to produce zeroes */
3153 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3154 uint64_t offset = MAX(0, total_sectors - sector_num);
3155 uint64_t bytes = (sector_num + nb_sectors - offset) *
3156 BDRV_SECTOR_SIZE;
3157 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3161 out:
3162 return ret;
3166 * Handle a read request in coroutine context
3168 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3169 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3170 BdrvRequestFlags flags)
3172 BlockDriver *drv = bs->drv;
3173 BdrvTrackedRequest req;
3175 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3176 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3177 uint8_t *head_buf = NULL;
3178 uint8_t *tail_buf = NULL;
3179 QEMUIOVector local_qiov;
3180 bool use_local_qiov = false;
3181 int ret;
3183 if (!drv) {
3184 return -ENOMEDIUM;
3186 if (bdrv_check_byte_request(bs, offset, bytes)) {
3187 return -EIO;
3190 if (bs->copy_on_read) {
3191 flags |= BDRV_REQ_COPY_ON_READ;
3194 /* throttling disk I/O */
3195 if (bs->io_limits_enabled) {
3196 bdrv_io_limits_intercept(bs, bytes, false);
3199 /* Align read if necessary by padding qiov */
3200 if (offset & (align - 1)) {
3201 head_buf = qemu_blockalign(bs, align);
3202 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3203 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3204 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3205 use_local_qiov = true;
3207 bytes += offset & (align - 1);
3208 offset = offset & ~(align - 1);
3211 if ((offset + bytes) & (align - 1)) {
3212 if (!use_local_qiov) {
3213 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3214 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3215 use_local_qiov = true;
3217 tail_buf = qemu_blockalign(bs, align);
3218 qemu_iovec_add(&local_qiov, tail_buf,
3219 align - ((offset + bytes) & (align - 1)));
3221 bytes = ROUND_UP(bytes, align);
3224 tracked_request_begin(&req, bs, offset, bytes, false);
3225 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3226 use_local_qiov ? &local_qiov : qiov,
3227 flags);
3228 tracked_request_end(&req);
3230 if (use_local_qiov) {
3231 qemu_iovec_destroy(&local_qiov);
3232 qemu_vfree(head_buf);
3233 qemu_vfree(tail_buf);
3236 return ret;
3239 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3240 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3241 BdrvRequestFlags flags)
3243 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3244 return -EINVAL;
3247 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3248 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3251 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3252 int nb_sectors, QEMUIOVector *qiov)
3254 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3256 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3259 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3260 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3262 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3264 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3265 BDRV_REQ_COPY_ON_READ);
3268 /* if no limit is specified in the BlockLimits, use a default
3269 * of 32768 512-byte sectors (16 MiB) per request.
3271 #define MAX_WRITE_ZEROES_DEFAULT 32768
3273 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3274 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3276 BlockDriver *drv = bs->drv;
3277 QEMUIOVector qiov;
3278 struct iovec iov = {0};
3279 int ret = 0;
3281 int max_write_zeroes = bs->bl.max_write_zeroes ?
3282 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3284 while (nb_sectors > 0 && !ret) {
3285 int num = nb_sectors;
3287 /* Align request. Block drivers can expect the "bulk" of the request
3288 * to be aligned.
3290 if (bs->bl.write_zeroes_alignment
3291 && num > bs->bl.write_zeroes_alignment) {
3292 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3293 /* Make a small request up to the first aligned sector. */
3294 num = bs->bl.write_zeroes_alignment;
3295 num -= sector_num % bs->bl.write_zeroes_alignment;
3296 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3297 /* Shorten the request to the last aligned sector. num cannot
3298 * underflow because num > bs->bl.write_zeroes_alignment.
3300 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3304 /* limit request size */
3305 if (num > max_write_zeroes) {
3306 num = max_write_zeroes;
3309 ret = -ENOTSUP;
3310 /* First try the efficient write zeroes operation */
3311 if (drv->bdrv_co_write_zeroes) {
3312 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3315 if (ret == -ENOTSUP) {
3316 /* Fall back to bounce buffer if write zeroes is unsupported */
3317 iov.iov_len = num * BDRV_SECTOR_SIZE;
3318 if (iov.iov_base == NULL) {
3319 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3320 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3322 qemu_iovec_init_external(&qiov, &iov, 1);
3324 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3326 /* Keep bounce buffer around if it is big enough for all
3327 * future requests.
3329 if (num < max_write_zeroes) {
3330 qemu_vfree(iov.iov_base);
3331 iov.iov_base = NULL;
3335 sector_num += num;
3336 nb_sectors -= num;
3339 qemu_vfree(iov.iov_base);
3340 return ret;
3344 * Forwards an already correctly aligned write request to the BlockDriver.
3346 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3347 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3348 QEMUIOVector *qiov, int flags)
3350 BlockDriver *drv = bs->drv;
3351 bool waited;
3352 int ret;
3354 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3355 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3357 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3358 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3360 waited = wait_serialising_requests(req);
3361 assert(!waited || !req->serialising);
3362 assert(req->overlap_offset <= offset);
3363 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3365 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3367 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3368 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3369 qemu_iovec_is_zero(qiov)) {
3370 flags |= BDRV_REQ_ZERO_WRITE;
3371 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3372 flags |= BDRV_REQ_MAY_UNMAP;
3376 if (ret < 0) {
3377 /* Do nothing, write notifier decided to fail this request */
3378 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3379 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3380 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3381 } else {
3382 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3383 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3385 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3387 if (ret == 0 && !bs->enable_write_cache) {
3388 ret = bdrv_co_flush(bs);
3391 bdrv_set_dirty(bs, sector_num, nb_sectors);
3393 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3394 bs->wr_highest_sector = sector_num + nb_sectors - 1;
3396 if (bs->growable && ret >= 0) {
3397 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3400 return ret;
3404 * Handle a write request in coroutine context
3406 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3407 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3408 BdrvRequestFlags flags)
3410 BdrvTrackedRequest req;
3411 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3412 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3413 uint8_t *head_buf = NULL;
3414 uint8_t *tail_buf = NULL;
3415 QEMUIOVector local_qiov;
3416 bool use_local_qiov = false;
3417 int ret;
3419 if (!bs->drv) {
3420 return -ENOMEDIUM;
3422 if (bs->read_only) {
3423 return -EACCES;
3425 if (bdrv_check_byte_request(bs, offset, bytes)) {
3426 return -EIO;
3429 /* throttling disk I/O */
3430 if (bs->io_limits_enabled) {
3431 bdrv_io_limits_intercept(bs, bytes, true);
3435 * Align write if necessary by performing a read-modify-write cycle.
3436 * Pad qiov with the read parts and be sure to have a tracked request not
3437 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3439 tracked_request_begin(&req, bs, offset, bytes, true);
3441 if (offset & (align - 1)) {
3442 QEMUIOVector head_qiov;
3443 struct iovec head_iov;
3445 mark_request_serialising(&req, align);
3446 wait_serialising_requests(&req);
3448 head_buf = qemu_blockalign(bs, align);
3449 head_iov = (struct iovec) {
3450 .iov_base = head_buf,
3451 .iov_len = align,
3453 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3455 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3456 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3457 align, &head_qiov, 0);
3458 if (ret < 0) {
3459 goto fail;
3461 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3463 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3464 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3465 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3466 use_local_qiov = true;
3468 bytes += offset & (align - 1);
3469 offset = offset & ~(align - 1);
3472 if ((offset + bytes) & (align - 1)) {
3473 QEMUIOVector tail_qiov;
3474 struct iovec tail_iov;
3475 size_t tail_bytes;
3476 bool waited;
3478 mark_request_serialising(&req, align);
3479 waited = wait_serialising_requests(&req);
3480 assert(!waited || !use_local_qiov);
3482 tail_buf = qemu_blockalign(bs, align);
3483 tail_iov = (struct iovec) {
3484 .iov_base = tail_buf,
3485 .iov_len = align,
3487 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3489 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3490 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3491 align, &tail_qiov, 0);
3492 if (ret < 0) {
3493 goto fail;
3495 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3497 if (!use_local_qiov) {
3498 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3499 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3500 use_local_qiov = true;
3503 tail_bytes = (offset + bytes) & (align - 1);
3504 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3506 bytes = ROUND_UP(bytes, align);
3509 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3510 use_local_qiov ? &local_qiov : qiov,
3511 flags);
3513 fail:
3514 tracked_request_end(&req);
3516 if (use_local_qiov) {
3517 qemu_iovec_destroy(&local_qiov);
3519 qemu_vfree(head_buf);
3520 qemu_vfree(tail_buf);
3522 return ret;
3525 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3526 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3527 BdrvRequestFlags flags)
3529 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3530 return -EINVAL;
3533 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3534 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3537 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3538 int nb_sectors, QEMUIOVector *qiov)
3540 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3542 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3545 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3546 int64_t sector_num, int nb_sectors,
3547 BdrvRequestFlags flags)
3549 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3551 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3552 flags &= ~BDRV_REQ_MAY_UNMAP;
3555 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3556 BDRV_REQ_ZERO_WRITE | flags);
3560 * Truncate file to 'offset' bytes (needed only for file protocols)
3562 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3564 BlockDriver *drv = bs->drv;
3565 int ret;
3566 if (!drv)
3567 return -ENOMEDIUM;
3568 if (!drv->bdrv_truncate)
3569 return -ENOTSUP;
3570 if (bs->read_only)
3571 return -EACCES;
3572 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_RESIZE, NULL)) {
3573 return -EBUSY;
3575 ret = drv->bdrv_truncate(bs, offset);
3576 if (ret == 0) {
3577 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3578 bdrv_dev_resize_cb(bs);
3580 return ret;
3584 * Length of an allocated file in bytes. Sparse files are counted by actual
3585 * allocated space. Return < 0 if error or unknown.
3587 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3589 BlockDriver *drv = bs->drv;
3590 if (!drv) {
3591 return -ENOMEDIUM;
3593 if (drv->bdrv_get_allocated_file_size) {
3594 return drv->bdrv_get_allocated_file_size(bs);
3596 if (bs->file) {
3597 return bdrv_get_allocated_file_size(bs->file);
3599 return -ENOTSUP;
3603 * Length of a file in bytes. Return < 0 if error or unknown.
3605 int64_t bdrv_getlength(BlockDriverState *bs)
3607 BlockDriver *drv = bs->drv;
3608 if (!drv)
3609 return -ENOMEDIUM;
3611 if (drv->has_variable_length) {
3612 int ret = refresh_total_sectors(bs, bs->total_sectors);
3613 if (ret < 0) {
3614 return ret;
3617 return bs->total_sectors * BDRV_SECTOR_SIZE;
3620 /* return 0 as number of sectors if no device present or error */
3621 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3623 int64_t length;
3624 length = bdrv_getlength(bs);
3625 if (length < 0)
3626 length = 0;
3627 else
3628 length = length >> BDRV_SECTOR_BITS;
3629 *nb_sectors_ptr = length;
3632 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3633 BlockdevOnError on_write_error)
3635 bs->on_read_error = on_read_error;
3636 bs->on_write_error = on_write_error;
3639 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3641 return is_read ? bs->on_read_error : bs->on_write_error;
3644 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3646 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3648 switch (on_err) {
3649 case BLOCKDEV_ON_ERROR_ENOSPC:
3650 return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3651 case BLOCKDEV_ON_ERROR_STOP:
3652 return BDRV_ACTION_STOP;
3653 case BLOCKDEV_ON_ERROR_REPORT:
3654 return BDRV_ACTION_REPORT;
3655 case BLOCKDEV_ON_ERROR_IGNORE:
3656 return BDRV_ACTION_IGNORE;
3657 default:
3658 abort();
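/* Illustrative example (assumption): with on_read_error set to
 * BLOCKDEV_ON_ERROR_ENOSPC, only ENOSPC stops the VM:
 *
 *     BlockErrorAction a = bdrv_get_error_action(bs, true, ENOSPC);
 *     // a == BDRV_ACTION_STOP; any other errno yields BDRV_ACTION_REPORT
 */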
3662 /* This is done by device models because, while the block layer knows
3663 * about the error, it does not know whether an operation comes from
3664 * the device or the block layer (from a job, for example).
3666 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3667 bool is_read, int error)
3669 assert(error >= 0);
3670 bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3671 if (action == BDRV_ACTION_STOP) {
3672 vm_stop(RUN_STATE_IO_ERROR);
3673 bdrv_iostatus_set_err(bs, error);
3677 int bdrv_is_read_only(BlockDriverState *bs)
3679 return bs->read_only;
3682 int bdrv_is_sg(BlockDriverState *bs)
3684 return bs->sg;
3687 int bdrv_enable_write_cache(BlockDriverState *bs)
3689 return bs->enable_write_cache;
3692 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3694 bs->enable_write_cache = wce;
3696 /* so a reopen() will preserve wce */
3697 if (wce) {
3698 bs->open_flags |= BDRV_O_CACHE_WB;
3699 } else {
3700 bs->open_flags &= ~BDRV_O_CACHE_WB;
3704 int bdrv_is_encrypted(BlockDriverState *bs)
3706 if (bs->backing_hd && bs->backing_hd->encrypted)
3707 return 1;
3708 return bs->encrypted;
3711 int bdrv_key_required(BlockDriverState *bs)
3713 BlockDriverState *backing_hd = bs->backing_hd;
3715 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3716 return 1;
3717 return (bs->encrypted && !bs->valid_key);
3720 int bdrv_set_key(BlockDriverState *bs, const char *key)
3722 int ret;
3723 if (bs->backing_hd && bs->backing_hd->encrypted) {
3724 ret = bdrv_set_key(bs->backing_hd, key);
3725 if (ret < 0)
3726 return ret;
3727 if (!bs->encrypted)
3728 return 0;
3730 if (!bs->encrypted) {
3731 return -EINVAL;
3732 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3733 return -ENOMEDIUM;
3735 ret = bs->drv->bdrv_set_key(bs, key);
3736 if (ret < 0) {
3737 bs->valid_key = 0;
3738 } else if (!bs->valid_key) {
3739 bs->valid_key = 1;
3740 /* call the change callback now, we skipped it on open */
3741 bdrv_dev_change_media_cb(bs, true);
3743 return ret;
3746 const char *bdrv_get_format_name(BlockDriverState *bs)
3748 return bs->drv ? bs->drv->format_name : NULL;
3751 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3752 void *opaque)
3754 BlockDriver *drv;
3755 int count = 0;
3756 const char **formats = NULL;
3758 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3759 if (drv->format_name) {
3760 bool found = false;
3761 int i = count;
3762 while (formats && i && !found) {
3763 found = !strcmp(formats[--i], drv->format_name);
3766 if (!found) {
3767 formats = g_realloc(formats, (count + 1) * sizeof(char *));
3768 formats[count++] = drv->format_name;
3769 it(opaque, drv->format_name);
3773 g_free(formats);
3776 /* Find the block backend BDS with the given device name */
3777 BlockDriverState *bdrv_find(const char *name)
3779 BlockDriverState *bs;
3781 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3782 if (!strcmp(name, bs->device_name)) {
3783 return bs;
3786 return NULL;
3789 /* Find a node in the BDS graph by its node name */
3790 BlockDriverState *bdrv_find_node(const char *node_name)
3792 BlockDriverState *bs;
3794 assert(node_name);
3796 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3797 if (!strcmp(node_name, bs->node_name)) {
3798 return bs;
3801 return NULL;
3804 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3805 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3807 BlockDeviceInfoList *list, *entry;
3808 BlockDriverState *bs;
3810 list = NULL;
3811 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3812 entry = g_malloc0(sizeof(*entry));
3813 entry->value = bdrv_block_device_info(bs);
3814 entry->next = list;
3815 list = entry;
3818 return list;
3821 BlockDriverState *bdrv_lookup_bs(const char *device,
3822 const char *node_name,
3823 Error **errp)
3825 BlockDriverState *bs = NULL;
3827 if (device) {
3828 bs = bdrv_find(device);
3830 if (bs) {
3831 return bs;
3835 if (node_name) {
3836 bs = bdrv_find_node(node_name);
3838 if (bs) {
3839 return bs;
3843 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3844 device ? device : "",
3845 node_name ? node_name : "");
3846 return NULL;
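/* Illustrative sketch (assumption): QMP commands that accept either a
 * device name or a node name resolve the reference in one step:
 *
 *     bs = bdrv_lookup_bs(has_device ? device : NULL,
 *                         has_node_name ? node_name : NULL, errp);
 */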
3849 BlockDriverState *bdrv_next(BlockDriverState *bs)
3851 if (!bs) {
3852 return QTAILQ_FIRST(&bdrv_states);
3854 return QTAILQ_NEXT(bs, device_list);
3857 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3859 BlockDriverState *bs;
3861 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3862 it(opaque, bs);
3866 const char *bdrv_get_device_name(BlockDriverState *bs)
3868 return bs->device_name;
3871 int bdrv_get_flags(BlockDriverState *bs)
3873 return bs->open_flags;
3876 int bdrv_flush_all(void)
3878 BlockDriverState *bs;
3879 int result = 0;
3881 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3882 AioContext *aio_context = bdrv_get_aio_context(bs);
3883 int ret;
3885 aio_context_acquire(aio_context);
3886 ret = bdrv_flush(bs);
3887 if (ret < 0 && !result) {
3888 result = ret;
3890 aio_context_release(aio_context);
3893 return result;
3896 int bdrv_has_zero_init_1(BlockDriverState *bs)
3898 return 1;
3901 int bdrv_has_zero_init(BlockDriverState *bs)
3903 assert(bs->drv);
3905 /* If BS is a copy on write image, it is initialized to
3906 the contents of the base image, which may not be zeroes. */
3907 if (bs->backing_hd) {
3908 return 0;
3910 if (bs->drv->bdrv_has_zero_init) {
3911 return bs->drv->bdrv_has_zero_init(bs);
3914 /* safe default */
3915 return 0;
3918 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3920 BlockDriverInfo bdi;
3922 if (bs->backing_hd) {
3923 return false;
3926 if (bdrv_get_info(bs, &bdi) == 0) {
3927 return bdi.unallocated_blocks_are_zero;
3930 return false;
3933 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3935 BlockDriverInfo bdi;
3937 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3938 return false;
3941 if (bdrv_get_info(bs, &bdi) == 0) {
3942 return bdi.can_write_zeroes_with_unmap;
3945 return false;
3948 typedef struct BdrvCoGetBlockStatusData {
3949 BlockDriverState *bs;
3950 BlockDriverState *base;
3951 int64_t sector_num;
3952 int nb_sectors;
3953 int *pnum;
3954 int64_t ret;
3955 bool done;
3956 } BdrvCoGetBlockStatusData;
3959 * Returns true iff the specified sector is present in the disk image. Drivers
3960 * not implementing the functionality are assumed to not support backing files,
3961 * hence all their sectors are reported as allocated.
3963 * If 'sector_num' is beyond the end of the disk image the return value is 0
3964 * and 'pnum' is set to 0.
3966 * 'pnum' is set to the number of sectors (including and immediately following
3967 * the specified sector) that are known to be in the same
3968 * allocated/unallocated state.
3970 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3971 * beyond the end of the disk image it will be clamped.
3973 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3974 int64_t sector_num,
3975 int nb_sectors, int *pnum)
3977 int64_t length;
3978 int64_t n;
3979 int64_t ret, ret2;
3981 length = bdrv_getlength(bs);
3982 if (length < 0) {
3983 return length;
3986 if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3987 *pnum = 0;
3988 return 0;
3991 n = bs->total_sectors - sector_num;
3992 if (n < nb_sectors) {
3993 nb_sectors = n;
3996 if (!bs->drv->bdrv_co_get_block_status) {
3997 *pnum = nb_sectors;
3998 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3999 if (bs->drv->protocol_name) {
4000 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
4002 return ret;
4005 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
4006 if (ret < 0) {
4007 *pnum = 0;
4008 return ret;
4011 if (ret & BDRV_BLOCK_RAW) {
4012 assert(ret & BDRV_BLOCK_OFFSET_VALID);
4013 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4014 *pnum, pnum);
4017 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4018 ret |= BDRV_BLOCK_ALLOCATED;
4021 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4022 if (bdrv_unallocated_blocks_are_zero(bs)) {
4023 ret |= BDRV_BLOCK_ZERO;
4024 } else if (bs->backing_hd) {
4025 BlockDriverState *bs2 = bs->backing_hd;
4026 int64_t length2 = bdrv_getlength(bs2);
4027 if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
4028 ret |= BDRV_BLOCK_ZERO;
4033 if (bs->file &&
4034 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4035 (ret & BDRV_BLOCK_OFFSET_VALID)) {
4036 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4037 *pnum, pnum);
4038 if (ret2 >= 0) {
4039 /* Ignore errors. This is just providing extra information; it
4040 * is useful but not necessary.
4042 ret |= (ret2 & BDRV_BLOCK_ZERO);
4046 return ret;
4049 /* Coroutine wrapper for bdrv_get_block_status() */
4050 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4052 BdrvCoGetBlockStatusData *data = opaque;
4053 BlockDriverState *bs = data->bs;
4055 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4056 data->pnum);
4057 data->done = true;
4061 * Synchronous wrapper around bdrv_co_get_block_status().
4063 * See bdrv_co_get_block_status() for details.
4065 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4066 int nb_sectors, int *pnum)
4068 Coroutine *co;
4069 BdrvCoGetBlockStatusData data = {
4070 .bs = bs,
4071 .sector_num = sector_num,
4072 .nb_sectors = nb_sectors,
4073 .pnum = pnum,
4074 .done = false,
4077 if (qemu_in_coroutine()) {
4078 /* Fast-path if already in coroutine context */
4079 bdrv_get_block_status_co_entry(&data);
4080 } else {
4081 AioContext *aio_context = bdrv_get_aio_context(bs);
4083 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4084 qemu_coroutine_enter(co, &data);
4085 while (!data.done) {
4086 aio_poll(aio_context, true);
4089 return data.ret;
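/* Illustrative caller sketch (assumption): probing a region and decoding
 * the returned flags:
 *
 *     int pnum;
 *     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &pnum);
 *     if (ret >= 0 && (ret & BDRV_BLOCK_ZERO)) {
 *         // the next 'pnum' sectors read back as zeroes
 *     }
 */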
4092 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4093 int nb_sectors, int *pnum)
4095 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4096 if (ret < 0) {
4097 return ret;
4099 return (ret & BDRV_BLOCK_ALLOCATED);
4103 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4105 * Return true if the given sector is allocated in any image between
4106 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4107 * sector is allocated in any image of the chain. Return false otherwise.
4109 * 'pnum' is set to the number of sectors (including and immediately following
4110 * the specified sector) that are known to be in the same
4111 * allocated/unallocated state.
4114 int bdrv_is_allocated_above(BlockDriverState *top,
4115 BlockDriverState *base,
4116 int64_t sector_num,
4117 int nb_sectors, int *pnum)
4119 BlockDriverState *intermediate;
4120 int ret, n = nb_sectors;
4122 intermediate = top;
4123 while (intermediate && intermediate != base) {
4124 int pnum_inter;
4125 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4126 &pnum_inter);
4127 if (ret < 0) {
4128 return ret;
4129 } else if (ret) {
4130 *pnum = pnum_inter;
4131 return 1;
4135 * [sector_num, nb_sectors] is unallocated on top but intermediate
4136 * might have
4138 * [sector_num+x, nb_sectors] allocated.
4140 if (n > pnum_inter &&
4141 (intermediate == top ||
4142 sector_num + pnum_inter < intermediate->total_sectors)) {
4143 n = pnum_inter;
4146 intermediate = intermediate->backing_hd;
4149 *pnum = n;
4150 return 0;
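/* Illustrative sketch (assumption): deciding whether a range still needs
 * copying, e.g. in a streaming job:
 *
 *     int n;
 *     if (bdrv_is_allocated_above(top, base, sector_num, nb_sectors, &n)) {
 *         // at least the first 'n' sectors are allocated between base and top
 *     }
 */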
4153 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4155 if (bs->backing_hd && bs->backing_hd->encrypted)
4156 return bs->backing_file;
4157 else if (bs->encrypted)
4158 return bs->filename;
4159 else
4160 return NULL;
4163 void bdrv_get_backing_filename(BlockDriverState *bs,
4164 char *filename, int filename_size)
4166 pstrcpy(filename, filename_size, bs->backing_file);
4169 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4170 const uint8_t *buf, int nb_sectors)
4172 BlockDriver *drv = bs->drv;
4173 if (!drv)
4174 return -ENOMEDIUM;
4175 if (!drv->bdrv_write_compressed)
4176 return -ENOTSUP;
4177 if (bdrv_check_request(bs, sector_num, nb_sectors))
4178 return -EIO;
4180 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4182 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4185 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4187 BlockDriver *drv = bs->drv;
4188 if (!drv)
4189 return -ENOMEDIUM;
4190 if (!drv->bdrv_get_info)
4191 return -ENOTSUP;
4192 memset(bdi, 0, sizeof(*bdi));
4193 return drv->bdrv_get_info(bs, bdi);
4196 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4198 BlockDriver *drv = bs->drv;
4199 if (drv && drv->bdrv_get_specific_info) {
4200 return drv->bdrv_get_specific_info(bs);
4202 return NULL;
4205 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4206 int64_t pos, int size)
4208 QEMUIOVector qiov;
4209 struct iovec iov = {
4210 .iov_base = (void *) buf,
4211 .iov_len = size,
4214 qemu_iovec_init_external(&qiov, &iov, 1);
4215 return bdrv_writev_vmstate(bs, &qiov, pos);
4218 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4220 BlockDriver *drv = bs->drv;
4222 if (!drv) {
4223 return -ENOMEDIUM;
4224 } else if (drv->bdrv_save_vmstate) {
4225 return drv->bdrv_save_vmstate(bs, qiov, pos);
4226 } else if (bs->file) {
4227 return bdrv_writev_vmstate(bs->file, qiov, pos);
4230 return -ENOTSUP;
4233 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4234 int64_t pos, int size)
4236 BlockDriver *drv = bs->drv;
4237 if (!drv)
4238 return -ENOMEDIUM;
4239 if (drv->bdrv_load_vmstate)
4240 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4241 if (bs->file)
4242 return bdrv_load_vmstate(bs->file, buf, pos, size);
4243 return -ENOTSUP;
4246 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4248 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4249 return;
4252 bs->drv->bdrv_debug_event(bs, event);
4255 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4256 const char *tag)
4258 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4259 bs = bs->file;
4262 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4263 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4266 return -ENOTSUP;
4269 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4271 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4272 bs = bs->file;
4275 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4276 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4279 return -ENOTSUP;
4282 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4284 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4285 bs = bs->file;
4288 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4289 return bs->drv->bdrv_debug_resume(bs, tag);
4292 return -ENOTSUP;
4295 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4297 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4298 bs = bs->file;
4301 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4302 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4305 return false;
4308 int bdrv_is_snapshot(BlockDriverState *bs)
4310 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4313 /* backing_file can either be relative, or absolute, or a protocol. If it is
4314 * relative, it must be relative to the chain. So, passing in bs->filename
4315 * from a BDS as backing_file should not be done, as that may be relative to
4316 * the CWD rather than the chain. */
4317 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4318 const char *backing_file)
4320 char *filename_full = NULL;
4321 char *backing_file_full = NULL;
4322 char *filename_tmp = NULL;
4323 int is_protocol = 0;
4324 BlockDriverState *curr_bs = NULL;
4325 BlockDriverState *retval = NULL;
4327 if (!bs || !bs->drv || !backing_file) {
4328 return NULL;
4331 filename_full = g_malloc(PATH_MAX);
4332 backing_file_full = g_malloc(PATH_MAX);
4333 filename_tmp = g_malloc(PATH_MAX);
4335 is_protocol = path_has_protocol(backing_file);
4337 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4339 /* If either of the filename paths is actually a protocol, then
4340 * compare unmodified paths; otherwise make paths relative */
4341 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4342 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4343 retval = curr_bs->backing_hd;
4344 break;
4346 } else {
4347 /* If not an absolute filename path, make it relative to the current
4348 * image's filename path */
4349 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4350 backing_file);
4352 /* We are going to compare absolute pathnames */
4353 if (!realpath(filename_tmp, filename_full)) {
4354 continue;
4357 /* We need to make sure the backing filename we are comparing against
4358 * is relative to the current image filename (or absolute) */
4359 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4360 curr_bs->backing_file);
4362 if (!realpath(filename_tmp, backing_file_full)) {
4363 continue;
4366 if (strcmp(backing_file_full, filename_full) == 0) {
4367 retval = curr_bs->backing_hd;
4368 break;
4373 g_free(filename_full);
4374 g_free(backing_file_full);
4375 g_free(filename_tmp);
4376 return retval;
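/*
 * [Editor's illustrative sketch.] Following the comment above, the caller
 * passes the backing-file string as recorded in the overlay (here a
 * hypothetical relative name), not bs->filename; each candidate is then
 * resolved relative to the image that references it.
 */
static BlockDriverState *example_find_base(BlockDriverState *overlay)
{
    return bdrv_find_backing_image(overlay, "base.qcow2");
}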
4379 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4381 if (!bs->drv) {
4382 return 0;
4385 if (!bs->backing_hd) {
4386 return 0;
4389 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4392 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4394 BlockDriverState *curr_bs = NULL;
4396 if (!bs) {
4397 return NULL;
4400 curr_bs = bs;
4402 while (curr_bs->backing_hd) {
4403 curr_bs = curr_bs->backing_hd;
4405 return curr_bs;
4408 /**************************************************************/
4409 /* async I/Os */
4411 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4412 QEMUIOVector *qiov, int nb_sectors,
4413 BlockDriverCompletionFunc *cb, void *opaque)
4415 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4417 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4418 cb, opaque, false);
4421 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4422 QEMUIOVector *qiov, int nb_sectors,
4423 BlockDriverCompletionFunc *cb, void *opaque)
4425 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4427 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4428 cb, opaque, true);
4431 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4432 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4433 BlockDriverCompletionFunc *cb, void *opaque)
4435 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4437 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4438 BDRV_REQ_ZERO_WRITE | flags,
4439 cb, opaque, true);
4443 typedef struct MultiwriteCB {
4444 int error;
4445 int num_requests;
4446 int num_callbacks;
4447 struct {
4448 BlockDriverCompletionFunc *cb;
4449 void *opaque;
4450 QEMUIOVector *free_qiov;
4451 } callbacks[];
4452 } MultiwriteCB;
4454 static void multiwrite_user_cb(MultiwriteCB *mcb)
4456 int i;
4458 for (i = 0; i < mcb->num_callbacks; i++) {
4459 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4460 if (mcb->callbacks[i].free_qiov) {
4461 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4463 g_free(mcb->callbacks[i].free_qiov);
4467 static void multiwrite_cb(void *opaque, int ret)
4469 MultiwriteCB *mcb = opaque;
4471 trace_multiwrite_cb(mcb, ret);
4473 if (ret < 0 && !mcb->error) {
4474 mcb->error = ret;
4477 mcb->num_requests--;
4478 if (mcb->num_requests == 0) {
4479 multiwrite_user_cb(mcb);
4480 g_free(mcb);
4484 static int multiwrite_req_compare(const void *a, const void *b)
4486 const BlockRequest *req1 = a, *req2 = b;
4489 * Note that we can't simply subtract req2->sector from req1->sector
4490 * here as that could overflow the return value.
4492 if (req1->sector > req2->sector) {
4493 return 1;
4494 } else if (req1->sector < req2->sector) {
4495 return -1;
4496 } else {
4497 return 0;
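/*
 * [Editor's worked example.] Why not simply "return req1->sector -
 * req2->sector"? The sectors are int64_t while the comparator returns int:
 * for req1->sector = 0x100000000 and req2->sector = 0 the difference
 * truncates to 0, wrongly reporting the requests as equal.
 */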
4502 * Takes a bunch of requests and tries to merge them. Returns the number of
4503 * requests that remain after merging.
4505 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4506 int num_reqs, MultiwriteCB *mcb)
4508 int i, outidx;
4510 // Sort requests by start sector
4511 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4513 // Check if adjacent requests touch the same clusters. If so, combine them,
4514 // filling up gaps with zero sectors.
4515 outidx = 0;
4516 for (i = 1; i < num_reqs; i++) {
4517 int merge = 0;
4518 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4520 // Handle exactly sequential writes and overlapping writes.
4521 if (reqs[i].sector <= oldreq_last) {
4522 merge = 1;
4525 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4526 merge = 0;
4529 if (merge) {
4530 size_t size;
4531 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4532 qemu_iovec_init(qiov,
4533 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4535 // Add the first request to the merged one. If the requests are
4536 // overlapping, drop the last sectors of the first request.
4537 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4538 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4540 // We shouldn't need to add any zeros between the two requests
4543 assert(reqs[i].sector <= oldreq_last);
4543 // Add the second request
4544 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4546 reqs[outidx].nb_sectors = qiov->size >> 9;
4547 reqs[outidx].qiov = qiov;
4549 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4550 } else {
4551 outidx++;
4552 reqs[outidx].sector = reqs[i].sector;
4553 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4554 reqs[outidx].qiov = reqs[i].qiov;
4558 return outidx + 1;
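/*
 * [Editor's worked example.] Let A = {sector 0, nb_sectors 8} and
 * B = {sector 4, nb_sectors 8}. oldreq_last is 8 and B starts at 4 <= 8,
 * so they merge: the first concat copies (4 - 0) << 9 = 2048 bytes of A
 * (dropping its overlapping tail), the second appends all 4096 bytes of B,
 * and the merged request covers (2048 + 4096) >> 9 = 12 sectors starting
 * at sector 0.
 */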
4562 * Submit multiple AIO write requests at once.
4564 * On success, the function returns 0 and all requests in the reqs array have
4565 * been submitted. In the error case, this function returns -1 and any of the
4566 * requests may or may not have been submitted yet. In particular, this means
4567 * that the callback will be invoked for some of the requests but not for
4568 * others. The caller must check the error field of each BlockRequest to know
4569 * which callbacks to wait for (if error != 0, no callback will be invoked).
4571 * The implementation may modify the contents of the reqs array, e.g. to merge
4572 * requests. However, the fields opaque and error are left unmodified as they
4573 * are used to signal failure for a single request to the caller.
4575 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4577 MultiwriteCB *mcb;
4578 int i;
4580 /* don't submit writes if we don't have a medium */
4581 if (bs->drv == NULL) {
4582 for (i = 0; i < num_reqs; i++) {
4583 reqs[i].error = -ENOMEDIUM;
4585 return -1;
4588 if (num_reqs == 0) {
4589 return 0;
4592 // Create MultiwriteCB structure
4593 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4594 mcb->num_requests = 0;
4595 mcb->num_callbacks = num_reqs;
4597 for (i = 0; i < num_reqs; i++) {
4598 mcb->callbacks[i].cb = reqs[i].cb;
4599 mcb->callbacks[i].opaque = reqs[i].opaque;
4602 // Check for mergeable requests
4603 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4605 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4607 /* Run the aio requests. */
4608 mcb->num_requests = num_reqs;
4609 for (i = 0; i < num_reqs; i++) {
4610 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4611 reqs[i].nb_sectors, reqs[i].flags,
4612 multiwrite_cb, mcb,
4613 true);
4616 return 0;
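/*
 * [Editor's illustrative sketch.] A device model batching writes; all names
 * are hypothetical. Per the contract above, on -1 the caller must inspect
 * each request's error field: requests with error != 0 will not get a
 * callback.
 */
static int example_submit_batch(BlockDriverState *bs,
                                BlockRequest *reqs, int num_reqs)
{
    int i, ret;

    ret = bdrv_aio_multiwrite(bs, reqs, num_reqs);
    if (ret < 0) {
        for (i = 0; i < num_reqs; i++) {
            if (reqs[i].error) {
                /* no callback will come for this request */
            }
        }
    }
    return ret;
}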
4619 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4621 acb->aiocb_info->cancel(acb);
4624 /**************************************************************/
4625 /* async block device emulation */
4627 typedef struct BlockDriverAIOCBSync {
4628 BlockDriverAIOCB common;
4629 QEMUBH *bh;
4630 int ret;
4631 /* vector translation state */
4632 QEMUIOVector *qiov;
4633 uint8_t *bounce;
4634 int is_write;
4635 } BlockDriverAIOCBSync;
4637 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4639 BlockDriverAIOCBSync *acb =
4640 container_of(blockacb, BlockDriverAIOCBSync, common);
4641 qemu_bh_delete(acb->bh);
4642 acb->bh = NULL;
4643 qemu_aio_release(acb);
4646 static const AIOCBInfo bdrv_em_aiocb_info = {
4647 .aiocb_size = sizeof(BlockDriverAIOCBSync),
4648 .cancel = bdrv_aio_cancel_em,
4651 static void bdrv_aio_bh_cb(void *opaque)
4653 BlockDriverAIOCBSync *acb = opaque;
4655 if (!acb->is_write)
4656 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4657 qemu_vfree(acb->bounce);
4658 acb->common.cb(acb->common.opaque, acb->ret);
4659 qemu_bh_delete(acb->bh);
4660 acb->bh = NULL;
4661 qemu_aio_release(acb);
4664 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4665 int64_t sector_num,
4666 QEMUIOVector *qiov,
4667 int nb_sectors,
4668 BlockDriverCompletionFunc *cb,
4669 void *opaque,
4670 int is_write)
4673 BlockDriverAIOCBSync *acb;
4675 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4676 acb->is_write = is_write;
4677 acb->qiov = qiov;
4678 acb->bounce = qemu_blockalign(bs, qiov->size);
4679 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4681 if (is_write) {
4682 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4683 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4684 } else {
4685 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4688 qemu_bh_schedule(acb->bh);
4690 return &acb->common;
4693 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4694 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4695 BlockDriverCompletionFunc *cb, void *opaque)
4697 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4700 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4701 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4702 BlockDriverCompletionFunc *cb, void *opaque)
4704 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4708 typedef struct BlockDriverAIOCBCoroutine {
4709 BlockDriverAIOCB common;
4710 BlockRequest req;
4711 bool is_write;
4712 bool *done;
4713 QEMUBH* bh;
4714 } BlockDriverAIOCBCoroutine;
4716 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4718 AioContext *aio_context = bdrv_get_aio_context(blockacb->bs);
4719 BlockDriverAIOCBCoroutine *acb =
4720 container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4721 bool done = false;
4723 acb->done = &done;
4724 while (!done) {
4725 aio_poll(aio_context, true);
4729 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4730 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
4731 .cancel = bdrv_aio_co_cancel_em,
4734 static void bdrv_co_em_bh(void *opaque)
4736 BlockDriverAIOCBCoroutine *acb = opaque;
4738 acb->common.cb(acb->common.opaque, acb->req.error);
4740 if (acb->done) {
4741 *acb->done = true;
4744 qemu_bh_delete(acb->bh);
4745 qemu_aio_release(acb);
4748 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4749 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4751 BlockDriverAIOCBCoroutine *acb = opaque;
4752 BlockDriverState *bs = acb->common.bs;
4754 if (!acb->is_write) {
4755 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4756 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4757 } else {
4758 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4759 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4762 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4763 qemu_bh_schedule(acb->bh);
4766 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4767 int64_t sector_num,
4768 QEMUIOVector *qiov,
4769 int nb_sectors,
4770 BdrvRequestFlags flags,
4771 BlockDriverCompletionFunc *cb,
4772 void *opaque,
4773 bool is_write)
4775 Coroutine *co;
4776 BlockDriverAIOCBCoroutine *acb;
4778 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4779 acb->req.sector = sector_num;
4780 acb->req.nb_sectors = nb_sectors;
4781 acb->req.qiov = qiov;
4782 acb->req.flags = flags;
4783 acb->is_write = is_write;
4784 acb->done = NULL;
4786 co = qemu_coroutine_create(bdrv_co_do_rw);
4787 qemu_coroutine_enter(co, acb);
4789 return &acb->common;
4792 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4794 BlockDriverAIOCBCoroutine *acb = opaque;
4795 BlockDriverState *bs = acb->common.bs;
4797 acb->req.error = bdrv_co_flush(bs);
4798 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4799 qemu_bh_schedule(acb->bh);
4802 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4803 BlockDriverCompletionFunc *cb, void *opaque)
4805 trace_bdrv_aio_flush(bs, opaque);
4807 Coroutine *co;
4808 BlockDriverAIOCBCoroutine *acb;
4810 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4811 acb->done = NULL;
4813 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4814 qemu_coroutine_enter(co, acb);
4816 return &acb->common;
4819 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4821 BlockDriverAIOCBCoroutine *acb = opaque;
4822 BlockDriverState *bs = acb->common.bs;
4824 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4825 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4826 qemu_bh_schedule(acb->bh);
4829 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4830 int64_t sector_num, int nb_sectors,
4831 BlockDriverCompletionFunc *cb, void *opaque)
4833 Coroutine *co;
4834 BlockDriverAIOCBCoroutine *acb;
4836 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4838 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4839 acb->req.sector = sector_num;
4840 acb->req.nb_sectors = nb_sectors;
4841 acb->done = NULL;
4842 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4843 qemu_coroutine_enter(co, acb);
4845 return &acb->common;
4848 void bdrv_init(void)
4850 module_call_init(MODULE_INIT_BLOCK);
4853 void bdrv_init_with_whitelist(void)
4855 use_bdrv_whitelist = 1;
4856 bdrv_init();
4859 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4860 BlockDriverCompletionFunc *cb, void *opaque)
4862 BlockDriverAIOCB *acb;
4864 acb = g_slice_alloc(aiocb_info->aiocb_size);
4865 acb->aiocb_info = aiocb_info;
4866 acb->bs = bs;
4867 acb->cb = cb;
4868 acb->opaque = opaque;
4869 return acb;
4872 void qemu_aio_release(void *p)
4874 BlockDriverAIOCB *acb = p;
4875 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4878 /**************************************************************/
4879 /* Coroutine block device emulation */
4881 typedef struct CoroutineIOCompletion {
4882 Coroutine *coroutine;
4883 int ret;
4884 } CoroutineIOCompletion;
4886 static void bdrv_co_io_em_complete(void *opaque, int ret)
4888 CoroutineIOCompletion *co = opaque;
4890 co->ret = ret;
4891 qemu_coroutine_enter(co->coroutine, NULL);
4894 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4895 int nb_sectors, QEMUIOVector *iov,
4896 bool is_write)
4898 CoroutineIOCompletion co = {
4899 .coroutine = qemu_coroutine_self(),
4901 BlockDriverAIOCB *acb;
4903 if (is_write) {
4904 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4905 bdrv_co_io_em_complete, &co);
4906 } else {
4907 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4908 bdrv_co_io_em_complete, &co);
4911 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4912 if (!acb) {
4913 return -EIO;
4915 qemu_coroutine_yield();
4917 return co.ret;
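/*
 * [Editor's note.] bdrv_co_io_em() is the callback-to-coroutine bridge: it
 * submits the driver's AIO request with bdrv_co_io_em_complete() as the
 * completion callback, yields, and is re-entered by that callback once
 * co.ret holds the result. The same CoroutineIOCompletion pattern is reused
 * for flush and discard below.
 */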
4920 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4921 int64_t sector_num, int nb_sectors,
4922 QEMUIOVector *iov)
4924 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4927 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4928 int64_t sector_num, int nb_sectors,
4929 QEMUIOVector *iov)
4931 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4934 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4936 RwCo *rwco = opaque;
4938 rwco->ret = bdrv_co_flush(rwco->bs);
4941 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4943 int ret;
4945 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4946 return 0;
4949 /* Write back cached data to the OS even with cache=unsafe */
4950 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4951 if (bs->drv->bdrv_co_flush_to_os) {
4952 ret = bs->drv->bdrv_co_flush_to_os(bs);
4953 if (ret < 0) {
4954 return ret;
4958 /* But don't actually force it to the disk with cache=unsafe */
4959 if (bs->open_flags & BDRV_O_NO_FLUSH) {
4960 goto flush_parent;
4963 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4964 if (bs->drv->bdrv_co_flush_to_disk) {
4965 ret = bs->drv->bdrv_co_flush_to_disk(bs);
4966 } else if (bs->drv->bdrv_aio_flush) {
4967 BlockDriverAIOCB *acb;
4968 CoroutineIOCompletion co = {
4969 .coroutine = qemu_coroutine_self(),
4972 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4973 if (acb == NULL) {
4974 ret = -EIO;
4975 } else {
4976 qemu_coroutine_yield();
4977 ret = co.ret;
4979 } else {
4981 * Some block drivers always operate in either writethrough or unsafe
4982 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4983 * know how the server works (because the behaviour is hardcoded or
4984 * depends on server-side configuration), so we can't ensure that
4985 * everything is safe on disk. Returning an error doesn't work because
4986 * that would break guests even if the server operates in writethrough
4987 * mode.
4989 * Let's hope the user knows what they're doing.
4991 ret = 0;
4993 if (ret < 0) {
4994 return ret;
4997 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4998 * in the case of cache=unsafe, so there are no useless flushes.
5000 flush_parent:
5001 return bdrv_co_flush(bs->file);
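/*
 * [Editor's note.] The resulting flush order is: flush_to_os (always, even
 * for cache=unsafe) -> flush_to_disk or the AIO fallback (skipped when
 * BDRV_O_NO_FLUSH is set) -> a recursive flush of bs->file, which applies
 * the same policy at the protocol level.
 */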
5004 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
5006 Error *local_err = NULL;
5007 int ret;
5009 if (!bs->drv) {
5010 return;
5013 if (bs->drv->bdrv_invalidate_cache) {
5014 bs->drv->bdrv_invalidate_cache(bs, &local_err);
5015 } else if (bs->file) {
5016 bdrv_invalidate_cache(bs->file, &local_err);
5018 if (local_err) {
5019 error_propagate(errp, local_err);
5020 return;
5023 ret = refresh_total_sectors(bs, bs->total_sectors);
5024 if (ret < 0) {
5025 error_setg_errno(errp, -ret, "Could not refresh total sector count");
5026 return;
5030 void bdrv_invalidate_cache_all(Error **errp)
5032 BlockDriverState *bs;
5033 Error *local_err = NULL;
5035 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5036 AioContext *aio_context = bdrv_get_aio_context(bs);
5038 aio_context_acquire(aio_context);
5039 bdrv_invalidate_cache(bs, &local_err);
5040 aio_context_release(aio_context);
5041 if (local_err) {
5042 error_propagate(errp, local_err);
5043 return;
5048 void bdrv_clear_incoming_migration_all(void)
5050 BlockDriverState *bs;
5052 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5053 AioContext *aio_context = bdrv_get_aio_context(bs);
5055 aio_context_acquire(aio_context);
5056 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
5057 aio_context_release(aio_context);
5061 int bdrv_flush(BlockDriverState *bs)
5063 Coroutine *co;
5064 RwCo rwco = {
5065 .bs = bs,
5066 .ret = NOT_DONE,
5069 if (qemu_in_coroutine()) {
5070 /* Fast-path if already in coroutine context */
5071 bdrv_flush_co_entry(&rwco);
5072 } else {
5073 AioContext *aio_context = bdrv_get_aio_context(bs);
5075 co = qemu_coroutine_create(bdrv_flush_co_entry);
5076 qemu_coroutine_enter(co, &rwco);
5077 while (rwco.ret == NOT_DONE) {
5078 aio_poll(aio_context, true);
5082 return rwco.ret;
5085 typedef struct DiscardCo {
5086 BlockDriverState *bs;
5087 int64_t sector_num;
5088 int nb_sectors;
5089 int ret;
5090 } DiscardCo;
5091 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5093 DiscardCo *rwco = opaque;
5095 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5098 /* If no limit is specified in the BlockLimits, use a default
5099 * of 32768 512-byte sectors (16 MiB) per request.
5101 #define MAX_DISCARD_DEFAULT 32768
5103 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5104 int nb_sectors)
5106 int max_discard;
5108 if (!bs->drv) {
5109 return -ENOMEDIUM;
5110 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5111 return -EIO;
5112 } else if (bs->read_only) {
5113 return -EROFS;
5116 bdrv_reset_dirty(bs, sector_num, nb_sectors);
5118 /* Do nothing if disabled. */
5119 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5120 return 0;
5123 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5124 return 0;
5127 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5128 while (nb_sectors > 0) {
5129 int ret;
5130 int num = nb_sectors;
5132 /* align request */
5133 if (bs->bl.discard_alignment &&
5134 num >= bs->bl.discard_alignment &&
5135 sector_num % bs->bl.discard_alignment) {
5136 if (num > bs->bl.discard_alignment) {
5137 num = bs->bl.discard_alignment;
5139 num -= sector_num % bs->bl.discard_alignment;
5142 /* limit request size */
5143 if (num > max_discard) {
5144 num = max_discard;
5147 if (bs->drv->bdrv_co_discard) {
5148 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5149 } else {
5150 BlockDriverAIOCB *acb;
5151 CoroutineIOCompletion co = {
5152 .coroutine = qemu_coroutine_self(),
5155 acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5156 bdrv_co_io_em_complete, &co);
5157 if (acb == NULL) {
5158 return -EIO;
5159 } else {
5160 qemu_coroutine_yield();
5161 ret = co.ret;
5164 if (ret && ret != -ENOTSUP) {
5165 return ret;
5168 sector_num += num;
5169 nb_sectors -= num;
5171 return 0;
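/*
 * [Editor's worked example.] With bs->bl.discard_alignment = 8 and a request
 * at sector_num = 5 for nb_sectors = 20: num is first clamped to the
 * alignment (8) and then reduced by 5 % 8, so the first chunk is 3 sectors
 * (5..7); subsequent iterations start aligned at sector 8.
 */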
5174 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5176 Coroutine *co;
5177 DiscardCo rwco = {
5178 .bs = bs,
5179 .sector_num = sector_num,
5180 .nb_sectors = nb_sectors,
5181 .ret = NOT_DONE,
5184 if (qemu_in_coroutine()) {
5185 /* Fast-path if already in coroutine context */
5186 bdrv_discard_co_entry(&rwco);
5187 } else {
5188 AioContext *aio_context = bdrv_get_aio_context(bs);
5190 co = qemu_coroutine_create(bdrv_discard_co_entry);
5191 qemu_coroutine_enter(co, &rwco);
5192 while (rwco.ret == NOT_DONE) {
5193 aio_poll(aio_context, true);
5197 return rwco.ret;
5200 /**************************************************************/
5201 /* removable device support */
5204 * Return TRUE if the media is present
5206 int bdrv_is_inserted(BlockDriverState *bs)
5208 BlockDriver *drv = bs->drv;
5210 if (!drv)
5211 return 0;
5212 if (!drv->bdrv_is_inserted)
5213 return 1;
5214 return drv->bdrv_is_inserted(bs);
5218 * Return whether the media changed since the last call to this
5219 * function, or -ENOTSUP if we don't know. Most drivers don't know.
5221 int bdrv_media_changed(BlockDriverState *bs)
5223 BlockDriver *drv = bs->drv;
5225 if (drv && drv->bdrv_media_changed) {
5226 return drv->bdrv_media_changed(bs);
5228 return -ENOTSUP;
5232 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5234 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5236 BlockDriver *drv = bs->drv;
5238 if (drv && drv->bdrv_eject) {
5239 drv->bdrv_eject(bs, eject_flag);
5242 if (bs->device_name[0] != '\0') {
5243 bdrv_emit_qmp_eject_event(bs, eject_flag);
5248 * Lock or unlock the media (if it is locked, the user won't be able
5249 * to eject it manually).
5251 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5253 BlockDriver *drv = bs->drv;
5255 trace_bdrv_lock_medium(bs, locked);
5257 if (drv && drv->bdrv_lock_medium) {
5258 drv->bdrv_lock_medium(bs, locked);
5262 /* needed for generic scsi interface */
5264 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5266 BlockDriver *drv = bs->drv;
5268 if (drv && drv->bdrv_ioctl)
5269 return drv->bdrv_ioctl(bs, req, buf);
5270 return -ENOTSUP;
5273 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5274 unsigned long int req, void *buf,
5275 BlockDriverCompletionFunc *cb, void *opaque)
5277 BlockDriver *drv = bs->drv;
5279 if (drv && drv->bdrv_aio_ioctl)
5280 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5281 return NULL;
5284 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5286 bs->guest_block_size = align;
5289 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5291 return qemu_memalign(bdrv_opt_mem_align(bs), size);
5295 * Check whether all memory in this vector is aligned to the driver's optimal memory alignment.
5297 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5299 int i;
5300 size_t alignment = bdrv_opt_mem_align(bs);
5302 for (i = 0; i < qiov->niov; i++) {
5303 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5304 return false;
5306 if (qiov->iov[i].iov_len % alignment) {
5307 return false;
5311 return true;
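/*
 * [Editor's illustrative sketch.] A bounce buffer allocated with
 * qemu_blockalign() passes the check above by construction, provided its
 * length is also rounded up to the alignment; the helper is hypothetical.
 */
static bool example_bounce_is_aligned(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);
    size_t len = (size + align - 1) & ~(align - 1); /* round up */
    void *buf = qemu_blockalign(bs, len);
    struct iovec iov = { .iov_base = buf, .iov_len = len };
    QEMUIOVector qiov;
    bool ok;

    qemu_iovec_init_external(&qiov, &iov, 1);
    ok = bdrv_qiov_is_aligned(bs, &qiov);
    qemu_vfree(buf);
    return ok;
}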
5314 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5315 Error **errp)
5317 int64_t bitmap_size;
5318 BdrvDirtyBitmap *bitmap;
5320 assert((granularity & (granularity - 1)) == 0);
5322 granularity >>= BDRV_SECTOR_BITS;
5323 assert(granularity);
5324 bitmap_size = bdrv_getlength(bs);
5325 if (bitmap_size < 0) {
5326 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5327 errno = -bitmap_size;
5328 return NULL;
5330 bitmap_size >>= BDRV_SECTOR_BITS;
5331 bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
5332 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5333 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5334 return bitmap;
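/*
 * [Editor's illustrative sketch.] Creating a bitmap with 64 KiB granularity
 * (must be a power of two, >= 512), checking the dirty count, and releasing
 * it again; names are hypothetical.
 */
static void example_dirty_tracking(BlockDriverState *bs, Error **errp)
{
    BdrvDirtyBitmap *bitmap = bdrv_create_dirty_bitmap(bs, 65536, errp);

    if (bitmap) {
        /* writes are recorded via bdrv_set_dirty() in the write path */
        int64_t count = bdrv_get_dirty_count(bs, bitmap);
        (void)count;
        bdrv_release_dirty_bitmap(bs, bitmap);
    }
}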
5337 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5339 BdrvDirtyBitmap *bm, *next;
5340 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5341 if (bm == bitmap) {
5342 QLIST_REMOVE(bitmap, list);
5343 hbitmap_free(bitmap->bitmap);
5344 g_free(bitmap);
5345 return;
5350 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5352 BdrvDirtyBitmap *bm;
5353 BlockDirtyInfoList *list = NULL;
5354 BlockDirtyInfoList **plist = &list;
5356 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5357 BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5358 BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5359 info->count = bdrv_get_dirty_count(bs, bm);
5360 info->granularity =
5361 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5362 entry->value = info;
5363 *plist = entry;
5364 plist = &entry->next;
5367 return list;
5370 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5372 if (bitmap) {
5373 return hbitmap_get(bitmap->bitmap, sector);
5374 } else {
5375 return 0;
5379 void bdrv_dirty_iter_init(BlockDriverState *bs,
5380 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5382 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5385 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5386 int nr_sectors)
5388 BdrvDirtyBitmap *bitmap;
5389 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5390 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5394 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5396 BdrvDirtyBitmap *bitmap;
5397 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5398 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5402 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5404 return hbitmap_count(bitmap->bitmap);
5407 /* Get a reference to bs */
5408 void bdrv_ref(BlockDriverState *bs)
5410 bs->refcnt++;
5413 /* Release a previously grabbed reference to bs.
5414 * If, after releasing, the reference count drops to zero, the
5415 * BlockDriverState is deleted. */
5416 void bdrv_unref(BlockDriverState *bs)
5418 assert(bs->refcnt > 0);
5419 if (--bs->refcnt == 0) {
5420 bdrv_delete(bs);
5424 struct BdrvOpBlocker {
5425 Error *reason;
5426 QLIST_ENTRY(BdrvOpBlocker) list;
5429 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5431 BdrvOpBlocker *blocker;
5432 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5433 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5434 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5435 if (errp) {
5436 error_setg(errp, "Device '%s' is busy: %s",
5437 bs->device_name, error_get_pretty(blocker->reason));
5439 return true;
5441 return false;
5444 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5446 BdrvOpBlocker *blocker;
5447 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5449 blocker = g_malloc0(sizeof(BdrvOpBlocker));
5450 blocker->reason = reason;
5451 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5454 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5456 BdrvOpBlocker *blocker, *next;
5457 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5458 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5459 if (blocker->reason == reason) {
5460 QLIST_REMOVE(blocker, list);
5461 g_free(blocker);
5466 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5468 int i;
5469 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5470 bdrv_op_block(bs, i, reason);
5474 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5476 int i;
5477 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5478 bdrv_op_unblock(bs, i, reason);
5482 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5484 int i;
5486 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5487 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5488 return false;
5491 return true;
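/*
 * [Editor's illustrative sketch.] A block job typically blocks every
 * operation category and then unblocks the ones it tolerates; reason is
 * matched by pointer in bdrv_op_unblock(), so the same Error must be used
 * for both block and unblock.
 */
static void example_guard_device(BlockDriverState *bs, Error *reason)
{
    bdrv_op_block_all(bs, reason);
    bdrv_op_unblock(bs, BLOCK_OP_TYPE_RESIZE, reason);
    /* ... job runs ... */
    bdrv_op_unblock_all(bs, reason);
}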
5494 void bdrv_iostatus_enable(BlockDriverState *bs)
5496 bs->iostatus_enabled = true;
5497 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5500 /* The I/O status is only enabled if the drive explicitly
5501 * enables it _and_ the VM is configured to stop on errors */
5502 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5504 return (bs->iostatus_enabled &&
5505 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5506 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5507 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5510 void bdrv_iostatus_disable(BlockDriverState *bs)
5512 bs->iostatus_enabled = false;
5515 void bdrv_iostatus_reset(BlockDriverState *bs)
5517 if (bdrv_iostatus_is_enabled(bs)) {
5518 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5519 if (bs->job) {
5520 block_job_iostatus_reset(bs->job);
5525 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5527 assert(bdrv_iostatus_is_enabled(bs));
5528 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5529 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5530 BLOCK_DEVICE_IO_STATUS_FAILED;
5534 void
5535 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5536 enum BlockAcctType type)
5538 assert(type < BDRV_MAX_IOTYPE);
5540 cookie->bytes = bytes;
5541 cookie->start_time_ns = get_clock();
5542 cookie->type = type;
5545 void
5546 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5548 assert(cookie->type < BDRV_MAX_IOTYPE);
5550 bs->nr_bytes[cookie->type] += cookie->bytes;
5551 bs->nr_ops[cookie->type]++;
5552 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
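/*
 * [Editor's illustrative sketch.] Device emulation brackets each request
 * with the two accounting helpers above; the function name is hypothetical.
 */
static void example_account_read(BlockDriverState *bs,
                                 BlockAcctCookie *cookie, int64_t bytes)
{
    bdrv_acct_start(bs, cookie, bytes, BDRV_ACCT_READ);
    /* ... issue and complete the read ... */
    bdrv_acct_done(bs, cookie);
}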
5555 void bdrv_img_create(const char *filename, const char *fmt,
5556 const char *base_filename, const char *base_fmt,
5557 char *options, uint64_t img_size, int flags,
5558 Error **errp, bool quiet)
5560 QemuOptsList *create_opts = NULL;
5561 QemuOpts *opts = NULL;
5562 const char *backing_fmt, *backing_file;
5563 int64_t size;
5564 BlockDriver *drv, *proto_drv;
5565 BlockDriver *backing_drv = NULL;
5566 Error *local_err = NULL;
5567 int ret = 0;
5569 /* Find driver and parse its options */
5570 drv = bdrv_find_format(fmt);
5571 if (!drv) {
5572 error_setg(errp, "Unknown file format '%s'", fmt);
5573 return;
5576 proto_drv = bdrv_find_protocol(filename, true);
5577 if (!proto_drv) {
5578 error_setg(errp, "Unknown protocol '%s'", filename);
5579 return;
5582 create_opts = qemu_opts_append(create_opts, drv->create_opts,
5583 drv->create_options);
5584 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts,
5585 proto_drv->create_options);
5587 /* Create parameter list with default values */
5588 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5589 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5591 /* Parse -o options */
5592 if (options) {
5593 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5594 error_setg(errp, "Invalid options for file format '%s'", fmt);
5595 goto out;
5599 if (base_filename) {
5600 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5601 error_setg(errp, "Backing file not supported for file format '%s'",
5602 fmt);
5603 goto out;
5607 if (base_fmt) {
5608 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5609 error_setg(errp, "Backing file format not supported for file "
5610 "format '%s'", fmt);
5611 goto out;
5615 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5616 if (backing_file) {
5617 if (!strcmp(filename, backing_file)) {
5618 error_setg(errp, "Error: Trying to create an image with the "
5619 "same filename as the backing file");
5620 goto out;
5624 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5625 if (backing_fmt) {
5626 backing_drv = bdrv_find_format(backing_fmt);
5627 if (!backing_drv) {
5628 error_setg(errp, "Unknown backing file format '%s'",
5629 backing_fmt);
5630 goto out;
5634 // The size for the image must always be specified, with one exception:
5635 // If we are using a backing file, we can obtain the size from there
5636 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5637 if (size == -1) {
5638 if (backing_file) {
5639 BlockDriverState *bs;
5640 uint64_t size;
5641 char buf[32];
5642 int back_flags;
5644 /* backing files are always opened read-only */
5645 back_flags =
5646 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5648 bs = NULL;
5649 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5650 backing_drv, &local_err);
5651 if (ret < 0) {
5652 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5653 backing_file,
5654 error_get_pretty(local_err));
5655 error_free(local_err);
5656 local_err = NULL;
5657 goto out;
5659 bdrv_get_geometry(bs, &size);
5660 size *= 512;
5662 snprintf(buf, sizeof(buf), "%" PRId64, size);
5663 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
5665 bdrv_unref(bs);
5666 } else {
5667 error_setg(errp, "Image creation needs a size parameter");
5668 goto out;
5672 if (!quiet) {
5673 printf("Formatting '%s', fmt=%s ", filename, fmt);
5674 qemu_opts_print(opts);
5675 puts("");
5678 ret = bdrv_create(drv, filename, NULL, opts, &local_err);
5680 if (ret == -EFBIG) {
5681 /* This is generally a better message than whatever the driver would
5682 * deliver (especially because of the cluster_size_hint), since that
5683 * is most probably not much different from "image too large". */
5684 const char *cluster_size_hint = "";
5685 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5686 cluster_size_hint = " (try using a larger cluster size)";
5688 error_setg(errp, "The image size is too large for file format '%s'"
5689 "%s", fmt, cluster_size_hint);
5690 error_free(local_err);
5691 local_err = NULL;
5694 out:
5695 qemu_opts_del(opts);
5696 qemu_opts_free(create_opts);
5697 if (local_err) {
5698 error_propagate(errp, local_err);
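/*
 * [Editor's illustrative sketch.] A qemu-img style creation of a 1 GiB
 * qcow2 overlay on top of a hypothetical backing file; options, flags and
 * backing format are left at their defaults.
 */
static void example_create_overlay(Error **errp)
{
    bdrv_img_create("overlay.qcow2", "qcow2", "base.qcow2", NULL,
                    NULL, 1073741824ULL, 0, errp, true);
}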
5702 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5704 return bs->aio_context;
5707 void bdrv_detach_aio_context(BlockDriverState *bs)
5709 if (!bs->drv) {
5710 return;
5713 if (bs->io_limits_enabled) {
5714 throttle_detach_aio_context(&bs->throttle_state);
5716 if (bs->drv->bdrv_detach_aio_context) {
5717 bs->drv->bdrv_detach_aio_context(bs);
5719 if (bs->file) {
5720 bdrv_detach_aio_context(bs->file);
5722 if (bs->backing_hd) {
5723 bdrv_detach_aio_context(bs->backing_hd);
5726 bs->aio_context = NULL;
5729 void bdrv_attach_aio_context(BlockDriverState *bs,
5730 AioContext *new_context)
5732 if (!bs->drv) {
5733 return;
5736 bs->aio_context = new_context;
5738 if (bs->backing_hd) {
5739 bdrv_attach_aio_context(bs->backing_hd, new_context);
5741 if (bs->file) {
5742 bdrv_attach_aio_context(bs->file, new_context);
5744 if (bs->drv->bdrv_attach_aio_context) {
5745 bs->drv->bdrv_attach_aio_context(bs, new_context);
5747 if (bs->io_limits_enabled) {
5748 throttle_attach_aio_context(&bs->throttle_state, new_context);
5752 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5754 bdrv_drain_all(); /* ensure there are no in-flight requests */
5756 bdrv_detach_aio_context(bs);
5758 /* This function executes in the old AioContext so acquire the new one in
5759 * case it runs in a different thread.
5761 aio_context_acquire(new_context);
5762 bdrv_attach_aio_context(bs, new_context);
5763 aio_context_release(new_context);
5766 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5767 NotifierWithReturn *notifier)
5769 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5772 int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options,
5773 QemuOpts *opts)
5775 int ret;
5776 assert(!(options && opts));
5778 if (!bs->drv->bdrv_amend_options && !bs->drv->bdrv_amend_options2) {
5779 return -ENOTSUP;
5781 if (bs->drv->bdrv_amend_options2) {
5782 QemuOptsList *opts_list = NULL;
5783 if (options) {
5784 opts_list = params_to_opts(options);
5785 opts = qemu_opts_create(opts_list, NULL, 0, &error_abort);
5787 ret = bs->drv->bdrv_amend_options2(bs, opts);
5788 if (options) {
5789 qemu_opts_del(opts);
5790 qemu_opts_free(opts_list);
5792 } else {
5793 if (opts) {
5794 options = opts_to_params(opts);
5796 ret = bs->drv->bdrv_amend_options(bs, options);
5797 if (opts) {
5798 free_option_parameters(options);
5801 return ret;
5804 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5805 * of a block filter and by bdrv_is_first_non_filter.
5806 * It is used to test whether the given bs is the candidate, or to recurse
5807 * further into the node graph.
5809 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5810 BlockDriverState *candidate)
5812 /* return false if basic checks fail */
5813 if (!bs || !bs->drv) {
5814 return false;
5817 /* The code reached a non-filter driver -> check whether bs is the same
5818 * as the candidate. This is the recursion termination condition.
5820 if (!bs->drv->is_filter) {
5821 return bs == candidate;
5823 /* Down this path the driver is a block filter driver */
5825 /* If the block filter recursion method is defined, use it to recurse down
5826 * the node graph.
5828 if (bs->drv->bdrv_recurse_is_first_non_filter) {
5829 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5832 /* The driver is a block filter but does not allow recursion -> return false
5834 return false;
5837 /* This function checks if the candidate is the first non-filter bs down its
5838 * bs chain. Since we don't have pointers to parents, it explores all bs chains
5839 * from the top. Some filters can choose not to pass down the recursion.
5841 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5843 BlockDriverState *bs;
5845 /* walk down the bs forest recursively */
5846 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5847 bool perm;
5849 /* try to recurse in this top level bs */
5850 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5852 /* candidate is the first non filter */
5853 if (perm) {
5854 return true;
5858 return false;