block/raw-posix: Strip "file:" prefix on creation
[qemu/ar7.git] / block.c
1 /*
2 * QEMU System Emulator block driver
4 * Copyright (c) 2003 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor/monitor.h"
28 #include "block/block_int.h"
29 #include "block/blockjob.h"
30 #include "qemu/module.h"
31 #include "qapi/qmp/qjson.h"
32 #include "sysemu/sysemu.h"
33 #include "qemu/notify.h"
34 #include "block/coroutine.h"
35 #include "block/qapi.h"
36 #include "qmp-commands.h"
37 #include "qemu/timer.h"
39 #ifdef CONFIG_BSD
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #include <sys/ioctl.h>
43 #include <sys/queue.h>
44 #ifndef __DragonFly__
45 #include <sys/disk.h>
46 #endif
47 #endif
49 #ifdef _WIN32
50 #include <windows.h>
51 #endif
53 struct BdrvDirtyBitmap {
54 HBitmap *bitmap;
55 QLIST_ENTRY(BdrvDirtyBitmap) list;
58 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
60 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
61 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
63 BlockDriverCompletionFunc *cb, void *opaque);
64 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
66 BlockDriverCompletionFunc *cb, void *opaque);
67 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68 int64_t sector_num, int nb_sectors,
69 QEMUIOVector *iov);
70 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
73 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
75 BdrvRequestFlags flags);
76 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
78 BdrvRequestFlags flags);
79 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80 int64_t sector_num,
81 QEMUIOVector *qiov,
82 int nb_sectors,
83 BdrvRequestFlags flags,
84 BlockDriverCompletionFunc *cb,
85 void *opaque,
86 bool is_write);
87 static void coroutine_fn bdrv_co_do_rw(void *opaque);
88 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
89 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
91 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
94 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
97 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98 QLIST_HEAD_INITIALIZER(bdrv_drivers);
100 /* If non-zero, use only whitelisted block drivers */
101 static int use_bdrv_whitelist;
103 #ifdef _WIN32
104 static int is_windows_drive_prefix(const char *filename)
106 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
107 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
108 filename[1] == ':');
111 int is_windows_drive(const char *filename)
113 if (is_windows_drive_prefix(filename) &&
114 filename[2] == '\0')
115 return 1;
116 if (strstart(filename, "\\\\.\\", NULL) ||
117 strstart(filename, "//./", NULL))
118 return 1;
119 return 0;
121 #endif
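/*
 * Editor's note -- a few illustrative inputs for the two helpers above,
 * derived from the checks they perform (sketch, not part of the original file):
 *
 *     is_windows_drive_prefix("c:\\img.qcow2")   -> true  ("c:" prefix)
 *     is_windows_drive("c:")                     -> 1     (bare drive letter)
 *     is_windows_drive("c:\\img.qcow2")          -> 0     (path follows the drive)
 *     is_windows_drive("\\\\.\\PhysicalDrive0")  -> 1     (device namespace)
 *     is_windows_drive("//./PhysicalDrive0")     -> 1     (forward-slash form)
 */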
123 /* throttling disk I/O limits */
124 void bdrv_set_io_limits(BlockDriverState *bs,
125 ThrottleConfig *cfg)
127 int i;
129 throttle_config(&bs->throttle_state, cfg);
131 for (i = 0; i < 2; i++) {
132 qemu_co_enter_next(&bs->throttled_reqs[i]);
136 /* this function drains all the throttled I/Os */
137 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
139 bool drained = false;
140 bool enabled = bs->io_limits_enabled;
141 int i;
143 bs->io_limits_enabled = false;
145 for (i = 0; i < 2; i++) {
146 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147 drained = true;
151 bs->io_limits_enabled = enabled;
153 return drained;
156 void bdrv_io_limits_disable(BlockDriverState *bs)
158 bs->io_limits_enabled = false;
160 bdrv_start_throttled_reqs(bs);
162 throttle_destroy(&bs->throttle_state);
165 static void bdrv_throttle_read_timer_cb(void *opaque)
167 BlockDriverState *bs = opaque;
168 qemu_co_enter_next(&bs->throttled_reqs[0]);
171 static void bdrv_throttle_write_timer_cb(void *opaque)
173 BlockDriverState *bs = opaque;
174 qemu_co_enter_next(&bs->throttled_reqs[1]);
177 /* should be called before bdrv_set_io_limits if a limit is set */
178 void bdrv_io_limits_enable(BlockDriverState *bs)
180 assert(!bs->io_limits_enabled);
181 throttle_init(&bs->throttle_state,
182 QEMU_CLOCK_VIRTUAL,
183 bdrv_throttle_read_timer_cb,
184 bdrv_throttle_write_timer_cb,
185 bs);
186 bs->io_limits_enabled = true;
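/*
 * Illustrative usage sketch (editor's addition, not from the original file;
 * the ThrottleConfig fields are assumed to be filled in elsewhere):
 *
 *     ThrottleConfig cfg;
 *     // ... populate cfg with the desired bps/iops limits ...
 *     bdrv_io_limits_enable(bs);    // per the comment above: enable first
 *     bdrv_set_io_limits(bs, &cfg); // then apply the configuration
 */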
189 /* This function makes an I/O request wait if needed
191 * @bytes: the number of bytes of the I/O
192 * @is_write: is the I/O a write
194 static void bdrv_io_limits_intercept(BlockDriverState *bs,
195 unsigned int bytes,
196 bool is_write)
198 /* must this I/O wait? */
199 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
201 /* if it must wait, or any request of this type is already throttled, queue the I/O */
202 if (must_wait ||
203 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
204 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
207 /* the IO will be executed, do the accounting */
208 throttle_account(&bs->throttle_state, is_write, bytes);
211 /* if the next request must wait -> do nothing */
212 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
213 return;
216 /* else queue next request for execution */
217 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
220 size_t bdrv_opt_mem_align(BlockDriverState *bs)
222 if (!bs || !bs->drv) {
223 /* 4k should be on the safe side */
224 return 4096;
227 return bs->bl.opt_mem_alignment;
230 /* check if the path starts with "<protocol>:" */
231 static int path_has_protocol(const char *path)
233 const char *p;
235 #ifdef _WIN32
236 if (is_windows_drive(path) ||
237 is_windows_drive_prefix(path)) {
238 return 0;
240 p = path + strcspn(path, ":/\\");
241 #else
242 p = path + strcspn(path, ":/");
243 #endif
245 return *p == ':';
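/*
 * Editor's sketch of path_has_protocol() behaviour, following the strcspn()
 * scan above (not part of the original file):
 *
 *     path_has_protocol("nbd://localhost:10809")  -> 1  (':' before any '/')
 *     path_has_protocol("/dev/fd0")               -> 0  ('/' comes first)
 *     path_has_protocol("c:\\img.qcow2")          -> 0  on Win32 (drive prefix)
 */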
248 int path_is_absolute(const char *path)
250 #ifdef _WIN32
251 /* specific case for names like: "\\.\d:" */
252 if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
253 return 1;
255 return (*path == '/' || *path == '\\');
256 #else
257 return (*path == '/');
258 #endif
261 /* if filename is absolute, just copy it to dest. Otherwise, build a
262 path to it, treating it as relative to base_path. URLs are
263 supported. */
264 void path_combine(char *dest, int dest_size,
265 const char *base_path,
266 const char *filename)
268 const char *p, *p1;
269 int len;
271 if (dest_size <= 0)
272 return;
273 if (path_is_absolute(filename)) {
274 pstrcpy(dest, dest_size, filename);
275 } else {
276 p = strchr(base_path, ':');
277 if (p)
278 p++;
279 else
280 p = base_path;
281 p1 = strrchr(base_path, '/');
282 #ifdef _WIN32
284 const char *p2;
285 p2 = strrchr(base_path, '\\');
286 if (!p1 || p2 > p1)
287 p1 = p2;
289 #endif
290 if (p1)
291 p1++;
292 else
293 p1 = base_path;
294 if (p1 > p)
295 p = p1;
296 len = p - base_path;
297 if (len > dest_size - 1)
298 len = dest_size - 1;
299 memcpy(dest, base_path, len);
300 dest[len] = '\0';
301 pstrcat(dest, dest_size, filename);
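/*
 * Editor's sketch of path_combine() results, following the protocol and
 * separator handling above (hypothetical paths, not from the original file):
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest), "/img/base.qcow2", "snap.qcow2");
 *     // dest == "/img/snap.qcow2"
 *     path_combine(dest, sizeof(dest), "http://host/dir/base.img", "other.img");
 *     // dest == "http://host/dir/other.img"  (URLs are supported)
 *     path_combine(dest, sizeof(dest), "/img/base.qcow2", "/abs/other.img");
 *     // dest == "/abs/other.img"  (absolute filenames are copied verbatim)
 */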
305 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
307 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
308 pstrcpy(dest, sz, bs->backing_file);
309 } else {
310 path_combine(dest, sz, bs->filename, bs->backing_file);
314 void bdrv_register(BlockDriver *bdrv)
316 /* Block drivers without coroutine functions need emulation */
317 if (!bdrv->bdrv_co_readv) {
318 bdrv->bdrv_co_readv = bdrv_co_readv_em;
319 bdrv->bdrv_co_writev = bdrv_co_writev_em;
321 /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
322 * the block driver lacks aio we need to emulate that too.
324 if (!bdrv->bdrv_aio_readv) {
325 /* add AIO emulation layer */
326 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
327 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
331 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
334 /* create a new block device (by default it is empty) */
335 BlockDriverState *bdrv_new(const char *device_name)
337 BlockDriverState *bs;
339 bs = g_malloc0(sizeof(BlockDriverState));
340 QLIST_INIT(&bs->dirty_bitmaps);
341 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
342 if (device_name[0] != '\0') {
343 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
345 bdrv_iostatus_disable(bs);
346 notifier_list_init(&bs->close_notifiers);
347 notifier_with_return_list_init(&bs->before_write_notifiers);
348 qemu_co_queue_init(&bs->throttled_reqs[0]);
349 qemu_co_queue_init(&bs->throttled_reqs[1]);
350 bs->refcnt = 1;
352 return bs;
355 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
357 notifier_list_add(&bs->close_notifiers, notify);
360 BlockDriver *bdrv_find_format(const char *format_name)
362 BlockDriver *drv1;
363 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
364 if (!strcmp(drv1->format_name, format_name)) {
365 return drv1;
368 return NULL;
371 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
373 static const char *whitelist_rw[] = {
374 CONFIG_BDRV_RW_WHITELIST
376 static const char *whitelist_ro[] = {
377 CONFIG_BDRV_RO_WHITELIST
379 const char **p;
381 if (!whitelist_rw[0] && !whitelist_ro[0]) {
382 return 1; /* no whitelist, anything goes */
385 for (p = whitelist_rw; *p; p++) {
386 if (!strcmp(drv->format_name, *p)) {
387 return 1;
390 if (read_only) {
391 for (p = whitelist_ro; *p; p++) {
392 if (!strcmp(drv->format_name, *p)) {
393 return 1;
397 return 0;
400 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
401 bool read_only)
403 BlockDriver *drv = bdrv_find_format(format_name);
404 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
407 typedef struct CreateCo {
408 BlockDriver *drv;
409 char *filename;
410 QEMUOptionParameter *options;
411 int ret;
412 Error *err;
413 } CreateCo;
415 static void coroutine_fn bdrv_create_co_entry(void *opaque)
417 Error *local_err = NULL;
418 int ret;
420 CreateCo *cco = opaque;
421 assert(cco->drv);
423 ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
424 if (local_err) {
425 error_propagate(&cco->err, local_err);
427 cco->ret = ret;
430 int bdrv_create(BlockDriver *drv, const char* filename,
431 QEMUOptionParameter *options, Error **errp)
433 int ret;
435 Coroutine *co;
436 CreateCo cco = {
437 .drv = drv,
438 .filename = g_strdup(filename),
439 .options = options,
440 .ret = NOT_DONE,
441 .err = NULL,
444 if (!drv->bdrv_create) {
445 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
446 ret = -ENOTSUP;
447 goto out;
450 if (qemu_in_coroutine()) {
451 /* Fast-path if already in coroutine context */
452 bdrv_create_co_entry(&cco);
453 } else {
454 co = qemu_coroutine_create(bdrv_create_co_entry);
455 qemu_coroutine_enter(co, &cco);
456 while (cco.ret == NOT_DONE) {
457 qemu_aio_wait();
461 ret = cco.ret;
462 if (ret < 0) {
463 if (cco.err) {
464 error_propagate(errp, cco.err);
465 } else {
466 error_setg_errno(errp, -ret, "Could not create image");
470 out:
471 g_free(cco.filename);
472 return ret;
475 int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
476 Error **errp)
478 BlockDriver *drv;
479 Error *local_err = NULL;
480 int ret;
482 drv = bdrv_find_protocol(filename, true);
483 if (drv == NULL) {
484 error_setg(errp, "Could not find protocol for file '%s'", filename);
485 return -ENOENT;
488 ret = bdrv_create(drv, filename, options, &local_err);
489 if (local_err) {
490 error_propagate(errp, local_err);
492 return ret;
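/*
 * Illustrative sketch of driving bdrv_create() by hand, mirroring the option
 * handling used for the snapshot overlay later in this file (the image path
 * and size are hypothetical; error handling is elided):
 *
 *     Error *err = NULL;
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *opts =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 64 * 1024 * 1024);
 *     int ret = bdrv_create(drv, "/tmp/example.qcow2", opts, &err);
 *     free_option_parameters(opts);
 */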
495 int bdrv_refresh_limits(BlockDriverState *bs)
497 BlockDriver *drv = bs->drv;
499 memset(&bs->bl, 0, sizeof(bs->bl));
501 if (!drv) {
502 return 0;
505 /* Take some limits from the children as a default */
506 if (bs->file) {
507 bdrv_refresh_limits(bs->file);
508 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
509 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
510 } else {
511 bs->bl.opt_mem_alignment = 512;
514 if (bs->backing_hd) {
515 bdrv_refresh_limits(bs->backing_hd);
516 bs->bl.opt_transfer_length =
517 MAX(bs->bl.opt_transfer_length,
518 bs->backing_hd->bl.opt_transfer_length);
519 bs->bl.opt_mem_alignment =
520 MAX(bs->bl.opt_mem_alignment,
521 bs->backing_hd->bl.opt_mem_alignment);
524 /* Then let the driver override it */
525 if (drv->bdrv_refresh_limits) {
526 return drv->bdrv_refresh_limits(bs);
529 return 0;
533 * Create a uniquely-named empty temporary file.
534 * Return 0 upon success, otherwise a negative errno value.
536 int get_tmp_filename(char *filename, int size)
538 #ifdef _WIN32
539 char temp_dir[MAX_PATH];
540 /* GetTempFileName requires that its output buffer (4th param)
541 have length MAX_PATH or greater. */
542 assert(size >= MAX_PATH);
543 return (GetTempPath(MAX_PATH, temp_dir)
544 && GetTempFileName(temp_dir, "qem", 0, filename)
545 ? 0 : -GetLastError());
546 #else
547 int fd;
548 const char *tmpdir;
549 tmpdir = getenv("TMPDIR");
550 if (!tmpdir) {
551 tmpdir = "/var/tmp";
553 if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
554 return -EOVERFLOW;
556 fd = mkstemp(filename);
557 if (fd < 0) {
558 return -errno;
560 if (close(fd) != 0) {
561 unlink(filename);
562 return -errno;
564 return 0;
565 #endif
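/*
 * Editor's usage sketch (not part of the original file): the caller owns the
 * created file and must unlink() it when done, as the snapshot code below does:
 *
 *     char tmp_filename[PATH_MAX + 1];
 *     int ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
 *     if (ret < 0) {
 *         // no file was created; ret is a negative errno value
 *     }
 */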
569 * Detect host devices. By convention, /dev/cdrom[N] is always
570 * recognized as a host CDROM.
572 static BlockDriver *find_hdev_driver(const char *filename)
574 int score_max = 0, score;
575 BlockDriver *drv = NULL, *d;
577 QLIST_FOREACH(d, &bdrv_drivers, list) {
578 if (d->bdrv_probe_device) {
579 score = d->bdrv_probe_device(filename);
580 if (score > score_max) {
581 score_max = score;
582 drv = d;
587 return drv;
590 BlockDriver *bdrv_find_protocol(const char *filename,
591 bool allow_protocol_prefix)
593 BlockDriver *drv1;
594 char protocol[128];
595 int len;
596 const char *p;
598 /* TODO Drivers without bdrv_file_open must be specified explicitly */
601 * XXX(hch): we really should not let host device detection
602 * override an explicit protocol specification, but moving this
603 * later breaks access to device names with colons in them.
604 * Thanks to the brain-dead persistent naming schemes on udev-
605 * based Linux systems those actually are quite common.
607 drv1 = find_hdev_driver(filename);
608 if (drv1) {
609 return drv1;
612 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
613 return bdrv_find_format("file");
616 p = strchr(filename, ':');
617 assert(p != NULL);
618 len = p - filename;
619 if (len > sizeof(protocol) - 1)
620 len = sizeof(protocol) - 1;
621 memcpy(protocol, filename, len);
622 protocol[len] = '\0';
623 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
624 if (drv1->protocol_name &&
625 !strcmp(drv1->protocol_name, protocol)) {
626 return drv1;
629 return NULL;
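/*
 * Editor's sketch of bdrv_find_protocol() outcomes, assuming the usual
 * protocol drivers are registered (not part of the original file):
 *
 *     bdrv_find_protocol("nbd://localhost", true)   -> the "nbd" driver
 *     bdrv_find_protocol("/path/disk.img", true)    -> the "file" driver
 *     bdrv_find_protocol("nbd://localhost", false)  -> the "file" driver
 *                        (the protocol prefix is ignored when not allowed)
 *
 * Host device probing runs first, so e.g. "/dev/cdrom" may resolve to a
 * host device driver instead, as the XXX comment above explains.
 */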
632 static int find_image_format(BlockDriverState *bs, const char *filename,
633 BlockDriver **pdrv, Error **errp)
635 int score, score_max;
636 BlockDriver *drv1, *drv;
637 uint8_t buf[2048];
638 int ret = 0;
640 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
641 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
642 drv = bdrv_find_format("raw");
643 if (!drv) {
644 error_setg(errp, "Could not find raw image format");
645 ret = -ENOENT;
647 *pdrv = drv;
648 return ret;
651 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
652 if (ret < 0) {
653 error_setg_errno(errp, -ret, "Could not read image for determining its "
654 "format");
655 *pdrv = NULL;
656 return ret;
659 score_max = 0;
660 drv = NULL;
661 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
662 if (drv1->bdrv_probe) {
663 score = drv1->bdrv_probe(buf, ret, filename);
664 if (score > score_max) {
665 score_max = score;
666 drv = drv1;
670 if (!drv) {
671 error_setg(errp, "Could not determine image format: No compatible "
672 "driver found");
673 ret = -ENOENT;
675 *pdrv = drv;
676 return ret;
680 * Set the current 'total_sectors' value
682 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
684 BlockDriver *drv = bs->drv;
686 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
687 if (bs->sg)
688 return 0;
690 /* query actual device if possible, otherwise just trust the hint */
691 if (drv->bdrv_getlength) {
692 int64_t length = drv->bdrv_getlength(bs);
693 if (length < 0) {
694 return length;
696 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
699 bs->total_sectors = hint;
700 return 0;
704 * Set open flags for a given discard mode
706 * Return 0 on success, -1 if the discard mode was invalid.
708 int bdrv_parse_discard_flags(const char *mode, int *flags)
710 *flags &= ~BDRV_O_UNMAP;
712 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
713 /* do nothing */
714 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
715 *flags |= BDRV_O_UNMAP;
716 } else {
717 return -1;
720 return 0;
724 * Set open flags for a given cache mode
726 * Return 0 on success, -1 if the cache mode was invalid.
728 int bdrv_parse_cache_flags(const char *mode, int *flags)
730 *flags &= ~BDRV_O_CACHE_MASK;
732 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
733 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
734 } else if (!strcmp(mode, "directsync")) {
735 *flags |= BDRV_O_NOCACHE;
736 } else if (!strcmp(mode, "writeback")) {
737 *flags |= BDRV_O_CACHE_WB;
738 } else if (!strcmp(mode, "unsafe")) {
739 *flags |= BDRV_O_CACHE_WB;
740 *flags |= BDRV_O_NO_FLUSH;
741 } else if (!strcmp(mode, "writethrough")) {
742 /* this is the default */
743 } else {
744 return -1;
747 return 0;
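/*
 * Illustrative sketch combining the two flag parsers above (editor's
 * addition, not from the original file):
 *
 *     int flags = 0;
 *     if (bdrv_parse_cache_flags("none", &flags) < 0) {
 *         // invalid cache mode
 *     }
 *     // flags now contains BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *     if (bdrv_parse_discard_flags("unmap", &flags) < 0) {
 *         // invalid discard mode
 *     }
 *     // flags now additionally contains BDRV_O_UNMAP
 */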
751 * The copy-on-read flag is actually a reference count so multiple users may
752 * use the feature without worrying about clobbering its previous state.
753 * Copy-on-read stays enabled until all users have called to disable it.
755 void bdrv_enable_copy_on_read(BlockDriverState *bs)
757 bs->copy_on_read++;
760 void bdrv_disable_copy_on_read(BlockDriverState *bs)
762 assert(bs->copy_on_read > 0);
763 bs->copy_on_read--;
766 static int bdrv_open_flags(BlockDriverState *bs, int flags)
768 int open_flags = flags | BDRV_O_CACHE_WB;
771 * Clear flags that are internal to the block layer before opening the
772 * image.
774 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
777 * Snapshots should be writable.
779 if (bs->is_temporary) {
780 open_flags |= BDRV_O_RDWR;
783 return open_flags;
786 static int bdrv_assign_node_name(BlockDriverState *bs,
787 const char *node_name,
788 Error **errp)
790 if (!node_name) {
791 return 0;
794 /* empty string node name is invalid */
795 if (node_name[0] == '\0') {
796 error_setg(errp, "Empty node name");
797 return -EINVAL;
800 /* takes care of avoiding namespace collisions */
801 if (bdrv_find(node_name)) {
802 error_setg(errp, "node-name=%s is conflicting with a device id",
803 node_name);
804 return -EINVAL;
807 /* takes care of avoiding duplicate node names */
808 if (bdrv_find_node(node_name)) {
809 error_setg(errp, "Duplicate node name");
810 return -EINVAL;
813 /* copy node name into the bs and insert it into the graph list */
814 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
815 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
817 return 0;
821 * Common part for opening disk images and files
823 * Removes all processed options from *options.
825 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
826 QDict *options, int flags, BlockDriver *drv, Error **errp)
828 int ret, open_flags;
829 const char *filename;
830 const char *node_name = NULL;
831 Error *local_err = NULL;
833 assert(drv != NULL);
834 assert(bs->file == NULL);
835 assert(options != NULL && bs->options != options);
837 if (file != NULL) {
838 filename = file->filename;
839 } else {
840 filename = qdict_get_try_str(options, "filename");
843 if (drv->bdrv_needs_filename && !filename) {
844 error_setg(errp, "The '%s' block driver requires a file name",
845 drv->format_name);
846 return -EINVAL;
849 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
851 node_name = qdict_get_try_str(options, "node-name");
852 ret = bdrv_assign_node_name(bs, node_name, errp);
853 if (ret < 0) {
854 return ret;
856 qdict_del(options, "node-name");
858 /* bdrv_open() was called directly with a protocol as drv. This layer is already
859 * opened, so assign it to bs (while file becomes a closed BlockDriverState)
860 * and return immediately. */
861 if (file != NULL && drv->bdrv_file_open) {
862 bdrv_swap(file, bs);
863 return 0;
866 bs->open_flags = flags;
867 bs->guest_block_size = 512;
868 bs->request_alignment = 512;
869 bs->zero_beyond_eof = true;
870 open_flags = bdrv_open_flags(bs, flags);
871 bs->read_only = !(open_flags & BDRV_O_RDWR);
873 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
874 error_setg(errp,
875 !bs->read_only && bdrv_is_whitelisted(drv, true)
876 ? "Driver '%s' can only be used for read-only devices"
877 : "Driver '%s' is not whitelisted",
878 drv->format_name);
879 return -ENOTSUP;
882 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
883 if (flags & BDRV_O_COPY_ON_READ) {
884 if (!bs->read_only) {
885 bdrv_enable_copy_on_read(bs);
886 } else {
887 error_setg(errp, "Can't use copy-on-read on read-only device");
888 return -EINVAL;
892 if (filename != NULL) {
893 pstrcpy(bs->filename, sizeof(bs->filename), filename);
894 } else {
895 bs->filename[0] = '\0';
898 bs->drv = drv;
899 bs->opaque = g_malloc0(drv->instance_size);
901 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
903 /* Open the image, either directly or using a protocol */
904 if (drv->bdrv_file_open) {
905 assert(file == NULL);
906 assert(!drv->bdrv_needs_filename || filename != NULL);
907 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
908 } else {
909 if (file == NULL) {
910 error_setg(errp, "Can't use '%s' as a block driver for the "
911 "protocol level", drv->format_name);
912 ret = -EINVAL;
913 goto free_and_fail;
915 bs->file = file;
916 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
919 if (ret < 0) {
920 if (local_err) {
921 error_propagate(errp, local_err);
922 } else if (bs->filename[0]) {
923 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
924 } else {
925 error_setg_errno(errp, -ret, "Could not open image");
927 goto free_and_fail;
930 ret = refresh_total_sectors(bs, bs->total_sectors);
931 if (ret < 0) {
932 error_setg_errno(errp, -ret, "Could not refresh total sector count");
933 goto free_and_fail;
936 bdrv_refresh_limits(bs);
937 assert(bdrv_opt_mem_align(bs) != 0);
938 assert((bs->request_alignment != 0) || bs->sg);
940 #ifndef _WIN32
941 if (bs->is_temporary) {
942 assert(bs->filename[0] != '\0');
943 unlink(bs->filename);
945 #endif
946 return 0;
948 free_and_fail:
949 bs->file = NULL;
950 g_free(bs->opaque);
951 bs->opaque = NULL;
952 bs->drv = NULL;
953 return ret;
957 * Opens a file using a protocol (file, host_device, nbd, ...)
959 * options is an indirect pointer to a QDict of options to pass to the block
960 * drivers, or pointer to NULL for an empty set of options. If this function
961 * takes ownership of the QDict reference, it will set *options to NULL;
962 * otherwise, it will contain unused/unrecognized options after this function
963 * returns. Then, the caller is responsible for freeing it. If it intends to
964 * reuse the QDict, QINCREF() should be called beforehand.
966 static int bdrv_file_open(BlockDriverState *bs, const char *filename,
967 QDict **options, int flags, Error **errp)
969 BlockDriver *drv;
970 const char *drvname;
971 bool allow_protocol_prefix = false;
972 Error *local_err = NULL;
973 int ret;
975 /* Fetch the file name from the options QDict if necessary */
976 if (!filename) {
977 filename = qdict_get_try_str(*options, "filename");
978 } else if (filename && !qdict_haskey(*options, "filename")) {
979 qdict_put(*options, "filename", qstring_from_str(filename));
980 allow_protocol_prefix = true;
981 } else {
982 error_setg(errp, "Can't specify 'file' and 'filename' options at the "
983 "same time");
984 ret = -EINVAL;
985 goto fail;
988 /* Find the right block driver */
989 drvname = qdict_get_try_str(*options, "driver");
990 if (drvname) {
991 drv = bdrv_find_format(drvname);
992 if (!drv) {
993 error_setg(errp, "Unknown driver '%s'", drvname);
995 qdict_del(*options, "driver");
996 } else if (filename) {
997 drv = bdrv_find_protocol(filename, allow_protocol_prefix);
998 if (!drv) {
999 error_setg(errp, "Unknown protocol");
1001 } else {
1002 error_setg(errp, "Must specify either driver or file");
1003 drv = NULL;
1006 if (!drv) {
1007 /* errp has been set already */
1008 ret = -ENOENT;
1009 goto fail;
1012 /* Parse the filename and open it */
1013 if (drv->bdrv_parse_filename && filename) {
1014 drv->bdrv_parse_filename(filename, *options, &local_err);
1015 if (local_err) {
1016 error_propagate(errp, local_err);
1017 ret = -EINVAL;
1018 goto fail;
1021 if (!drv->bdrv_needs_filename) {
1022 qdict_del(*options, "filename");
1023 } else {
1024 filename = qdict_get_str(*options, "filename");
1028 if (!drv->bdrv_file_open) {
1029 ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
1030 *options = NULL;
1031 } else {
1032 ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
1034 if (ret < 0) {
1035 error_propagate(errp, local_err);
1036 goto fail;
1039 bs->growable = 1;
1040 return 0;
1042 fail:
1043 return ret;
1047 * Opens the backing file for a BlockDriverState if not yet open
1049 * options is a QDict of options to pass to the block drivers, or NULL for an
1050 * empty set of options. The reference to the QDict is transferred to this
1051 * function (even on failure), so if the caller intends to reuse the dictionary,
1052 * it needs to use QINCREF() before calling bdrv_open_backing_file().
1054 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1056 char backing_filename[PATH_MAX];
1057 int back_flags, ret;
1058 BlockDriver *back_drv = NULL;
1059 Error *local_err = NULL;
1061 if (bs->backing_hd != NULL) {
1062 QDECREF(options);
1063 return 0;
1066 /* NULL means an empty set of options */
1067 if (options == NULL) {
1068 options = qdict_new();
1071 bs->open_flags &= ~BDRV_O_NO_BACKING;
1072 if (qdict_haskey(options, "file.filename")) {
1073 backing_filename[0] = '\0';
1074 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1075 QDECREF(options);
1076 return 0;
1077 } else {
1078 bdrv_get_full_backing_filename(bs, backing_filename,
1079 sizeof(backing_filename));
1082 if (bs->backing_format[0] != '\0') {
1083 back_drv = bdrv_find_format(bs->backing_format);
1086 /* backing files always opened read-only */
1087 back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
1088 BDRV_O_COPY_ON_READ);
1090 assert(bs->backing_hd == NULL);
1091 ret = bdrv_open(&bs->backing_hd,
1092 *backing_filename ? backing_filename : NULL, NULL, options,
1093 back_flags, back_drv, &local_err);
1094 if (ret < 0) {
1095 bs->backing_hd = NULL;
1096 bs->open_flags |= BDRV_O_NO_BACKING;
1097 error_setg(errp, "Could not open backing file: %s",
1098 error_get_pretty(local_err));
1099 error_free(local_err);
1100 return ret;
1103 if (bs->backing_hd->file) {
1104 pstrcpy(bs->backing_file, sizeof(bs->backing_file),
1105 bs->backing_hd->file->filename);
1108 /* Recalculate the BlockLimits with the backing file */
1109 bdrv_refresh_limits(bs);
1111 return 0;
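/*
 * Editor's sketch of the QDict ownership rule documented above (not part
 * of the original file):
 *
 *     QINCREF(options);  // keep our own reference; the callee consumes one
 *     ret = bdrv_open_backing_file(bs, options, &err);
 *     // options is still valid here thanks to the extra reference
 */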
1115 * Opens a disk image whose options are given as BlockdevRef in another block
1116 * device's options.
1118 * If allow_none is true, no image will be opened if filename is NULL and no
1119 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1121 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
1122 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1123 * itself, all options starting with "${bdref_key}." are considered part of the
1124 * BlockdevRef.
1126 * The BlockdevRef will be removed from the options QDict.
1128 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
1130 int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1131 QDict *options, const char *bdref_key, int flags,
1132 bool allow_none, Error **errp)
1134 QDict *image_options;
1135 int ret;
1136 char *bdref_key_dot;
1137 const char *reference;
1139 assert(pbs);
1140 assert(*pbs == NULL);
1142 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1143 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1144 g_free(bdref_key_dot);
1146 reference = qdict_get_try_str(options, bdref_key);
1147 if (!filename && !reference && !qdict_size(image_options)) {
1148 if (allow_none) {
1149 ret = 0;
1150 } else {
1151 error_setg(errp, "A block device must be specified for \"%s\"",
1152 bdref_key);
1153 ret = -EINVAL;
1155 goto done;
1158 ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
1160 done:
1161 qdict_del(options, bdref_key);
1162 return ret;
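/*
 * Editor's sketch of a flattened BlockdevRef as consumed by bdrv_open_image()
 * with bdref_key "file" (hypothetical values, not part of the original file):
 *
 *     // options contains:
 *     //   "file.driver"   = "file"
 *     //   "file.filename" = "/path/disk.img"
 *     BlockDriverState *file = NULL;  // *pbs must be NULL
 *     ret = bdrv_open_image(&file, NULL, options, "file",
 *                           flags, false, &err);
 *     // the "file.*" subkeys and the "file" key are removed from options
 */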
1166 * Opens a disk image (raw, qcow2, vmdk, ...)
1168 * options is a QDict of options to pass to the block drivers, or NULL for an
1169 * empty set of options. The reference to the QDict belongs to the block layer
1170 * after the call (even on failure), so if the caller intends to reuse the
1171 * dictionary, it needs to use QINCREF() before calling bdrv_open.
1173 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1174 * If it is not NULL, the referenced BDS will be reused.
1176 * The reference parameter may be used to specify an existing block device which
1177 * should be opened. If specified, neither options nor a filename may be given,
1178 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
1180 int bdrv_open(BlockDriverState **pbs, const char *filename,
1181 const char *reference, QDict *options, int flags,
1182 BlockDriver *drv, Error **errp)
1184 int ret;
1185 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1186 char tmp_filename[PATH_MAX + 1];
1187 BlockDriverState *file = NULL, *bs;
1188 const char *drvname;
1189 Error *local_err = NULL;
1191 assert(pbs);
1193 if (reference) {
1194 bool options_non_empty = options ? qdict_size(options) : false;
1195 QDECREF(options);
1197 if (*pbs) {
1198 error_setg(errp, "Cannot reuse an existing BDS when referencing "
1199 "another block device");
1200 return -EINVAL;
1203 if (filename || options_non_empty) {
1204 error_setg(errp, "Cannot reference an existing block device with "
1205 "additional options or a new filename");
1206 return -EINVAL;
1209 bs = bdrv_lookup_bs(reference, reference, errp);
1210 if (!bs) {
1211 return -ENODEV;
1213 bdrv_ref(bs);
1214 *pbs = bs;
1215 return 0;
1218 if (*pbs) {
1219 bs = *pbs;
1220 } else {
1221 bs = bdrv_new("");
1224 /* NULL means an empty set of options */
1225 if (options == NULL) {
1226 options = qdict_new();
1229 bs->options = options;
1230 options = qdict_clone_shallow(options);
1232 if (flags & BDRV_O_PROTOCOL) {
1233 assert(!drv);
1234 ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
1235 &local_err);
1236 if (!ret) {
1237 goto done;
1238 } else if (bs->drv) {
1239 goto close_and_fail;
1240 } else {
1241 goto fail;
1245 /* For snapshot=on, create a temporary qcow2 overlay */
1246 if (flags & BDRV_O_SNAPSHOT) {
1247 BlockDriverState *bs1;
1248 int64_t total_size;
1249 BlockDriver *bdrv_qcow2;
1250 QEMUOptionParameter *create_options;
1251 QDict *snapshot_options;
1253 /* if snapshot, we create a temporary backing file and open it
1254 instead of opening 'filename' directly */
1256 /* Get the required size from the image */
1257 QINCREF(options);
1258 bs1 = NULL;
1259 ret = bdrv_open(&bs1, filename, NULL, options, BDRV_O_NO_BACKING,
1260 drv, &local_err);
1261 if (ret < 0) {
1262 goto fail;
1264 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
1266 bdrv_unref(bs1);
1268 /* Create the temporary image */
1269 ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
1270 if (ret < 0) {
1271 error_setg_errno(errp, -ret, "Could not get temporary filename");
1272 goto fail;
1275 bdrv_qcow2 = bdrv_find_format("qcow2");
1276 create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1277 NULL);
1279 set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1281 ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
1282 free_option_parameters(create_options);
1283 if (ret < 0) {
1284 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1285 "'%s': %s", tmp_filename,
1286 error_get_pretty(local_err));
1287 error_free(local_err);
1288 local_err = NULL;
1289 goto fail;
1292 /* Prepare a new options QDict for the temporary file, where user
1293 * options refer to the backing file */
1294 if (filename) {
1295 qdict_put(options, "file.filename", qstring_from_str(filename));
1297 if (drv) {
1298 qdict_put(options, "driver", qstring_from_str(drv->format_name));
1301 snapshot_options = qdict_new();
1302 qdict_put(snapshot_options, "backing", options);
1303 qdict_flatten(snapshot_options);
1305 bs->options = snapshot_options;
1306 options = qdict_clone_shallow(bs->options);
1308 filename = tmp_filename;
1309 drv = bdrv_qcow2;
1310 bs->is_temporary = 1;
1313 /* Open image file without format layer */
1314 if (flags & BDRV_O_RDWR) {
1315 flags |= BDRV_O_ALLOW_RDWR;
1318 assert(file == NULL);
1319 ret = bdrv_open_image(&file, filename, options, "file",
1320 bdrv_open_flags(bs, flags | BDRV_O_UNMAP) |
1321 BDRV_O_PROTOCOL, true, &local_err);
1322 if (ret < 0) {
1323 goto fail;
1326 /* Find the right image format driver */
1327 drvname = qdict_get_try_str(options, "driver");
1328 if (drvname) {
1329 drv = bdrv_find_format(drvname);
1330 qdict_del(options, "driver");
1331 if (!drv) {
1332 error_setg(errp, "Invalid driver: '%s'", drvname);
1333 ret = -EINVAL;
1334 goto unlink_and_fail;
1338 if (!drv) {
1339 if (file) {
1340 ret = find_image_format(file, filename, &drv, &local_err);
1341 } else {
1342 error_setg(errp, "Must specify either driver or file");
1343 ret = -EINVAL;
1344 goto unlink_and_fail;
1348 if (!drv) {
1349 goto unlink_and_fail;
1352 /* Open the image */
1353 ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1354 if (ret < 0) {
1355 goto unlink_and_fail;
1358 if (file && (bs->file != file)) {
1359 bdrv_unref(file);
1360 file = NULL;
1363 /* If there is a backing file, use it */
1364 if ((flags & BDRV_O_NO_BACKING) == 0) {
1365 QDict *backing_options;
1367 qdict_extract_subqdict(options, &backing_options, "backing.");
1368 ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1369 if (ret < 0) {
1370 goto close_and_fail;
1374 done:
1375 /* Check if any unknown options were used */
1376 if (options && (qdict_size(options) != 0)) {
1377 const QDictEntry *entry = qdict_first(options);
1378 if (flags & BDRV_O_PROTOCOL) {
1379 error_setg(errp, "Block protocol '%s' doesn't support the option "
1380 "'%s'", drv->format_name, entry->key);
1381 } else {
1382 error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1383 "support the option '%s'", drv->format_name,
1384 bs->device_name, entry->key);
1387 ret = -EINVAL;
1388 goto close_and_fail;
1390 QDECREF(options);
1392 if (!bdrv_key_required(bs)) {
1393 bdrv_dev_change_media_cb(bs, true);
1396 *pbs = bs;
1397 return 0;
1399 unlink_and_fail:
1400 if (file != NULL) {
1401 bdrv_unref(file);
1403 if (bs->is_temporary) {
1404 unlink(filename);
1406 fail:
1407 QDECREF(bs->options);
1408 QDECREF(options);
1409 bs->options = NULL;
1410 if (!*pbs) {
1411 /* If *pbs is NULL, a new BDS has been created in this function and
1412 needs to be freed now. Otherwise, it does not need to be closed,
1413 since it has not really been opened yet. */
1414 bdrv_unref(bs);
1416 if (local_err) {
1417 error_propagate(errp, local_err);
1419 return ret;
1421 close_and_fail:
1422 /* See fail path, but now the BDS has to be always closed */
1423 if (*pbs) {
1424 bdrv_close(bs);
1425 } else {
1426 bdrv_unref(bs);
1428 QDECREF(options);
1429 if (local_err) {
1430 error_propagate(errp, local_err);
1432 return ret;
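/*
 * Illustrative call matching the contract documented above (hypothetical
 * filename, not part of the original file):
 *
 *     BlockDriverState *bs = NULL;  // NULL => a new BDS is created
 *     Error *err = NULL;
 *     int ret = bdrv_open(&bs, "disk.qcow2", NULL, NULL,
 *                         BDRV_O_RDWR, NULL, &err);
 *     if (ret < 0) {
 *         // err describes the failure; bs is still NULL
 *     }
 */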
1435 typedef struct BlockReopenQueueEntry {
1436 bool prepared;
1437 BDRVReopenState state;
1438 QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1439 } BlockReopenQueueEntry;
1442 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1443 * reopen of multiple devices.
1445 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLEQ_INIT
1446 * already performed, or may be NULL, in which case a new BlockReopenQueue will
1447 * be created and initialized. This newly created BlockReopenQueue should be
1448 * passed back in for subsequent calls that are intended to be of the same
1449 * atomic 'set'.
1451 * bs is the BlockDriverState to add to the reopen queue.
1453 * flags contains the open flags for the associated bs
1455 * returns a pointer to bs_queue, which is either the newly allocated
1456 * bs_queue, or the existing bs_queue being used.
1459 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1460 BlockDriverState *bs, int flags)
1462 assert(bs != NULL);
1464 BlockReopenQueueEntry *bs_entry;
1465 if (bs_queue == NULL) {
1466 bs_queue = g_new0(BlockReopenQueue, 1);
1467 QSIMPLEQ_INIT(bs_queue);
1470 if (bs->file) {
1471 bdrv_reopen_queue(bs_queue, bs->file, flags);
1474 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1475 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1477 bs_entry->state.bs = bs;
1478 bs_entry->state.flags = flags;
1480 return bs_queue;
1484 * Reopen multiple BlockDriverStates atomically & transactionally.
1486 * The queue passed in (bs_queue) must have been built up previously
1487 * via bdrv_reopen_queue().
1489 * Reopens all BDS specified in the queue, with the appropriate
1490 * flags. All devices are prepared for reopen, and failure of any
1491 * device will cause all device changes to be abandoned, and intermediate
1492 * data cleaned up.
1494 * If all devices prepare successfully, then the changes are committed
1495 * to all devices.
1498 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1500 int ret = -1;
1501 BlockReopenQueueEntry *bs_entry, *next;
1502 Error *local_err = NULL;
1504 assert(bs_queue != NULL);
1506 bdrv_drain_all();
1508 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1509 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1510 error_propagate(errp, local_err);
1511 goto cleanup;
1513 bs_entry->prepared = true;
1516 /* If we reach this point, we have success and just need to apply the
1517 * changes
1519 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1520 bdrv_reopen_commit(&bs_entry->state);
1523 ret = 0;
1525 cleanup:
1526 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1527 if (ret && bs_entry->prepared) {
1528 bdrv_reopen_abort(&bs_entry->state);
1530 g_free(bs_entry);
1532 g_free(bs_queue);
1533 return ret;
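/*
 * Editor's sketch of an atomic multi-device reopen (bs_a/bs_b and the flags
 * are hypothetical; bdrv_reopen_multiple() frees the queue itself):
 *
 *     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs_a, flags_a);
 *     queue = bdrv_reopen_queue(queue, bs_b, flags_b);
 *     ret = bdrv_reopen_multiple(queue, &err);
 *     // either every BDS was reopened with its new flags, or none was
 */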
1537 /* Reopen a single BlockDriverState with the specified flags. */
1538 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1540 int ret = -1;
1541 Error *local_err = NULL;
1542 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1544 ret = bdrv_reopen_multiple(queue, &local_err);
1545 if (local_err != NULL) {
1546 error_propagate(errp, local_err);
1548 return ret;
1553 * Prepares a BlockDriverState for reopen. All changes are staged in the
1554 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1555 * the block driver's .bdrv_reopen_prepare() implementation.
1557 * bs is the BlockDriverState to reopen
1558 * flags are the new open flags
1559 * queue is the reopen queue
1561 * Returns 0 on success, non-zero on error. On error errp will be set
1562 * as well.
1564 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1565 * It is the responsibility of the caller to then call abort() or
1566 * commit() for any other BDS that has been left in a prepare() state.
1569 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1570 Error **errp)
1572 int ret = -1;
1573 Error *local_err = NULL;
1574 BlockDriver *drv;
1576 assert(reopen_state != NULL);
1577 assert(reopen_state->bs->drv != NULL);
1578 drv = reopen_state->bs->drv;
1580 /* if we are to stay read-only, do not allow permission change
1581 * to r/w */
1582 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1583 reopen_state->flags & BDRV_O_RDWR) {
1584 error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1585 reopen_state->bs->device_name);
1586 goto error;
1590 ret = bdrv_flush(reopen_state->bs);
1591 if (ret) {
1592 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1593 strerror(-ret));
1594 goto error;
1597 if (drv->bdrv_reopen_prepare) {
1598 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1599 if (ret) {
1600 if (local_err != NULL) {
1601 error_propagate(errp, local_err);
1602 } else {
1603 error_setg(errp, "failed while preparing to reopen image '%s'",
1604 reopen_state->bs->filename);
1606 goto error;
1608 } else {
1609 /* It is currently mandatory to have a bdrv_reopen_prepare()
1610 * handler for each supported drv. */
1611 error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1612 drv->format_name, reopen_state->bs->device_name,
1613 "reopening of file");
1614 ret = -1;
1615 goto error;
1618 ret = 0;
1620 error:
1621 return ret;
1625 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1626 * makes them final by swapping the staging BlockDriverState contents into
1627 * the active BlockDriverState contents.
1629 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1631 BlockDriver *drv;
1633 assert(reopen_state != NULL);
1634 drv = reopen_state->bs->drv;
1635 assert(drv != NULL);
1637 /* If there are any driver level actions to take */
1638 if (drv->bdrv_reopen_commit) {
1639 drv->bdrv_reopen_commit(reopen_state);
1642 /* set BDS specific flags now */
1643 reopen_state->bs->open_flags = reopen_state->flags;
1644 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1645 BDRV_O_CACHE_WB);
1646 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1648 bdrv_refresh_limits(reopen_state->bs);
1652 * Abort the reopen, and delete and free the staged changes in
1653 * reopen_state
1655 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1657 BlockDriver *drv;
1659 assert(reopen_state != NULL);
1660 drv = reopen_state->bs->drv;
1661 assert(drv != NULL);
1663 if (drv->bdrv_reopen_abort) {
1664 drv->bdrv_reopen_abort(reopen_state);
1669 void bdrv_close(BlockDriverState *bs)
1671 if (bs->job) {
1672 block_job_cancel_sync(bs->job);
1674 bdrv_drain_all(); /* complete I/O */
1675 bdrv_flush(bs);
1676 bdrv_drain_all(); /* in case flush left pending I/O */
1677 notifier_list_notify(&bs->close_notifiers, bs);
1679 if (bs->drv) {
1680 if (bs->backing_hd) {
1681 bdrv_unref(bs->backing_hd);
1682 bs->backing_hd = NULL;
1684 bs->drv->bdrv_close(bs);
1685 g_free(bs->opaque);
1686 #ifdef _WIN32
1687 if (bs->is_temporary) {
1688 unlink(bs->filename);
1690 #endif
1691 bs->opaque = NULL;
1692 bs->drv = NULL;
1693 bs->copy_on_read = 0;
1694 bs->backing_file[0] = '\0';
1695 bs->backing_format[0] = '\0';
1696 bs->total_sectors = 0;
1697 bs->encrypted = 0;
1698 bs->valid_key = 0;
1699 bs->sg = 0;
1700 bs->growable = 0;
1701 bs->zero_beyond_eof = false;
1702 QDECREF(bs->options);
1703 bs->options = NULL;
1705 if (bs->file != NULL) {
1706 bdrv_unref(bs->file);
1707 bs->file = NULL;
1711 bdrv_dev_change_media_cb(bs, false);
1713 /* throttling disk I/O limits */
1714 if (bs->io_limits_enabled) {
1715 bdrv_io_limits_disable(bs);
1719 void bdrv_close_all(void)
1721 BlockDriverState *bs;
1723 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1724 bdrv_close(bs);
1728 /* Check if any requests are in-flight (including throttled requests) */
1729 static bool bdrv_requests_pending(BlockDriverState *bs)
1731 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1732 return true;
1734 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1735 return true;
1737 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1738 return true;
1740 if (bs->file && bdrv_requests_pending(bs->file)) {
1741 return true;
1743 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1744 return true;
1746 return false;
1749 static bool bdrv_requests_pending_all(void)
1751 BlockDriverState *bs;
1752 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1753 if (bdrv_requests_pending(bs)) {
1754 return true;
1757 return false;
1761 * Wait for pending requests to complete across all BlockDriverStates
1763 * This function does not flush data to disk; use bdrv_flush_all() for that
1764 * after calling this function.
1766 * Note that completion of an asynchronous I/O operation can trigger any
1767 * number of other I/O operations on other devices---for example a coroutine
1768 * can be arbitrarily complex and a constant flow of I/O can come until the
1769 * coroutine is complete. Because of this, it is not possible to have a
1770 * function to drain a single device's I/O queue.
1772 void bdrv_drain_all(void)
1774 /* Always run first iteration so any pending completion BHs run */
1775 bool busy = true;
1776 BlockDriverState *bs;
1778 while (busy) {
1779 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1780 bdrv_start_throttled_reqs(bs);
1783 busy = bdrv_requests_pending_all();
1784 busy |= aio_poll(qemu_get_aio_context(), busy);
1788 /* make a BlockDriverState anonymous by removing it from the bdrv_states and
1789 * graph_bdrv_states lists.
1790 Also, empty the device_name to prevent a double remove */
1791 void bdrv_make_anon(BlockDriverState *bs)
1793 if (bs->device_name[0] != '\0') {
1794 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1796 bs->device_name[0] = '\0';
1797 if (bs->node_name[0] != '\0') {
1798 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1800 bs->node_name[0] = '\0';
1803 static void bdrv_rebind(BlockDriverState *bs)
1805 if (bs->drv && bs->drv->bdrv_rebind) {
1806 bs->drv->bdrv_rebind(bs);
1810 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1811 BlockDriverState *bs_src)
1813 /* move some fields that need to stay attached to the device */
1814 bs_dest->open_flags = bs_src->open_flags;
1816 /* dev info */
1817 bs_dest->dev_ops = bs_src->dev_ops;
1818 bs_dest->dev_opaque = bs_src->dev_opaque;
1819 bs_dest->dev = bs_src->dev;
1820 bs_dest->guest_block_size = bs_src->guest_block_size;
1821 bs_dest->copy_on_read = bs_src->copy_on_read;
1823 bs_dest->enable_write_cache = bs_src->enable_write_cache;
1825 /* i/o throttled req */
1826 memcpy(&bs_dest->throttle_state,
1827 &bs_src->throttle_state,
1828 sizeof(ThrottleState));
1829 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
1830 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
1831 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
1833 /* r/w error */
1834 bs_dest->on_read_error = bs_src->on_read_error;
1835 bs_dest->on_write_error = bs_src->on_write_error;
1837 /* i/o status */
1838 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
1839 bs_dest->iostatus = bs_src->iostatus;
1841 /* dirty bitmap */
1842 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
1844 /* reference count */
1845 bs_dest->refcnt = bs_src->refcnt;
1847 /* job */
1848 bs_dest->in_use = bs_src->in_use;
1849 bs_dest->job = bs_src->job;
1851 /* keep the same entry in bdrv_states */
1852 pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1853 bs_src->device_name);
1854 bs_dest->device_list = bs_src->device_list;
1858 * Swap bs contents for two image chains while they are live,
1859 * while keeping required fields on the BlockDriverState that is
1860 * actually attached to a device.
1862 * This will modify the BlockDriverState fields, and swap contents
1863 * between bs_new and bs_old. Both bs_new and bs_old are modified.
1865 * bs_new is required to be anonymous.
1867 * This function does not create any image files.
1869 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1871 BlockDriverState tmp;
1873 /* The code needs to swap the node_name, but simply swapping node_list won't
1874 * work, so first remove the nodes from the graph list, do the swap, then
1875 * insert them back if needed.
1877 if (bs_new->node_name[0] != '\0') {
1878 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
1880 if (bs_old->node_name[0] != '\0') {
1881 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
1884 /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1885 assert(bs_new->device_name[0] == '\0');
1886 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1887 assert(bs_new->job == NULL);
1888 assert(bs_new->dev == NULL);
1889 assert(bs_new->in_use == 0);
1890 assert(bs_new->io_limits_enabled == false);
1891 assert(!throttle_have_timer(&bs_new->throttle_state));
1893 tmp = *bs_new;
1894 *bs_new = *bs_old;
1895 *bs_old = tmp;
1897 /* there are some fields that should not be swapped, move them back */
1898 bdrv_move_feature_fields(&tmp, bs_old);
1899 bdrv_move_feature_fields(bs_old, bs_new);
1900 bdrv_move_feature_fields(bs_new, &tmp);
1902 /* bs_new shouldn't be in bdrv_states even after the swap! */
1903 assert(bs_new->device_name[0] == '\0');
1905 /* Check a few fields that should remain attached to the device */
1906 assert(bs_new->dev == NULL);
1907 assert(bs_new->job == NULL);
1908 assert(bs_new->in_use == 0);
1909 assert(bs_new->io_limits_enabled == false);
1910 assert(!throttle_have_timer(&bs_new->throttle_state));
1912 /* insert the nodes back into the graph node list if needed */
1913 if (bs_new->node_name[0] != '\0') {
1914 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
1916 if (bs_old->node_name[0] != '\0') {
1917 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
1920 bdrv_rebind(bs_new);
1921 bdrv_rebind(bs_old);
1925 * Add new bs contents at the top of an image chain while the chain is
1926 * live, while keeping required fields on the top layer.
1928 * This will modify the BlockDriverState fields, and swap contents
1929 * between bs_new and bs_top. Both bs_new and bs_top are modified.
1931 * bs_new is required to be anonymous.
1933 * This function does not create any image files.
1935 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1937 bdrv_swap(bs_new, bs_top);
1939 /* After the swap, bs_new holds the old bs_top contents, so it
1940 * becomes the backing file of the new top. */
1941 bs_top->backing_hd = bs_new;
1942 bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1943 pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1944 bs_new->filename);
1945 pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1946 bs_new->drv ? bs_new->drv->format_name : "");
1949 static void bdrv_delete(BlockDriverState *bs)
1951 assert(!bs->dev);
1952 assert(!bs->job);
1953 assert(!bs->in_use);
1954 assert(!bs->refcnt);
1955 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1957 bdrv_close(bs);
1959 /* remove from list, if necessary */
1960 bdrv_make_anon(bs);
1962 g_free(bs);
1965 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1966 /* TODO change to DeviceState *dev when all users are qdevified */
1968 if (bs->dev) {
1969 return -EBUSY;
1971 bs->dev = dev;
1972 bdrv_iostatus_reset(bs);
1973 return 0;
1976 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1977 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1979 if (bdrv_attach_dev(bs, dev) < 0) {
1980 abort();
1984 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1985 /* TODO change to DeviceState *dev when all users are qdevified */
1987 assert(bs->dev == dev);
1988 bs->dev = NULL;
1989 bs->dev_ops = NULL;
1990 bs->dev_opaque = NULL;
1991 bs->guest_block_size = 512;
1994 /* TODO change to return DeviceState * when all users are qdevified */
1995 void *bdrv_get_attached_dev(BlockDriverState *bs)
1997 return bs->dev;
2000 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2001 void *opaque)
2003 bs->dev_ops = ops;
2004 bs->dev_opaque = opaque;
2007 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2008 enum MonitorEvent ev,
2009 BlockErrorAction action, bool is_read)
2011 QObject *data;
2012 const char *action_str;
2014 switch (action) {
2015 case BDRV_ACTION_REPORT:
2016 action_str = "report";
2017 break;
2018 case BDRV_ACTION_IGNORE:
2019 action_str = "ignore";
2020 break;
2021 case BDRV_ACTION_STOP:
2022 action_str = "stop";
2023 break;
2024 default:
2025 abort();
2028 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2029 bdrv->device_name,
2030 action_str,
2031 is_read ? "read" : "write");
2032 monitor_protocol_event(ev, data);
2034 qobject_decref(data);
2037 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2039 QObject *data;
2041 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2042 bdrv_get_device_name(bs), ejected);
2043 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2045 qobject_decref(data);
2048 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2050 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2051 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2052 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2053 if (tray_was_closed) {
2054 /* tray open */
2055 bdrv_emit_qmp_eject_event(bs, true);
2057 if (load) {
2058 /* tray close */
2059 bdrv_emit_qmp_eject_event(bs, false);
2064 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2066 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2069 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2071 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2072 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2076 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2078 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2079 return bs->dev_ops->is_tray_open(bs->dev_opaque);
2081 return false;
2084 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2086 if (bs->dev_ops && bs->dev_ops->resize_cb) {
2087 bs->dev_ops->resize_cb(bs->dev_opaque);
2091 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2093 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2094 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2096 return false;
2100 * Run consistency checks on an image
2102 * Returns 0 if the check could be completed (it doesn't mean that the image is
2103 * free of errors) or -errno when an internal error occurred. The results of the
2104 * check are stored in res.
2106 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2108 if (bs->drv->bdrv_check == NULL) {
2109 return -ENOTSUP;
2112 memset(res, 0, sizeof(*res));
2113 return bs->drv->bdrv_check(bs, res, fix);
2116 #define COMMIT_BUF_SECTORS 2048
2118 /* commit COW file into the raw image */
2119 int bdrv_commit(BlockDriverState *bs)
2121 BlockDriver *drv = bs->drv;
2122 int64_t sector, total_sectors, length, backing_length;
2123 int n, ro, open_flags;
2124 int ret = 0;
2125 uint8_t *buf = NULL;
2126 char filename[PATH_MAX];
2128 if (!drv)
2129 return -ENOMEDIUM;
2131 if (!bs->backing_hd) {
2132 return -ENOTSUP;
2135 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2136 return -EBUSY;
2139 ro = bs->backing_hd->read_only;
2140 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2141 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2142 open_flags = bs->backing_hd->open_flags;
2144 if (ro) {
2145 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2146 return -EACCES;
2150 length = bdrv_getlength(bs);
2151 if (length < 0) {
2152 ret = length;
2153 goto ro_cleanup;
2156 backing_length = bdrv_getlength(bs->backing_hd);
2157 if (backing_length < 0) {
2158 ret = backing_length;
2159 goto ro_cleanup;
2162 /* If our top snapshot is larger than the backing file image,
2163 * grow the backing file image if possible. If not possible,
2164 * we must return an error */
2165 if (length > backing_length) {
2166 ret = bdrv_truncate(bs->backing_hd, length);
2167 if (ret < 0) {
2168 goto ro_cleanup;
2172 total_sectors = length >> BDRV_SECTOR_BITS;
2173 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2175 for (sector = 0; sector < total_sectors; sector += n) {
2176 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2177 if (ret < 0) {
2178 goto ro_cleanup;
2180 if (ret) {
2181 ret = bdrv_read(bs, sector, buf, n);
2182 if (ret < 0) {
2183 goto ro_cleanup;
2186 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2187 if (ret < 0) {
2188 goto ro_cleanup;
2193 if (drv->bdrv_make_empty) {
2194 ret = drv->bdrv_make_empty(bs);
2195 if (ret < 0) {
2196 goto ro_cleanup;
2198 bdrv_flush(bs);
2202 * Make sure all data we wrote to the backing device is actually
2203 * stable on disk.
2205 if (bs->backing_hd) {
2206 bdrv_flush(bs->backing_hd);
2209 ret = 0;
2210 ro_cleanup:
2211 g_free(buf);
2213 if (ro) {
2214 /* ignoring error return here */
2215 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2218 return ret;
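/*
 * Illustrative sketch: commit one device's COW data into its backing
 * file and map the error codes bdrv_commit() can return.  The lookup
 * uses bdrv_find() from later in this file; the device name would come
 * from the caller.
 */
static int example_commit_by_name(const char *device_name)
{
    BlockDriverState *bs = bdrv_find(device_name);

    if (!bs) {
        return -ENODEV;
    }
    switch (bdrv_commit(bs)) {
    case 0:
        return 0;       /* data committed and the image emptied */
    case -ENOTSUP:
        return 0;       /* no backing file, nothing to commit */
    case -EBUSY:
        return -EBUSY;  /* device or backing file is in use by a job */
    default:
        return -EIO;    /* -ENOMEDIUM, -EACCES, I/O errors, ... */
    }
}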
2221 int bdrv_commit_all(void)
2223 BlockDriverState *bs;
2225 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2226 if (bs->drv && bs->backing_hd) {
2227 int ret = bdrv_commit(bs);
2228 if (ret < 0) {
2229 return ret;
2233 return 0;
2237 * Remove an active request from the tracked requests list
2239 * This function should be called when a tracked request is completing.
2241 static void tracked_request_end(BdrvTrackedRequest *req)
2243 if (req->serialising) {
2244 req->bs->serialising_in_flight--;
2247 QLIST_REMOVE(req, list);
2248 qemu_co_queue_restart_all(&req->wait_queue);
2252 * Add an active request to the tracked requests list
2254 static void tracked_request_begin(BdrvTrackedRequest *req,
2255 BlockDriverState *bs,
2256 int64_t offset,
2257 unsigned int bytes, bool is_write)
2259 *req = (BdrvTrackedRequest){
2260 .bs = bs,
2261 .offset = offset,
2262 .bytes = bytes,
2263 .is_write = is_write,
2264 .co = qemu_coroutine_self(),
2265 .serialising = false,
2266 .overlap_offset = offset,
2267 .overlap_bytes = bytes,
2270 qemu_co_queue_init(&req->wait_queue);
2272 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2275 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2277 int64_t overlap_offset = req->offset & ~(align - 1);
2278 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2279 - overlap_offset;
2281 if (!req->serialising) {
2282 req->bs->serialising_in_flight++;
2283 req->serialising = true;
2286 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2287 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2291 * Round a region to cluster boundaries
2293 void bdrv_round_to_clusters(BlockDriverState *bs,
2294 int64_t sector_num, int nb_sectors,
2295 int64_t *cluster_sector_num,
2296 int *cluster_nb_sectors)
2298 BlockDriverInfo bdi;
2300 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2301 *cluster_sector_num = sector_num;
2302 *cluster_nb_sectors = nb_sectors;
2303 } else {
2304 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2305 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2306 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2307 nb_sectors, c);
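/*
 * Worked example (illustrative): with a 64 KiB cluster size, i.e. 128
 * sectors per cluster, a request for sectors [130, 140) is widened to
 * the cluster-aligned range [128, 256): *cluster_sector_num == 128 and
 * *cluster_nb_sectors == 128.
 */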
2311 static int bdrv_get_cluster_size(BlockDriverState *bs)
2313 BlockDriverInfo bdi;
2314 int ret;
2316 ret = bdrv_get_info(bs, &bdi);
2317 if (ret < 0 || bdi.cluster_size == 0) {
2318 return bs->request_alignment;
2319 } else {
2320 return bdi.cluster_size;
2324 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2325 int64_t offset, unsigned int bytes)
2327 /* aaaa bbbb */
2328 if (offset >= req->overlap_offset + req->overlap_bytes) {
2329 return false;
2331 /* bbbb aaaa */
2332 if (req->overlap_offset >= offset + bytes) {
2333 return false;
2335 return true;
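/*
 * Example (illustrative): the intervals are half-open, so a request
 * covering bytes [4096, 8192) does not overlap one covering
 * [8192, 12288), but it does overlap one covering [8000, 8192).
 */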
2338 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2340 BlockDriverState *bs = self->bs;
2341 BdrvTrackedRequest *req;
2342 bool retry;
2343 bool waited = false;
2345 if (!bs->serialising_in_flight) {
2346 return false;
2349 do {
2350 retry = false;
2351 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2352 if (req == self || (!req->serialising && !self->serialising)) {
2353 continue;
2355 if (tracked_request_overlaps(req, self->overlap_offset,
2356 self->overlap_bytes))
2358 /* Hitting this means there was a reentrant request, for
2359 * example, a block driver issuing nested requests. This must
2360 * never happen since it means deadlock.
2362 assert(qemu_coroutine_self() != req->co);
2364 /* If the request is already (indirectly) waiting for us, or
2365 * will wait for us as soon as it wakes up, then just go on
2366 * (instead of producing a deadlock in the former case). */
2367 if (!req->waiting_for) {
2368 self->waiting_for = req;
2369 qemu_co_queue_wait(&req->wait_queue);
2370 self->waiting_for = NULL;
2371 retry = true;
2372 waited = true;
2373 break;
2377 } while (retry);
2379 return waited;
2383 * Return values:
2384 * 0 - success
2385 * -EINVAL - backing format specified, but no file
2386 * -ENOSPC - can't update the backing file because no space is left in the
2387 * image file header
2388 * -ENOTSUP - format driver doesn't support changing the backing file
2390 int bdrv_change_backing_file(BlockDriverState *bs,
2391 const char *backing_file, const char *backing_fmt)
2393 BlockDriver *drv = bs->drv;
2394 int ret;
2396 /* Backing file format doesn't make sense without a backing file */
2397 if (backing_fmt && !backing_file) {
2398 return -EINVAL;
2401 if (drv->bdrv_change_backing_file != NULL) {
2402 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2403 } else {
2404 ret = -ENOTSUP;
2407 if (ret == 0) {
2408 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2409 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2411 return ret;
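/*
 * Illustrative sketch: rebase an image onto a new backing file.  The
 * file name "new-base.qcow2" and the qcow2 format are hypothetical;
 * the error codes are the ones documented above.
 */
static int example_rebase(BlockDriverState *bs)
{
    int ret = bdrv_change_backing_file(bs, "new-base.qcow2", "qcow2");

    if (ret == -ENOTSUP) {
        /* the format driver cannot rewrite its header in place */
    } else if (ret == -ENOSPC) {
        /* the longer file name does not fit in the image header */
    }
    return ret;
}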
2415 * Finds the image layer in the chain that has 'bs' as its backing file.
2417 * active is the current topmost image.
2419 * Returns NULL if bs is not found in active's image chain,
2420 * or if active == bs.
2422 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2423 BlockDriverState *bs)
2425 BlockDriverState *overlay = NULL;
2426 BlockDriverState *intermediate;
2428 assert(active != NULL);
2429 assert(bs != NULL);
2431 /* if bs is the same as active, then by definition it has no overlay
2433 if (active == bs) {
2434 return NULL;
2437 intermediate = active;
2438 while (intermediate->backing_hd) {
2439 if (intermediate->backing_hd == bs) {
2440 overlay = intermediate;
2441 break;
2443 intermediate = intermediate->backing_hd;
2446 return overlay;
2449 typedef struct BlkIntermediateStates {
2450 BlockDriverState *bs;
2451 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2452 } BlkIntermediateStates;
2456 * Drops images above 'base' up to and including 'top', and sets the image
2457 * above 'top' to have base as its backing file.
2459 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2460 * information in 'bs' can be properly updated.
2462 * E.g., this will convert the following chain:
2463 * bottom <- base <- intermediate <- top <- active
2465 * to
2467 * bottom <- base <- active
2469 * It is allowed for bottom==base, in which case it converts:
2471 * base <- intermediate <- top <- active
2473 * to
2475 * base <- active
2477 * Error conditions:
2478 * if active == top, that is considered an error
2481 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2482 BlockDriverState *base)
2484 BlockDriverState *intermediate;
2485 BlockDriverState *base_bs = NULL;
2486 BlockDriverState *new_top_bs = NULL;
2487 BlkIntermediateStates *intermediate_state, *next;
2488 int ret = -EIO;
2490 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2491 QSIMPLEQ_INIT(&states_to_delete);
2493 if (!top->drv || !base->drv) {
2494 goto exit;
2497 new_top_bs = bdrv_find_overlay(active, top);
2499 if (new_top_bs == NULL) {
2500 /* we could not find the image above 'top', this is an error */
2501 goto exit;
2504 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2505 * to do, no intermediate images */
2506 if (new_top_bs->backing_hd == base) {
2507 ret = 0;
2508 goto exit;
2511 intermediate = top;
2513 /* now we will go down through the list, and add each BDS we find
2514 * into our deletion queue, until we hit the 'base'
2516 while (intermediate) {
2517 intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2518 intermediate_state->bs = intermediate;
2519 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2521 if (intermediate->backing_hd == base) {
2522 base_bs = intermediate->backing_hd;
2523 break;
2525 intermediate = intermediate->backing_hd;
2527 if (base_bs == NULL) {
2528         /* Something went wrong: we did not end at the base. Safely
2529          * unravel everything and exit with an error. */
2530 goto exit;
2533 /* success - we can delete the intermediate states, and link top->base */
2534 ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2535 base_bs->drv ? base_bs->drv->format_name : "");
2536 if (ret) {
2537 goto exit;
2539 new_top_bs->backing_hd = base_bs;
2541 bdrv_refresh_limits(new_top_bs);
2543 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2544 /* so that bdrv_close() does not recursively close the chain */
2545 intermediate_state->bs->backing_hd = NULL;
2546 bdrv_unref(intermediate_state->bs);
2548 ret = 0;
2550 exit:
2551 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2552 g_free(intermediate_state);
2554 return ret;
2558 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2559 size_t size)
2561 int64_t len;
2563 if (!bdrv_is_inserted(bs))
2564 return -ENOMEDIUM;
2566 if (bs->growable)
2567 return 0;
2569 len = bdrv_getlength(bs);
2571 if (offset < 0)
2572 return -EIO;
2574 if ((offset > len) || (len - offset < size))
2575 return -EIO;
2577 return 0;
2580 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2581 int nb_sectors)
2583 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2584 nb_sectors * BDRV_SECTOR_SIZE);
2587 typedef struct RwCo {
2588 BlockDriverState *bs;
2589 int64_t offset;
2590 QEMUIOVector *qiov;
2591 bool is_write;
2592 int ret;
2593 BdrvRequestFlags flags;
2594 } RwCo;
2596 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2598 RwCo *rwco = opaque;
2600 if (!rwco->is_write) {
2601 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2602 rwco->qiov->size, rwco->qiov,
2603 rwco->flags);
2604 } else {
2605 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2606 rwco->qiov->size, rwco->qiov,
2607 rwco->flags);
2612 * Process a vectored synchronous request using coroutines
2614 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2615 QEMUIOVector *qiov, bool is_write,
2616 BdrvRequestFlags flags)
2618 Coroutine *co;
2619 RwCo rwco = {
2620 .bs = bs,
2621 .offset = offset,
2622 .qiov = qiov,
2623 .is_write = is_write,
2624 .ret = NOT_DONE,
2625 .flags = flags,
2629      * In a synchronous call context, when the vcpu is blocked, the
2630      * throttling timer will not fire, so I/O throttling has to be disabled
2631      * here if it has been enabled.
2633 if (bs->io_limits_enabled) {
2634 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2635 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2636 bdrv_io_limits_disable(bs);
2639 if (qemu_in_coroutine()) {
2640 /* Fast-path if already in coroutine context */
2641 bdrv_rw_co_entry(&rwco);
2642 } else {
2643 co = qemu_coroutine_create(bdrv_rw_co_entry);
2644 qemu_coroutine_enter(co, &rwco);
2645 while (rwco.ret == NOT_DONE) {
2646 qemu_aio_wait();
2649 return rwco.ret;
2653 * Process a synchronous request using coroutines
2655 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2656 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2658 QEMUIOVector qiov;
2659 struct iovec iov = {
2660 .iov_base = (void *)buf,
2661 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2664 qemu_iovec_init_external(&qiov, &iov, 1);
2665 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2666 &qiov, is_write, flags);
2669 /* return < 0 if error. See bdrv_write() for the return codes */
2670 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2671 uint8_t *buf, int nb_sectors)
2673 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2676 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2677 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2678 uint8_t *buf, int nb_sectors)
2680 bool enabled;
2681 int ret;
2683 enabled = bs->io_limits_enabled;
2684 bs->io_limits_enabled = false;
2685 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2686 bs->io_limits_enabled = enabled;
2687 return ret;
2690 /* Return < 0 if error. Important errors are:
2691 -EIO generic I/O error (may happen for all errors)
2692 -ENOMEDIUM No media inserted.
2693 -EINVAL Invalid sector number or nb_sectors
2694 -EACCES Trying to write a read-only device
2696 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2697 const uint8_t *buf, int nb_sectors)
2699 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
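/*
 * Illustrative sketch: a synchronous read-modify-write of the first
 * sector, using the return conventions documented above.  Assumes a
 * valid, writable bs; the byte being patched is arbitrary.
 */
static int example_patch_first_sector(BlockDriverState *bs, uint8_t val)
{
    uint8_t buf[BDRV_SECTOR_SIZE];
    int ret = bdrv_read(bs, 0, buf, 1);

    if (ret < 0) {
        return ret;                   /* -ENOMEDIUM, -EIO, ... */
    }
    buf[0] = val;
    return bdrv_write(bs, 0, buf, 1); /* -EACCES on read-only media */
}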
2702 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2703 int nb_sectors, BdrvRequestFlags flags)
2705 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2706 BDRV_REQ_ZERO_WRITE | flags);
2710 * Completely zero out a block device with the help of bdrv_write_zeroes.
2711 * The operation is sped up by checking the block status and only writing
2712 * zeroes to the device if they currently do not return zeroes. Optional
2713 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2715 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2717 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2719     int64_t target_size = bdrv_getlength(bs);
2720     int64_t ret, nb_sectors, sector_num = 0;
2721     int n;
         /* Propagate -errno; dividing first would truncate it toward zero
          * and silently report success below. */
         if (target_size < 0) { return target_size; }
         target_size /= BDRV_SECTOR_SIZE;
2723 for (;;) {
2724 nb_sectors = target_size - sector_num;
2725 if (nb_sectors <= 0) {
2726 return 0;
2728 if (nb_sectors > INT_MAX) {
2729 nb_sectors = INT_MAX;
2731 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2732 if (ret < 0) {
2733 error_report("error getting block status at sector %" PRId64 ": %s",
2734 sector_num, strerror(-ret));
2735 return ret;
2737 if (ret & BDRV_BLOCK_ZERO) {
2738 sector_num += n;
2739 continue;
2741 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2742 if (ret < 0) {
2743 error_report("error writing zeroes at sector %" PRId64 ": %s",
2744 sector_num, strerror(-ret));
2745 return ret;
2747 sector_num += n;
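/*
 * Illustrative sketch: zero a whole device while allowing the driver
 * to unmap blocks instead of writing them.  Whether anything is
 * actually discarded depends on the driver and on the BDRV_O_UNMAP
 * open flag.
 */
static int example_blank_device(BlockDriverState *bs)
{
    return bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
}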
2751 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2753 QEMUIOVector qiov;
2754 struct iovec iov = {
2755 .iov_base = (void *)buf,
2756 .iov_len = bytes,
2758 int ret;
2760 if (bytes < 0) {
2761 return -EINVAL;
2764 qemu_iovec_init_external(&qiov, &iov, 1);
2765 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2766 if (ret < 0) {
2767 return ret;
2770 return bytes;
2773 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2775 int ret;
2777 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2778 if (ret < 0) {
2779 return ret;
2782 return qiov->size;
2785 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2786 const void *buf, int bytes)
2788 QEMUIOVector qiov;
2789 struct iovec iov = {
2790 .iov_base = (void *) buf,
2791 .iov_len = bytes,
2794 if (bytes < 0) {
2795 return -EINVAL;
2798 qemu_iovec_init_external(&qiov, &iov, 1);
2799 return bdrv_pwritev(bs, offset, &qiov);
2803 * Writes to the file and ensures that no writes are reordered across this
2804 * request (acts as a barrier)
2806 * Returns 0 on success, -errno in error cases.
2808 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2809 const void *buf, int count)
2811 int ret;
2813 ret = bdrv_pwrite(bs, offset, buf, count);
2814 if (ret < 0) {
2815 return ret;
2818 /* No flush needed for cache modes that already do it */
2819 if (bs->enable_write_cache) {
2820 bdrv_flush(bs);
2823 return 0;
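/*
 * Illustrative sketch: persist a small metadata update with barrier
 * semantics.  The 512-byte header at offset 0 is hypothetical; the
 * point is that later writes cannot be reordered ahead of this one.
 */
static int example_update_header(BlockDriverState *bs, const void *header)
{
    return bdrv_pwrite_sync(bs, 0, header, 512);
}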
2826 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2827 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2829 /* Perform I/O through a temporary buffer so that users who scribble over
2830 * their read buffer while the operation is in progress do not end up
2831 * modifying the image file. This is critical for zero-copy guest I/O
2832 * where anything might happen inside guest memory.
2834 void *bounce_buffer;
2836 BlockDriver *drv = bs->drv;
2837 struct iovec iov;
2838 QEMUIOVector bounce_qiov;
2839 int64_t cluster_sector_num;
2840 int cluster_nb_sectors;
2841 size_t skip_bytes;
2842 int ret;
2844 /* Cover entire cluster so no additional backing file I/O is required when
2845 * allocating cluster in the image file.
2847 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2848 &cluster_sector_num, &cluster_nb_sectors);
2850 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2851 cluster_sector_num, cluster_nb_sectors);
2853 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2854 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2855 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2857 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2858 &bounce_qiov);
2859 if (ret < 0) {
2860 goto err;
2863 if (drv->bdrv_co_write_zeroes &&
2864 buffer_is_zero(bounce_buffer, iov.iov_len)) {
2865 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2866 cluster_nb_sectors, 0);
2867 } else {
2868 /* This does not change the data on the disk, it is not necessary
2869 * to flush even in cache=writethrough mode.
2871 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2872 &bounce_qiov);
2875 if (ret < 0) {
2876 /* It might be okay to ignore write errors for guest requests. If this
2877 * is a deliberate copy-on-read then we don't want to ignore the error.
2878 * Simply report it in all cases.
2880 goto err;
2883 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2884 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2885 nb_sectors * BDRV_SECTOR_SIZE);
2887 err:
2888 qemu_vfree(bounce_buffer);
2889 return ret;
2893 * Forwards an already correctly aligned request to the BlockDriver. This
2894 * handles copy on read and zeroing after EOF; any other features must be
2895 * implemented by the caller.
2897 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2898 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2899 int64_t align, QEMUIOVector *qiov, int flags)
2901 BlockDriver *drv = bs->drv;
2902 int ret;
2904 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2905 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2907 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2908 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2910 /* Handle Copy on Read and associated serialisation */
2911 if (flags & BDRV_REQ_COPY_ON_READ) {
2912 /* If we touch the same cluster it counts as an overlap. This
2913 * guarantees that allocating writes will be serialized and not race
2914 * with each other for the same cluster. For example, in copy-on-read
2915 * it ensures that the CoR read and write operations are atomic and
2916 * guest writes cannot interleave between them. */
2917 mark_request_serialising(req, bdrv_get_cluster_size(bs));
2920 wait_serialising_requests(req);
2922 if (flags & BDRV_REQ_COPY_ON_READ) {
2923 int pnum;
2925 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2926 if (ret < 0) {
2927 goto out;
2930 if (!ret || pnum != nb_sectors) {
2931 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2932 goto out;
2936 /* Forward the request to the BlockDriver */
2937 if (!(bs->zero_beyond_eof && bs->growable)) {
2938 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2939 } else {
2940 /* Read zeros after EOF of growable BDSes */
2941 int64_t len, total_sectors, max_nb_sectors;
2943 len = bdrv_getlength(bs);
2944 if (len < 0) {
2945 ret = len;
2946 goto out;
2949 total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
2950 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
2951 align >> BDRV_SECTOR_BITS);
2952 if (max_nb_sectors > 0) {
2953 ret = drv->bdrv_co_readv(bs, sector_num,
2954 MIN(nb_sectors, max_nb_sectors), qiov);
2955 } else {
2956 ret = 0;
2959 /* Reading beyond end of file is supposed to produce zeroes */
2960 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2961 uint64_t offset = MAX(0, total_sectors - sector_num);
2962 uint64_t bytes = (sector_num + nb_sectors - offset) *
2963 BDRV_SECTOR_SIZE;
2964 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
2968 out:
2969 return ret;
2973 * Handle a read request in coroutine context
2975 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
2976 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
2977 BdrvRequestFlags flags)
2979 BlockDriver *drv = bs->drv;
2980 BdrvTrackedRequest req;
2982 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
2983 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
2984 uint8_t *head_buf = NULL;
2985 uint8_t *tail_buf = NULL;
2986 QEMUIOVector local_qiov;
2987 bool use_local_qiov = false;
2988 int ret;
2990 if (!drv) {
2991 return -ENOMEDIUM;
2993 if (bdrv_check_byte_request(bs, offset, bytes)) {
2994 return -EIO;
2997 if (bs->copy_on_read) {
2998 flags |= BDRV_REQ_COPY_ON_READ;
3001 /* throttling disk I/O */
3002 if (bs->io_limits_enabled) {
3003 bdrv_io_limits_intercept(bs, bytes, false);
3006 /* Align read if necessary by padding qiov */
3007 if (offset & (align - 1)) {
3008 head_buf = qemu_blockalign(bs, align);
3009 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3010 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3011 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3012 use_local_qiov = true;
3014 bytes += offset & (align - 1);
3015 offset = offset & ~(align - 1);
3018 if ((offset + bytes) & (align - 1)) {
3019 if (!use_local_qiov) {
3020 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3021 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3022 use_local_qiov = true;
3024 tail_buf = qemu_blockalign(bs, align);
3025 qemu_iovec_add(&local_qiov, tail_buf,
3026 align - ((offset + bytes) & (align - 1)));
3028 bytes = ROUND_UP(bytes, align);
3031 tracked_request_begin(&req, bs, offset, bytes, false);
3032 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3033 use_local_qiov ? &local_qiov : qiov,
3034 flags);
3035 tracked_request_end(&req);
3037 if (use_local_qiov) {
3038 qemu_iovec_destroy(&local_qiov);
3039 qemu_vfree(head_buf);
3040 qemu_vfree(tail_buf);
3043 return ret;
3046 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3047 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3048 BdrvRequestFlags flags)
3050 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3051 return -EINVAL;
3054 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3055 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3058 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3059 int nb_sectors, QEMUIOVector *qiov)
3061 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3063 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3066 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3067 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3069 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3071 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3072 BDRV_REQ_COPY_ON_READ);
3075 /* If no limit is specified in the BlockLimits, use a default
3076  * of 32768 512-byte sectors (16 MiB) per request.
3078 #define MAX_WRITE_ZEROES_DEFAULT 32768
3080 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3081 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3083 BlockDriver *drv = bs->drv;
3084 QEMUIOVector qiov;
3085 struct iovec iov = {0};
3086 int ret = 0;
3088 int max_write_zeroes = bs->bl.max_write_zeroes ?
3089 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3091 while (nb_sectors > 0 && !ret) {
3092 int num = nb_sectors;
3094 /* Align request. Block drivers can expect the "bulk" of the request
3095 * to be aligned.
3097 if (bs->bl.write_zeroes_alignment
3098 && num > bs->bl.write_zeroes_alignment) {
3099 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3100 /* Make a small request up to the first aligned sector. */
3101 num = bs->bl.write_zeroes_alignment;
3102 num -= sector_num % bs->bl.write_zeroes_alignment;
3103 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3104 /* Shorten the request to the last aligned sector. num cannot
3105 * underflow because num > bs->bl.write_zeroes_alignment.
3107 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3111 /* limit request size */
3112 if (num > max_write_zeroes) {
3113 num = max_write_zeroes;
3116 ret = -ENOTSUP;
3117 /* First try the efficient write zeroes operation */
3118 if (drv->bdrv_co_write_zeroes) {
3119 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3122 if (ret == -ENOTSUP) {
3123 /* Fall back to bounce buffer if write zeroes is unsupported */
3124 iov.iov_len = num * BDRV_SECTOR_SIZE;
3125 if (iov.iov_base == NULL) {
3126 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3127 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3129 qemu_iovec_init_external(&qiov, &iov, 1);
3131 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3133             /* Keep the bounce buffer around if it is big enough for
3134              * all future requests.
3136 if (num < max_write_zeroes) {
3137 qemu_vfree(iov.iov_base);
3138 iov.iov_base = NULL;
3142 sector_num += num;
3143 nb_sectors -= num;
3146 qemu_vfree(iov.iov_base);
3147 return ret;
3151 * Forwards an already correctly aligned write request to the BlockDriver.
3153 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3154 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3155 QEMUIOVector *qiov, int flags)
3157 BlockDriver *drv = bs->drv;
3158 bool waited;
3159 int ret;
3161 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3162 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3164 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3165 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3167 waited = wait_serialising_requests(req);
3168 assert(!waited || !req->serialising);
3169 assert(req->overlap_offset <= offset);
3170 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3172 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3174 if (ret < 0) {
3175 /* Do nothing, write notifier decided to fail this request */
3176 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3177 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3178 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3179 } else {
3180 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3181 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3183 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3185 if (ret == 0 && !bs->enable_write_cache) {
3186 ret = bdrv_co_flush(bs);
3189 bdrv_set_dirty(bs, sector_num, nb_sectors);
3191 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3192 bs->wr_highest_sector = sector_num + nb_sectors - 1;
3194 if (bs->growable && ret >= 0) {
3195 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3198 return ret;
3202 * Handle a write request in coroutine context
3204 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3205 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3206 BdrvRequestFlags flags)
3208 BdrvTrackedRequest req;
3209 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3210 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3211 uint8_t *head_buf = NULL;
3212 uint8_t *tail_buf = NULL;
3213 QEMUIOVector local_qiov;
3214 bool use_local_qiov = false;
3215 int ret;
3217 if (!bs->drv) {
3218 return -ENOMEDIUM;
3220 if (bs->read_only) {
3221 return -EACCES;
3223 if (bdrv_check_byte_request(bs, offset, bytes)) {
3224 return -EIO;
3227 /* throttling disk I/O */
3228 if (bs->io_limits_enabled) {
3229 bdrv_io_limits_intercept(bs, bytes, true);
3233 * Align write if necessary by performing a read-modify-write cycle.
3234 * Pad qiov with the read parts and be sure to have a tracked request not
3235 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3237 tracked_request_begin(&req, bs, offset, bytes, true);
3239 if (offset & (align - 1)) {
3240 QEMUIOVector head_qiov;
3241 struct iovec head_iov;
3243 mark_request_serialising(&req, align);
3244 wait_serialising_requests(&req);
3246 head_buf = qemu_blockalign(bs, align);
3247 head_iov = (struct iovec) {
3248 .iov_base = head_buf,
3249 .iov_len = align,
3251 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3253 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3254 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3255 align, &head_qiov, 0);
3256 if (ret < 0) {
3257 goto fail;
3259 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3261 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3262 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3263 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3264 use_local_qiov = true;
3266 bytes += offset & (align - 1);
3267 offset = offset & ~(align - 1);
3270 if ((offset + bytes) & (align - 1)) {
3271 QEMUIOVector tail_qiov;
3272 struct iovec tail_iov;
3273 size_t tail_bytes;
3274 bool waited;
3276 mark_request_serialising(&req, align);
3277 waited = wait_serialising_requests(&req);
3278 assert(!waited || !use_local_qiov);
3280 tail_buf = qemu_blockalign(bs, align);
3281 tail_iov = (struct iovec) {
3282 .iov_base = tail_buf,
3283 .iov_len = align,
3285 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3287 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3288 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3289 align, &tail_qiov, 0);
3290 if (ret < 0) {
3291 goto fail;
3293 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3295 if (!use_local_qiov) {
3296 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3297 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3298 use_local_qiov = true;
3301 tail_bytes = (offset + bytes) & (align - 1);
3302 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3304 bytes = ROUND_UP(bytes, align);
3307 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3308 use_local_qiov ? &local_qiov : qiov,
3309 flags);
3311 fail:
3312 tracked_request_end(&req);
3314 if (use_local_qiov) {
3315 qemu_iovec_destroy(&local_qiov);
3317 qemu_vfree(head_buf);
3318 qemu_vfree(tail_buf);
3320 return ret;
3323 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3324 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3325 BdrvRequestFlags flags)
3327 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3328 return -EINVAL;
3331 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3332 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3335 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3336 int nb_sectors, QEMUIOVector *qiov)
3338 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3340 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3343 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3344 int64_t sector_num, int nb_sectors,
3345 BdrvRequestFlags flags)
3347 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3349 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3350 flags &= ~BDRV_REQ_MAY_UNMAP;
3353 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3354 BDRV_REQ_ZERO_WRITE | flags);
3358 * Truncate file to 'offset' bytes (needed only for file protocols)
3360 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3362 BlockDriver *drv = bs->drv;
3363 int ret;
3364 if (!drv)
3365 return -ENOMEDIUM;
3366 if (!drv->bdrv_truncate)
3367 return -ENOTSUP;
3368 if (bs->read_only)
3369 return -EACCES;
3370 if (bdrv_in_use(bs))
3371 return -EBUSY;
3372 ret = drv->bdrv_truncate(bs, offset);
3373 if (ret == 0) {
3374 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3375 bdrv_dev_resize_cb(bs);
3377 return ret;
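/*
 * Illustrative sketch: grow an image by 1 MiB.  Only file protocols
 * implement bdrv_truncate, so callers must tolerate -ENOTSUP.
 */
static int example_grow_by_1m(BlockDriverState *bs)
{
    int64_t len = bdrv_getlength(bs);

    if (len < 0) {
        return len;
    }
    return bdrv_truncate(bs, len + 1024 * 1024);
}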
3381  * Length of an allocated file in bytes. Sparse files are counted by actual
3382 * allocated space. Return < 0 if error or unknown.
3384 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3386 BlockDriver *drv = bs->drv;
3387 if (!drv) {
3388 return -ENOMEDIUM;
3390 if (drv->bdrv_get_allocated_file_size) {
3391 return drv->bdrv_get_allocated_file_size(bs);
3393 if (bs->file) {
3394 return bdrv_get_allocated_file_size(bs->file);
3396 return -ENOTSUP;
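/*
 * Example (illustrative): for a sparse 10 GiB image with only 1 GiB of
 * data written, bdrv_getlength() reports 10 GiB while this function
 * reports roughly 1 GiB plus metadata, driver permitting.
 */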
3400 * Length of a file in bytes. Return < 0 if error or unknown.
3402 int64_t bdrv_getlength(BlockDriverState *bs)
3404 BlockDriver *drv = bs->drv;
3405 if (!drv)
3406 return -ENOMEDIUM;
3408 if (drv->has_variable_length) {
3409 int ret = refresh_total_sectors(bs, bs->total_sectors);
3410 if (ret < 0) {
3411 return ret;
3414 return bs->total_sectors * BDRV_SECTOR_SIZE;
3417 /* Return 0 as the number of sectors if no device is present or on error */
3418 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3420 int64_t length;
3421 length = bdrv_getlength(bs);
3422 if (length < 0)
3423 length = 0;
3424 else
3425 length = length >> BDRV_SECTOR_BITS;
3426 *nb_sectors_ptr = length;
3429 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3430 BlockdevOnError on_write_error)
3432 bs->on_read_error = on_read_error;
3433 bs->on_write_error = on_write_error;
3436 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3438 return is_read ? bs->on_read_error : bs->on_write_error;
3441 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3443 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3445 switch (on_err) {
3446 case BLOCKDEV_ON_ERROR_ENOSPC:
3447 return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3448 case BLOCKDEV_ON_ERROR_STOP:
3449 return BDRV_ACTION_STOP;
3450 case BLOCKDEV_ON_ERROR_REPORT:
3451 return BDRV_ACTION_REPORT;
3452 case BLOCKDEV_ON_ERROR_IGNORE:
3453 return BDRV_ACTION_IGNORE;
3454 default:
3455 abort();
3459 /* This is done by device models because, while the block layer knows
3460 * about the error, it does not know whether an operation comes from
3461 * the device or the block layer (from a job, for example).
3463 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3464 bool is_read, int error)
3466 assert(error >= 0);
3467 bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3468 if (action == BDRV_ACTION_STOP) {
3469 vm_stop(RUN_STATE_IO_ERROR);
3470 bdrv_iostatus_set_err(bs, error);
3474 int bdrv_is_read_only(BlockDriverState *bs)
3476 return bs->read_only;
3479 int bdrv_is_sg(BlockDriverState *bs)
3481 return bs->sg;
3484 int bdrv_enable_write_cache(BlockDriverState *bs)
3486 return bs->enable_write_cache;
3489 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3491 bs->enable_write_cache = wce;
3493 /* so a reopen() will preserve wce */
3494 if (wce) {
3495 bs->open_flags |= BDRV_O_CACHE_WB;
3496 } else {
3497 bs->open_flags &= ~BDRV_O_CACHE_WB;
3501 int bdrv_is_encrypted(BlockDriverState *bs)
3503 if (bs->backing_hd && bs->backing_hd->encrypted)
3504 return 1;
3505 return bs->encrypted;
3508 int bdrv_key_required(BlockDriverState *bs)
3510 BlockDriverState *backing_hd = bs->backing_hd;
3512 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3513 return 1;
3514 return (bs->encrypted && !bs->valid_key);
3517 int bdrv_set_key(BlockDriverState *bs, const char *key)
3519 int ret;
3520 if (bs->backing_hd && bs->backing_hd->encrypted) {
3521 ret = bdrv_set_key(bs->backing_hd, key);
3522 if (ret < 0)
3523 return ret;
3524 if (!bs->encrypted)
3525 return 0;
3527 if (!bs->encrypted) {
3528 return -EINVAL;
3529 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3530 return -ENOMEDIUM;
3532 ret = bs->drv->bdrv_set_key(bs, key);
3533 if (ret < 0) {
3534 bs->valid_key = 0;
3535 } else if (!bs->valid_key) {
3536 bs->valid_key = 1;
3537 /* call the change callback now, we skipped it on open */
3538 bdrv_dev_change_media_cb(bs, true);
3540 return ret;
3543 const char *bdrv_get_format_name(BlockDriverState *bs)
3545 return bs->drv ? bs->drv->format_name : NULL;
3548 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3549 void *opaque)
3551 BlockDriver *drv;
3553 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3554 it(opaque, drv->format_name);
3558 /* Find a block backend by its device name */
3559 BlockDriverState *bdrv_find(const char *name)
3561 BlockDriverState *bs;
3563 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3564 if (!strcmp(name, bs->device_name)) {
3565 return bs;
3568 return NULL;
3571 /* Find a node in the BDS graph by its node name */
3572 BlockDriverState *bdrv_find_node(const char *node_name)
3574 BlockDriverState *bs;
3576 assert(node_name);
3578 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3579 if (!strcmp(node_name, bs->node_name)) {
3580 return bs;
3583 return NULL;
3586 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3587 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3589 BlockDeviceInfoList *list, *entry;
3590 BlockDriverState *bs;
3592 list = NULL;
3593 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3594 entry = g_malloc0(sizeof(*entry));
3595 entry->value = bdrv_block_device_info(bs);
3596 entry->next = list;
3597 list = entry;
3600 return list;
3603 BlockDriverState *bdrv_lookup_bs(const char *device,
3604 const char *node_name,
3605 Error **errp)
3607 BlockDriverState *bs = NULL;
3609 if (device) {
3610 bs = bdrv_find(device);
3612 if (bs) {
3613 return bs;
3617 if (node_name) {
3618 bs = bdrv_find_node(node_name);
3620 if (bs) {
3621 return bs;
3625 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3626 device ? device : "",
3627 node_name ? node_name : "");
3628 return NULL;
3631 BlockDriverState *bdrv_next(BlockDriverState *bs)
3633 if (!bs) {
3634 return QTAILQ_FIRST(&bdrv_states);
3636 return QTAILQ_NEXT(bs, device_list);
3639 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3641 BlockDriverState *bs;
3643 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3644 it(opaque, bs);
3648 const char *bdrv_get_device_name(BlockDriverState *bs)
3650 return bs->device_name;
3653 int bdrv_get_flags(BlockDriverState *bs)
3655 return bs->open_flags;
3658 int bdrv_flush_all(void)
3660 BlockDriverState *bs;
3661 int result = 0;
3663 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3664 int ret = bdrv_flush(bs);
3665 if (ret < 0 && !result) {
3666 result = ret;
3670 return result;
3673 int bdrv_has_zero_init_1(BlockDriverState *bs)
3675 return 1;
3678 int bdrv_has_zero_init(BlockDriverState *bs)
3680 assert(bs->drv);
3682 /* If BS is a copy on write image, it is initialized to
3683 the contents of the base image, which may not be zeroes. */
3684 if (bs->backing_hd) {
3685 return 0;
3687 if (bs->drv->bdrv_has_zero_init) {
3688 return bs->drv->bdrv_has_zero_init(bs);
3691 /* safe default */
3692 return 0;
3695 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3697 BlockDriverInfo bdi;
3699 if (bs->backing_hd) {
3700 return false;
3703 if (bdrv_get_info(bs, &bdi) == 0) {
3704 return bdi.unallocated_blocks_are_zero;
3707 return false;
3710 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3712 BlockDriverInfo bdi;
3714 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3715 return false;
3718 if (bdrv_get_info(bs, &bdi) == 0) {
3719 return bdi.can_write_zeroes_with_unmap;
3722 return false;
3725 typedef struct BdrvCoGetBlockStatusData {
3726 BlockDriverState *bs;
3727 BlockDriverState *base;
3728 int64_t sector_num;
3729 int nb_sectors;
3730 int *pnum;
3731 int64_t ret;
3732 bool done;
3733 } BdrvCoGetBlockStatusData;
3736 * Returns true iff the specified sector is present in the disk image. Drivers
3737 * not implementing the functionality are assumed to not support backing files,
3738 * hence all their sectors are reported as allocated.
3740 * If 'sector_num' is beyond the end of the disk image the return value is 0
3741 * and 'pnum' is set to 0.
3743 * 'pnum' is set to the number of sectors (including and immediately following
3744 * the specified sector) that are known to be in the same
3745 * allocated/unallocated state.
3747 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3748 * beyond the end of the disk image it will be clamped.
3750 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3751 int64_t sector_num,
3752 int nb_sectors, int *pnum)
3754 int64_t length;
3755 int64_t n;
3756 int64_t ret, ret2;
3758 length = bdrv_getlength(bs);
3759 if (length < 0) {
3760 return length;
3763 if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3764 *pnum = 0;
3765 return 0;
3768 n = bs->total_sectors - sector_num;
3769 if (n < nb_sectors) {
3770 nb_sectors = n;
3773 if (!bs->drv->bdrv_co_get_block_status) {
3774 *pnum = nb_sectors;
3775 ret = BDRV_BLOCK_DATA;
3776 if (bs->drv->protocol_name) {
3777 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3779 return ret;
3782 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3783 if (ret < 0) {
3784 *pnum = 0;
3785 return ret;
3788 if (ret & BDRV_BLOCK_RAW) {
3789 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3790 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3791 *pnum, pnum);
3794 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3795 if (bdrv_unallocated_blocks_are_zero(bs)) {
3796 ret |= BDRV_BLOCK_ZERO;
3797 } else if (bs->backing_hd) {
3798 BlockDriverState *bs2 = bs->backing_hd;
3799 int64_t length2 = bdrv_getlength(bs2);
3800 if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3801 ret |= BDRV_BLOCK_ZERO;
3806 if (bs->file &&
3807 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3808 (ret & BDRV_BLOCK_OFFSET_VALID)) {
3809 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3810 *pnum, pnum);
3811 if (ret2 >= 0) {
3812             /* Ignore errors. This is just providing extra information;
3813              * it is useful but not necessary.
3815 ret |= (ret2 & BDRV_BLOCK_ZERO);
3819 return ret;
3822 /* Coroutine wrapper for bdrv_get_block_status() */
3823 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3825 BdrvCoGetBlockStatusData *data = opaque;
3826 BlockDriverState *bs = data->bs;
3828 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3829 data->pnum);
3830 data->done = true;
3834 * Synchronous wrapper around bdrv_co_get_block_status().
3836 * See bdrv_co_get_block_status() for details.
3838 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3839 int nb_sectors, int *pnum)
3841 Coroutine *co;
3842 BdrvCoGetBlockStatusData data = {
3843 .bs = bs,
3844 .sector_num = sector_num,
3845 .nb_sectors = nb_sectors,
3846 .pnum = pnum,
3847 .done = false,
3850 if (qemu_in_coroutine()) {
3851 /* Fast-path if already in coroutine context */
3852 bdrv_get_block_status_co_entry(&data);
3853 } else {
3854 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3855 qemu_coroutine_enter(co, &data);
3856 while (!data.done) {
3857 qemu_aio_wait();
3860 return data.ret;
3863 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3864 int nb_sectors, int *pnum)
3866 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3867 if (ret < 0) {
3868 return ret;
3870 return
3871 (ret & BDRV_BLOCK_DATA) ||
3872 ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
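/*
 * Illustrative sketch: walk an image's allocation map pnum sectors at
 * a time, the intended usage pattern for the status helpers above, and
 * count the sectors that carry data.
 */
static int64_t example_count_data_sectors(BlockDriverState *bs)
{
    int64_t total = bdrv_getlength(bs);
    int64_t sector = 0, data = 0, ret;
    int n;

    if (total < 0) {
        return total;
    }
    total /= BDRV_SECTOR_SIZE;
    while (sector < total) {
        ret = bdrv_get_block_status(bs, sector,
                                    MIN(total - sector, INT_MAX), &n);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_DATA) {
            data += n;
        }
        sector += n;
    }
    return data;
}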
3876 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3878 * Return true if the given sector is allocated in any image between
3879 * BASE and TOP (inclusive). BASE can be NULL to check if the given
3880 * sector is allocated in any image of the chain. Return false otherwise.
3882 * 'pnum' is set to the number of sectors (including and immediately following
3883 * the specified sector) that are known to be in the same
3884 * allocated/unallocated state.
3887 int bdrv_is_allocated_above(BlockDriverState *top,
3888 BlockDriverState *base,
3889 int64_t sector_num,
3890 int nb_sectors, int *pnum)
3892 BlockDriverState *intermediate;
3893 int ret, n = nb_sectors;
3895 intermediate = top;
3896 while (intermediate && intermediate != base) {
3897 int pnum_inter;
3898 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3899 &pnum_inter);
3900 if (ret < 0) {
3901 return ret;
3902 } else if (ret) {
3903 *pnum = pnum_inter;
3904 return 1;
3908 * [sector_num, nb_sectors] is unallocated on top but intermediate
3909 * might have
3911          * [sector_num+x, nb_sectors] allocated.
3913 if (n > pnum_inter &&
3914 (intermediate == top ||
3915 sector_num + pnum_inter < intermediate->total_sectors)) {
3916 n = pnum_inter;
3919 intermediate = intermediate->backing_hd;
3922 *pnum = n;
3923 return 0;
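/*
 * Example (illustrative): in the chain base <- mid <- top, calling
 * bdrv_is_allocated_above(top, base, s, n, &pnum) returns 1 when
 * sector s is allocated in top or mid, and 0 when both layers defer
 * to base for it.
 */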
3926 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
3928 if (bs->backing_hd && bs->backing_hd->encrypted)
3929 return bs->backing_file;
3930 else if (bs->encrypted)
3931 return bs->filename;
3932 else
3933 return NULL;
3936 void bdrv_get_backing_filename(BlockDriverState *bs,
3937 char *filename, int filename_size)
3939 pstrcpy(filename, filename_size, bs->backing_file);
3942 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
3943 const uint8_t *buf, int nb_sectors)
3945 BlockDriver *drv = bs->drv;
3946 if (!drv)
3947 return -ENOMEDIUM;
3948 if (!drv->bdrv_write_compressed)
3949 return -ENOTSUP;
3950 if (bdrv_check_request(bs, sector_num, nb_sectors))
3951 return -EIO;
3953 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
3955 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
3958 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3960 BlockDriver *drv = bs->drv;
3961 if (!drv)
3962 return -ENOMEDIUM;
3963 if (!drv->bdrv_get_info)
3964 return -ENOTSUP;
3965 memset(bdi, 0, sizeof(*bdi));
3966 return drv->bdrv_get_info(bs, bdi);
3969 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
3971 BlockDriver *drv = bs->drv;
3972 if (drv && drv->bdrv_get_specific_info) {
3973 return drv->bdrv_get_specific_info(bs);
3975 return NULL;
3978 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
3979 int64_t pos, int size)
3981 QEMUIOVector qiov;
3982 struct iovec iov = {
3983 .iov_base = (void *) buf,
3984 .iov_len = size,
3987 qemu_iovec_init_external(&qiov, &iov, 1);
3988 return bdrv_writev_vmstate(bs, &qiov, pos);
3991 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
3993 BlockDriver *drv = bs->drv;
3995 if (!drv) {
3996 return -ENOMEDIUM;
3997 } else if (drv->bdrv_save_vmstate) {
3998 return drv->bdrv_save_vmstate(bs, qiov, pos);
3999 } else if (bs->file) {
4000 return bdrv_writev_vmstate(bs->file, qiov, pos);
4003 return -ENOTSUP;
4006 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4007 int64_t pos, int size)
4009 BlockDriver *drv = bs->drv;
4010 if (!drv)
4011 return -ENOMEDIUM;
4012 if (drv->bdrv_load_vmstate)
4013 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4014 if (bs->file)
4015 return bdrv_load_vmstate(bs->file, buf, pos, size);
4016 return -ENOTSUP;
4019 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4021 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4022 return;
4025 bs->drv->bdrv_debug_event(bs, event);
4028 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4029 const char *tag)
4031 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4032 bs = bs->file;
4035 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4036 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4039 return -ENOTSUP;
4042 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4044 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4045 bs = bs->file;
4048 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4049 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4052 return -ENOTSUP;
4055 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4057 while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
4058 bs = bs->file;
4061 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4062 return bs->drv->bdrv_debug_resume(bs, tag);
4065 return -ENOTSUP;
4068 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4070 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4071 bs = bs->file;
4074 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4075 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4078 return false;
4081 int bdrv_is_snapshot(BlockDriverState *bs)
4083 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4086 /* backing_file can be relative, absolute, or a protocol.  If it is
4087 * relative, it must be relative to the chain. So, passing in bs->filename
4088 * from a BDS as backing_file should not be done, as that may be relative to
4089 * the CWD rather than the chain. */
4090 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4091 const char *backing_file)
4093 char *filename_full = NULL;
4094 char *backing_file_full = NULL;
4095 char *filename_tmp = NULL;
4096 int is_protocol = 0;
4097 BlockDriverState *curr_bs = NULL;
4098 BlockDriverState *retval = NULL;
4100 if (!bs || !bs->drv || !backing_file) {
4101 return NULL;
4104 filename_full = g_malloc(PATH_MAX);
4105 backing_file_full = g_malloc(PATH_MAX);
4106 filename_tmp = g_malloc(PATH_MAX);
4108 is_protocol = path_has_protocol(backing_file);
4110 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4112 /* If either of the filename paths is actually a protocol, then
4113 * compare unmodified paths; otherwise make paths relative */
4114 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4115 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4116 retval = curr_bs->backing_hd;
4117 break;
4119 } else {
4120 /* If not an absolute filename path, make it relative to the current
4121 * image's filename path */
4122 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4123 backing_file);
4125 /* We are going to compare absolute pathnames */
4126 if (!realpath(filename_tmp, filename_full)) {
4127 continue;
4130 /* We need to make sure the backing filename we are comparing against
4131 * is relative to the current image filename (or absolute) */
4132 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4133 curr_bs->backing_file);
4135 if (!realpath(filename_tmp, backing_file_full)) {
4136 continue;
4139 if (strcmp(backing_file_full, filename_full) == 0) {
4140 retval = curr_bs->backing_hd;
4141 break;
4146 g_free(filename_full);
4147 g_free(backing_file_full);
4148 g_free(filename_tmp);
4149 return retval;
4152 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4154 if (!bs->drv) {
4155 return 0;
4158 if (!bs->backing_hd) {
4159 return 0;
4162 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4165 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4167 BlockDriverState *curr_bs = NULL;
4169 if (!bs) {
4170 return NULL;
4173 curr_bs = bs;
4175 while (curr_bs->backing_hd) {
4176 curr_bs = curr_bs->backing_hd;
4178 return curr_bs;
4181 /**************************************************************/
4182 /* async I/Os */
4184 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4185 QEMUIOVector *qiov, int nb_sectors,
4186 BlockDriverCompletionFunc *cb, void *opaque)
4188 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4190 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4191 cb, opaque, false);
4194 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4195 QEMUIOVector *qiov, int nb_sectors,
4196 BlockDriverCompletionFunc *cb, void *opaque)
4198 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4200 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4201 cb, opaque, true);
4204 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4205 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4206 BlockDriverCompletionFunc *cb, void *opaque)
4208 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4210 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4211 BDRV_REQ_ZERO_WRITE | flags,
4212 cb, opaque, true);
4216 typedef struct MultiwriteCB {
4217 int error;
4218 int num_requests;
4219 int num_callbacks;
4220 struct {
4221 BlockDriverCompletionFunc *cb;
4222 void *opaque;
4223 QEMUIOVector *free_qiov;
4224 } callbacks[];
4225 } MultiwriteCB;
4227 static void multiwrite_user_cb(MultiwriteCB *mcb)
4229 int i;
4231 for (i = 0; i < mcb->num_callbacks; i++) {
4232 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4233 if (mcb->callbacks[i].free_qiov) {
4234 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4236 g_free(mcb->callbacks[i].free_qiov);
4240 static void multiwrite_cb(void *opaque, int ret)
4242 MultiwriteCB *mcb = opaque;
4244 trace_multiwrite_cb(mcb, ret);
4246 if (ret < 0 && !mcb->error) {
4247 mcb->error = ret;
4250 mcb->num_requests--;
4251 if (mcb->num_requests == 0) {
4252 multiwrite_user_cb(mcb);
4253 g_free(mcb);
4257 static int multiwrite_req_compare(const void *a, const void *b)
4259 const BlockRequest *req1 = a, *req2 = b;
4262 * Note that we can't simply subtract req2->sector from req1->sector
4263 * here as that could overflow the return value.
4265 if (req1->sector > req2->sector) {
4266 return 1;
4267 } else if (req1->sector < req2->sector) {
4268 return -1;
4269 } else {
4270 return 0;
4275 * Takes a bunch of requests and tries to merge them. Returns the number of
4276 * requests that remain after merging.
4278 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4279 int num_reqs, MultiwriteCB *mcb)
4281 int i, outidx;
4283 // Sort requests by start sector
4284 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4286 // Check if adjacent requests touch the same clusters. If so, combine them,
4287 // filling up gaps with zero sectors.
4288 outidx = 0;
4289 for (i = 1; i < num_reqs; i++) {
4290 int merge = 0;
4291 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4293 // Handle exactly sequential writes and overlapping writes.
4294 if (reqs[i].sector <= oldreq_last) {
4295 merge = 1;
4298 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4299 merge = 0;
4302 if (merge) {
4303 size_t size;
4304 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4305 qemu_iovec_init(qiov,
4306 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4308 // Add the first request to the merged one. If the requests are
4309 // overlapping, drop the last sectors of the first request.
4310 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4311 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4313         // We shouldn't need to add any zeros between the two requests
4314 assert (reqs[i].sector <= oldreq_last);
4316 // Add the second request
4317 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4319 reqs[outidx].nb_sectors = qiov->size >> 9;
4320 reqs[outidx].qiov = qiov;
4322 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4323 } else {
4324 outidx++;
4325 reqs[outidx].sector = reqs[i].sector;
4326 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4327 reqs[outidx].qiov = reqs[i].qiov;
4331 return outidx + 1;
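/*
 * Worked example (illustrative): writes for sectors [0, 8) and [4, 12)
 * are sorted, found to overlap, and merged into one request covering
 * [0, 12): the first 4 sectors come from the first qiov and the
 * remaining 8 from the second.
 */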
4335 * Submit multiple AIO write requests at once.
4337 * On success, the function returns 0 and all requests in the reqs array have
4338  * been submitted. In the error case this function returns -1, and any of the
4339 * requests may or may not be submitted yet. In particular, this means that the
4340 * callback will be called for some of the requests, for others it won't. The
4341 * caller must check the error field of the BlockRequest to wait for the right
4342 * callbacks (if error != 0, no callback will be called).
4344 * The implementation may modify the contents of the reqs array, e.g. to merge
4345 * requests. However, the fields opaque and error are left unmodified as they
4346 * are used to signal failure for a single request to the caller.
4348 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4350 MultiwriteCB *mcb;
4351 int i;
4353 /* don't submit writes if we don't have a medium */
4354 if (bs->drv == NULL) {
4355 for (i = 0; i < num_reqs; i++) {
4356 reqs[i].error = -ENOMEDIUM;
4358 return -1;
4361 if (num_reqs == 0) {
4362 return 0;
4365 // Create MultiwriteCB structure
4366 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4367 mcb->num_requests = 0;
4368 mcb->num_callbacks = num_reqs;
4370 for (i = 0; i < num_reqs; i++) {
4371 mcb->callbacks[i].cb = reqs[i].cb;
4372 mcb->callbacks[i].opaque = reqs[i].opaque;
4375 // Check for mergeable requests
4376 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4378 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4380 /* Run the aio requests. */
4381 mcb->num_requests = num_reqs;
4382 for (i = 0; i < num_reqs; i++) {
4383 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4384 reqs[i].nb_sectors, reqs[i].flags,
4385 multiwrite_cb, mcb,
4386 true);
4389 return 0;
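/*
 * Illustrative caller sketch (editorial, compiled out): batching two writes
 * through bdrv_aio_multiwrite. The function and callback names are
 * hypothetical; only the BlockRequest fields used above (sector, nb_sectors,
 * qiov, cb, opaque, error) are assumed.
 */
#if 0
static void example_write_cb(void *opaque, int ret)
{
    /* invoked once per original request unless reqs[i].error was set */
}

static void example_submit(BlockDriverState *bs, QEMUIOVector *qiov0,
                           QEMUIOVector *qiov1)
{
    BlockRequest reqs[2] = {
        { .sector = 0, .nb_sectors = 8, .qiov = qiov0,
          .cb = example_write_cb, .opaque = NULL },
        { .sector = 8, .nb_sectors = 8, .qiov = qiov1,
          .cb = example_write_cb, .opaque = NULL },
    };
    int i;

    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        for (i = 0; i < 2; i++) {
            if (reqs[i].error) {
                /* no callback will be delivered for this request */
            }
        }
    }
}
#endif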
4392 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4394 acb->aiocb_info->cancel(acb);
4397 /**************************************************************/
4398 /* async block device emulation */
4400 typedef struct BlockDriverAIOCBSync {
4401 BlockDriverAIOCB common;
4402 QEMUBH *bh;
4403 int ret;
4404 /* vector translation state */
4405 QEMUIOVector *qiov;
4406 uint8_t *bounce;
4407 int is_write;
4408 } BlockDriverAIOCBSync;
4410 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4412 BlockDriverAIOCBSync *acb =
4413 container_of(blockacb, BlockDriverAIOCBSync, common);
4414 qemu_bh_delete(acb->bh);
4415 acb->bh = NULL;
4416 qemu_aio_release(acb);
4419 static const AIOCBInfo bdrv_em_aiocb_info = {
4420 .aiocb_size = sizeof(BlockDriverAIOCBSync),
4421 .cancel = bdrv_aio_cancel_em,
4424 static void bdrv_aio_bh_cb(void *opaque)
4426 BlockDriverAIOCBSync *acb = opaque;
4428 if (!acb->is_write)
4429 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4430 qemu_vfree(acb->bounce);
4431 acb->common.cb(acb->common.opaque, acb->ret);
4432 qemu_bh_delete(acb->bh);
4433 acb->bh = NULL;
4434 qemu_aio_release(acb);
4437 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4438 int64_t sector_num,
4439 QEMUIOVector *qiov,
4440 int nb_sectors,
4441 BlockDriverCompletionFunc *cb,
4442 void *opaque,
4443 int is_write)
4446 BlockDriverAIOCBSync *acb;
4448 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4449 acb->is_write = is_write;
4450 acb->qiov = qiov;
4451 acb->bounce = qemu_blockalign(bs, qiov->size);
4452 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4454 if (is_write) {
4455 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4456 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4457 } else {
4458 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4461 qemu_bh_schedule(acb->bh);
4463 return &acb->common;
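/*
 * Editorial note: bs->drv->bdrv_read/bdrv_write above complete synchronously
 * before bdrv_aio_rw_vector returns, but the caller must receive the
 * BlockDriverAIOCB before its callback can legitimately run. Deferring the
 * completion to a bottom half preserves the asynchronous contract: the
 * callback always fires from the event loop, never from inside the
 * submission call itself.
 */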
4466 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4467 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4468 BlockDriverCompletionFunc *cb, void *opaque)
4470 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4473 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4474 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4475 BlockDriverCompletionFunc *cb, void *opaque)
4477 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4481 typedef struct BlockDriverAIOCBCoroutine {
4482 BlockDriverAIOCB common;
4483 BlockRequest req;
4484 bool is_write;
4485 bool *done;
4486 QEMUBH* bh;
4487 } BlockDriverAIOCBCoroutine;
4489 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4491 BlockDriverAIOCBCoroutine *acb =
4492 container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4493 bool done = false;
4495 acb->done = &done;
4496 while (!done) {
4497 qemu_aio_wait();
4501 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4502 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
4503 .cancel = bdrv_aio_co_cancel_em,
4506 static void bdrv_co_em_bh(void *opaque)
4508 BlockDriverAIOCBCoroutine *acb = opaque;
4510 acb->common.cb(acb->common.opaque, acb->req.error);
4512 if (acb->done) {
4513 *acb->done = true;
4516 qemu_bh_delete(acb->bh);
4517 qemu_aio_release(acb);
4520 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4521 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4523 BlockDriverAIOCBCoroutine *acb = opaque;
4524 BlockDriverState *bs = acb->common.bs;
4526 if (!acb->is_write) {
4527 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4528 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4529 } else {
4530 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4531 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4534 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4535 qemu_bh_schedule(acb->bh);
4538 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4539 int64_t sector_num,
4540 QEMUIOVector *qiov,
4541 int nb_sectors,
4542 BdrvRequestFlags flags,
4543 BlockDriverCompletionFunc *cb,
4544 void *opaque,
4545 bool is_write)
4547 Coroutine *co;
4548 BlockDriverAIOCBCoroutine *acb;
4550 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4551 acb->req.sector = sector_num;
4552 acb->req.nb_sectors = nb_sectors;
4553 acb->req.qiov = qiov;
4554 acb->req.flags = flags;
4555 acb->is_write = is_write;
4556 acb->done = NULL;
4558 co = qemu_coroutine_create(bdrv_co_do_rw);
4559 qemu_coroutine_enter(co, acb);
4561 return &acb->common;
4564 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4566 BlockDriverAIOCBCoroutine *acb = opaque;
4567 BlockDriverState *bs = acb->common.bs;
4569 acb->req.error = bdrv_co_flush(bs);
4570 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4571 qemu_bh_schedule(acb->bh);
4574 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4575 BlockDriverCompletionFunc *cb, void *opaque)
4577 trace_bdrv_aio_flush(bs, opaque);
4579 Coroutine *co;
4580 BlockDriverAIOCBCoroutine *acb;
4582 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4583 acb->done = NULL;
4585 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4586 qemu_coroutine_enter(co, acb);
4588 return &acb->common;
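/*
 * Illustrative usage (editorial, compiled out): issuing an asynchronous
 * flush and reacting to its completion. The function and callback names are
 * hypothetical.
 */
#if 0
static void example_flush_cb(void *opaque, int ret)
{
    if (ret < 0) {
        /* flush failed; e.g. report the error to the guest */
    }
}

static void example_flush(BlockDriverState *bs)
{
    bdrv_aio_flush(bs, example_flush_cb, NULL);
}
#endif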
4591 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4593 BlockDriverAIOCBCoroutine *acb = opaque;
4594 BlockDriverState *bs = acb->common.bs;
4596 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4597 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4598 qemu_bh_schedule(acb->bh);
4601 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4602 int64_t sector_num, int nb_sectors,
4603 BlockDriverCompletionFunc *cb, void *opaque)
4605 Coroutine *co;
4606 BlockDriverAIOCBCoroutine *acb;
4608 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4610 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4611 acb->req.sector = sector_num;
4612 acb->req.nb_sectors = nb_sectors;
4613 acb->done = NULL;
4614 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4615 qemu_coroutine_enter(co, acb);
4617 return &acb->common;
4620 void bdrv_init(void)
4622 module_call_init(MODULE_INIT_BLOCK);
4625 void bdrv_init_with_whitelist(void)
4627 use_bdrv_whitelist = 1;
4628 bdrv_init();
4631 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4632 BlockDriverCompletionFunc *cb, void *opaque)
4634 BlockDriverAIOCB *acb;
4636 acb = g_slice_alloc(aiocb_info->aiocb_size);
4637 acb->aiocb_info = aiocb_info;
4638 acb->bs = bs;
4639 acb->cb = cb;
4640 acb->opaque = opaque;
4641 return acb;
4644 void qemu_aio_release(void *p)
4646 BlockDriverAIOCB *acb = p;
4647 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4650 /**************************************************************/
4651 /* Coroutine block device emulation */
4653 typedef struct CoroutineIOCompletion {
4654 Coroutine *coroutine;
4655 int ret;
4656 } CoroutineIOCompletion;
4658 static void bdrv_co_io_em_complete(void *opaque, int ret)
4660 CoroutineIOCompletion *co = opaque;
4662 co->ret = ret;
4663 qemu_coroutine_enter(co->coroutine, NULL);
4666 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4667 int nb_sectors, QEMUIOVector *iov,
4668 bool is_write)
4670 CoroutineIOCompletion co = {
4671 .coroutine = qemu_coroutine_self(),
4673 BlockDriverAIOCB *acb;
4675 if (is_write) {
4676 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4677 bdrv_co_io_em_complete, &co);
4678 } else {
4679 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4680 bdrv_co_io_em_complete, &co);
4683 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4684 if (!acb) {
4685 return -EIO;
4687 qemu_coroutine_yield();
4689 return co.ret;
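/*
 * Editorial note on the pattern above: the coroutine records itself in a
 * CoroutineIOCompletion, submits the request with bdrv_co_io_em_complete as
 * the completion callback, and yields. When the request finishes, the
 * callback stores ret and re-enters the coroutine, which resumes right
 * after qemu_coroutine_yield() and returns co.ret. This is the generic
 * bridge from callback-style AIO to coroutine-style code; the same shape
 * recurs below in bdrv_co_flush and bdrv_co_discard.
 */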
4692 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4693 int64_t sector_num, int nb_sectors,
4694 QEMUIOVector *iov)
4696 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4699 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4700 int64_t sector_num, int nb_sectors,
4701 QEMUIOVector *iov)
4703 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4706 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4708 RwCo *rwco = opaque;
4710 rwco->ret = bdrv_co_flush(rwco->bs);
4713 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4715 int ret;
4717 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4718 return 0;
4721 /* Write back cached data to the OS even with cache=unsafe */
4722 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4723 if (bs->drv->bdrv_co_flush_to_os) {
4724 ret = bs->drv->bdrv_co_flush_to_os(bs);
4725 if (ret < 0) {
4726 return ret;
4730 /* But don't actually force it to the disk with cache=unsafe */
4731 if (bs->open_flags & BDRV_O_NO_FLUSH) {
4732 goto flush_parent;
4735 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4736 if (bs->drv->bdrv_co_flush_to_disk) {
4737 ret = bs->drv->bdrv_co_flush_to_disk(bs);
4738 } else if (bs->drv->bdrv_aio_flush) {
4739 BlockDriverAIOCB *acb;
4740 CoroutineIOCompletion co = {
4741 .coroutine = qemu_coroutine_self(),
4744 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4745 if (acb == NULL) {
4746 ret = -EIO;
4747 } else {
4748 qemu_coroutine_yield();
4749 ret = co.ret;
4751 } else {
4753 * Some block drivers always operate in either writethrough or unsafe
4754 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4755 * know how the server works (because the behaviour is hardcoded or
4756 * depends on server-side configuration), so we can't ensure that
4757 * everything is safe on disk. Returning an error doesn't work because
4758 * that would break guests even when the server does operate in
4759 * writethrough mode.
4761 * Let's hope the user knows what they're doing.
4763 ret = 0;
4765 if (ret < 0) {
4766 return ret;
4769 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4770 * in the case of cache=unsafe, so there are no useless flushes.
4772 flush_parent:
4773 return bdrv_co_flush(bs->file);
4776 void bdrv_invalidate_cache(BlockDriverState *bs)
4778 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
4779 bs->drv->bdrv_invalidate_cache(bs);
4783 void bdrv_invalidate_cache_all(void)
4785 BlockDriverState *bs;
4787 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4788 bdrv_invalidate_cache(bs);
4792 void bdrv_clear_incoming_migration_all(void)
4794 BlockDriverState *bs;
4796 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4797 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4801 int bdrv_flush(BlockDriverState *bs)
4803 Coroutine *co;
4804 RwCo rwco = {
4805 .bs = bs,
4806 .ret = NOT_DONE,
4809 if (qemu_in_coroutine()) {
4810 /* Fast-path if already in coroutine context */
4811 bdrv_flush_co_entry(&rwco);
4812 } else {
4813 co = qemu_coroutine_create(bdrv_flush_co_entry);
4814 qemu_coroutine_enter(co, &rwco);
4815 while (rwco.ret == NOT_DONE) {
4816 qemu_aio_wait();
4820 return rwco.ret;
4823 typedef struct DiscardCo {
4824 BlockDriverState *bs;
4825 int64_t sector_num;
4826 int nb_sectors;
4827 int ret;
4828 } DiscardCo;
4829 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4831 DiscardCo *rwco = opaque;
4833 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4836 /* If no limit is specified in the BlockLimits, use a default
4837 * of 32768 512-byte sectors (16 MiB) per request.
4839 #define MAX_DISCARD_DEFAULT 32768
4841 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4842 int nb_sectors)
4844 int max_discard;
4846 if (!bs->drv) {
4847 return -ENOMEDIUM;
4848 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4849 return -EIO;
4850 } else if (bs->read_only) {
4851 return -EROFS;
4854 bdrv_reset_dirty(bs, sector_num, nb_sectors);
4856 /* Do nothing if disabled. */
4857 if (!(bs->open_flags & BDRV_O_UNMAP)) {
4858 return 0;
4861 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4862 return 0;
4865 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4866 while (nb_sectors > 0) {
4867 int ret;
4868 int num = nb_sectors;
4870 /* align request */
4871 if (bs->bl.discard_alignment &&
4872 num >= bs->bl.discard_alignment &&
4873 sector_num % bs->bl.discard_alignment) {
4874 if (num > bs->bl.discard_alignment) {
4875 num = bs->bl.discard_alignment;
4877 num -= sector_num % bs->bl.discard_alignment;
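/* Worked example (editorial): with discard_alignment == 8, sector_num == 5
 * and nb_sectors == 100, num is first capped to 8 and then reduced by
 * 5 % 8 == 5, so the first chunk discards sectors 5..7 and the next
 * iteration starts at the aligned sector 8. */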
4880 /* limit request size */
4881 if (num > max_discard) {
4882 num = max_discard;
4885 if (bs->drv->bdrv_co_discard) {
4886 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
4887 } else {
4888 BlockDriverAIOCB *acb;
4889 CoroutineIOCompletion co = {
4890 .coroutine = qemu_coroutine_self(),
4893 acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
4894 bdrv_co_io_em_complete, &co);
4895 if (acb == NULL) {
4896 return -EIO;
4897 } else {
4898 qemu_coroutine_yield();
4899 ret = co.ret;
4902 if (ret && ret != -ENOTSUP) {
4903 return ret;
4906 sector_num += num;
4907 nb_sectors -= num;
4909 return 0;
4912 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
4914 Coroutine *co;
4915 DiscardCo rwco = {
4916 .bs = bs,
4917 .sector_num = sector_num,
4918 .nb_sectors = nb_sectors,
4919 .ret = NOT_DONE,
4922 if (qemu_in_coroutine()) {
4923 /* Fast-path if already in coroutine context */
4924 bdrv_discard_co_entry(&rwco);
4925 } else {
4926 co = qemu_coroutine_create(bdrv_discard_co_entry);
4927 qemu_coroutine_enter(co, &rwco);
4928 while (rwco.ret == NOT_DONE) {
4929 qemu_aio_wait();
4933 return rwco.ret;
4936 /**************************************************************/
4937 /* removable device support */
4940 * Return TRUE if the media is present
4942 int bdrv_is_inserted(BlockDriverState *bs)
4944 BlockDriver *drv = bs->drv;
4946 if (!drv)
4947 return 0;
4948 if (!drv->bdrv_is_inserted)
4949 return 1;
4950 return drv->bdrv_is_inserted(bs);
4954 * Return whether the media changed since the last call to this
4955 * function, or -ENOTSUP if we don't know. Most drivers don't know.
4957 int bdrv_media_changed(BlockDriverState *bs)
4959 BlockDriver *drv = bs->drv;
4961 if (drv && drv->bdrv_media_changed) {
4962 return drv->bdrv_media_changed(bs);
4964 return -ENOTSUP;
4968 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4970 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4972 BlockDriver *drv = bs->drv;
4974 if (drv && drv->bdrv_eject) {
4975 drv->bdrv_eject(bs, eject_flag);
4978 if (bs->device_name[0] != '\0') {
4979 bdrv_emit_qmp_eject_event(bs, eject_flag);
4984 * Lock or unlock the media (if it is locked, the user won't be able
4985 * to eject it manually).
4987 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4989 BlockDriver *drv = bs->drv;
4991 trace_bdrv_lock_medium(bs, locked);
4993 if (drv && drv->bdrv_lock_medium) {
4994 drv->bdrv_lock_medium(bs, locked);
4998 /* needed for generic scsi interface */
5000 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5002 BlockDriver *drv = bs->drv;
5004 if (drv && drv->bdrv_ioctl)
5005 return drv->bdrv_ioctl(bs, req, buf);
5006 return -ENOTSUP;
5009 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5010 unsigned long int req, void *buf,
5011 BlockDriverCompletionFunc *cb, void *opaque)
5013 BlockDriver *drv = bs->drv;
5015 if (drv && drv->bdrv_aio_ioctl)
5016 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5017 return NULL;
5020 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5022 bs->guest_block_size = align;
5025 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5027 return qemu_memalign(bdrv_opt_mem_align(bs), size);
5031 * Check if all memory in this vector is sector aligned.
5033 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5035 int i;
5036 size_t alignment = bdrv_opt_mem_align(bs);
5038 for (i = 0; i < qiov->niov; i++) {
5039 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5040 return false;
5042 if (qiov->iov[i].iov_len % alignment) {
5043 return false;
5047 return true;
5050 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
5052 int64_t bitmap_size;
5053 BdrvDirtyBitmap *bitmap;
5055 assert((granularity & (granularity - 1)) == 0);
5057 granularity >>= BDRV_SECTOR_BITS;
5058 assert(granularity);
5059 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
5060 bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
5061 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5062 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5063 return bitmap;
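/*
 * Worked example (editorial): a granularity of 65536 bytes passes the
 * power-of-two assertion, becomes 65536 >> BDRV_SECTOR_BITS == 128 sectors,
 * and ffs(128) - 1 == 7, so the HBitmap tracks dirtiness at a resolution of
 * 2^7 == 128 sectors (64 KiB) per bit.
 */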
5066 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5068 BdrvDirtyBitmap *bm, *next;
5069 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5070 if (bm == bitmap) {
5071 QLIST_REMOVE(bitmap, list);
5072 hbitmap_free(bitmap->bitmap);
5073 g_free(bitmap);
5074 return;
5079 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5081 BdrvDirtyBitmap *bm;
5082 BlockDirtyInfoList *list = NULL;
5083 BlockDirtyInfoList **plist = &list;
5085 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5086 BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5087 BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5088 info->count = bdrv_get_dirty_count(bs, bm);
5089 info->granularity =
5090 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5091 entry->value = info;
5092 *plist = entry;
5093 plist = &entry->next;
5096 return list;
5099 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5101 if (bitmap) {
5102 return hbitmap_get(bitmap->bitmap, sector);
5103 } else {
5104 return 0;
5108 void bdrv_dirty_iter_init(BlockDriverState *bs,
5109 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5111 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5114 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5115 int nr_sectors)
5117 BdrvDirtyBitmap *bitmap;
5118 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5119 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5123 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5125 BdrvDirtyBitmap *bitmap;
5126 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5127 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5131 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5133 return hbitmap_count(bitmap->bitmap);
5136 /* Get a reference to bs */
5137 void bdrv_ref(BlockDriverState *bs)
5139 bs->refcnt++;
5142 /* Release a previously grabbed reference to bs.
5143 * If the reference count drops to zero, the BlockDriverState is
5144 * deleted. */
5145 void bdrv_unref(BlockDriverState *bs)
5147 assert(bs->refcnt > 0);
5148 if (--bs->refcnt == 0) {
5149 bdrv_delete(bs);
5153 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
5155 assert(bs->in_use != in_use);
5156 bs->in_use = in_use;
5159 int bdrv_in_use(BlockDriverState *bs)
5161 return bs->in_use;
5164 void bdrv_iostatus_enable(BlockDriverState *bs)
5166 bs->iostatus_enabled = true;
5167 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5170 /* The I/O status is only enabled if the drive explicitly
5171 * enables it _and_ the VM is configured to stop on errors */
5172 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5174 return (bs->iostatus_enabled &&
5175 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5176 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5177 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5180 void bdrv_iostatus_disable(BlockDriverState *bs)
5182 bs->iostatus_enabled = false;
5185 void bdrv_iostatus_reset(BlockDriverState *bs)
5187 if (bdrv_iostatus_is_enabled(bs)) {
5188 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5189 if (bs->job) {
5190 block_job_iostatus_reset(bs->job);
5195 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5197 assert(bdrv_iostatus_is_enabled(bs));
5198 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5199 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5200 BLOCK_DEVICE_IO_STATUS_FAILED;
5204 void
5205 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5206 enum BlockAcctType type)
5208 assert(type < BDRV_MAX_IOTYPE);
5210 cookie->bytes = bytes;
5211 cookie->start_time_ns = get_clock();
5212 cookie->type = type;
5215 void
5216 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5218 assert(cookie->type < BDRV_MAX_IOTYPE);
5220 bs->nr_bytes[cookie->type] += cookie->bytes;
5221 bs->nr_ops[cookie->type]++;
5222 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
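/*
 * Illustrative usage (editorial, compiled out): a device model brackets an
 * I/O operation with an accounting cookie so bytes, ops and latency are
 * recorded per type. The function name and sector count are hypothetical.
 */
#if 0
static void example_accounted_read(BlockDriverState *bs, uint8_t *buf)
{
    BlockAcctCookie cookie;
    int nb_sectors = 8;

    bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
                    BDRV_ACCT_READ);
    bdrv_read(bs, 0, buf, nb_sectors);
    bdrv_acct_done(bs, &cookie);
}
#endif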
5225 void bdrv_img_create(const char *filename, const char *fmt,
5226 const char *base_filename, const char *base_fmt,
5227 char *options, uint64_t img_size, int flags,
5228 Error **errp, bool quiet)
5230 QEMUOptionParameter *param = NULL, *create_options = NULL;
5231 QEMUOptionParameter *backing_fmt, *backing_file, *size;
5232 BlockDriver *drv, *proto_drv;
5233 BlockDriver *backing_drv = NULL;
5234 Error *local_err = NULL;
5235 int ret = 0;
5237 /* Find driver and parse its options */
5238 drv = bdrv_find_format(fmt);
5239 if (!drv) {
5240 error_setg(errp, "Unknown file format '%s'", fmt);
5241 return;
5244 proto_drv = bdrv_find_protocol(filename, true);
5245 if (!proto_drv) {
5246 error_setg(errp, "Unknown protocol '%s'", filename);
5247 return;
5250 create_options = append_option_parameters(create_options,
5251 drv->create_options);
5252 create_options = append_option_parameters(create_options,
5253 proto_drv->create_options);
5255 /* Create parameter list with default values */
5256 param = parse_option_parameters("", create_options, param);
5258 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5260 /* Parse -o options */
5261 if (options) {
5262 param = parse_option_parameters(options, create_options, param);
5263 if (param == NULL) {
5264 error_setg(errp, "Invalid options for file format '%s'.", fmt);
5265 goto out;
5269 if (base_filename) {
5270 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5271 base_filename)) {
5272 error_setg(errp, "Backing file not supported for file format '%s'",
5273 fmt);
5274 goto out;
5278 if (base_fmt) {
5279 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5280 error_setg(errp, "Backing file format not supported for file "
5281 "format '%s'", fmt);
5282 goto out;
5286 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5287 if (backing_file && backing_file->value.s) {
5288 if (!strcmp(filename, backing_file->value.s)) {
5289 error_setg(errp, "Error: Trying to create an image with the "
5290 "same filename as the backing file");
5291 goto out;
5295 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5296 if (backing_fmt && backing_fmt->value.s) {
5297 backing_drv = bdrv_find_format(backing_fmt->value.s);
5298 if (!backing_drv) {
5299 error_setg(errp, "Unknown backing file format '%s'",
5300 backing_fmt->value.s);
5301 goto out;
5305 // The size for the image must always be specified, with one exception:
5306 // If we are using a backing file, we can obtain the size from there
5307 size = get_option_parameter(param, BLOCK_OPT_SIZE);
5308 if (size && size->value.n == -1) {
5309 if (backing_file && backing_file->value.s) {
5310 BlockDriverState *bs;
5311 uint64_t size;
5312 char buf[32];
5313 int back_flags;
5315 /* backing files always opened read-only */
5316 back_flags =
5317 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5319 bs = NULL;
5320 ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
5321 backing_drv, &local_err);
5322 if (ret < 0) {
5323 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5324 backing_file->value.s,
5325 error_get_pretty(local_err));
5326 error_free(local_err);
5327 local_err = NULL;
5328 goto out;
5330 bdrv_get_geometry(bs, &size);
5331 size *= 512;
5333 snprintf(buf, sizeof(buf), "%" PRId64, size);
5334 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
5336 bdrv_unref(bs);
5337 } else {
5338 error_setg(errp, "Image creation needs a size parameter");
5339 goto out;
5343 if (!quiet) {
5344 printf("Formatting '%s', fmt=%s ", filename, fmt);
5345 print_option_parameters(param);
5346 puts("");
5348 ret = bdrv_create(drv, filename, param, &local_err);
5349 if (ret == -EFBIG) {
5350 /* This is generally a better message than whatever the driver would
5351 * deliver (especially because of the cluster_size_hint), since that
5352 * is most probably not much different from "image too large". */
5353 const char *cluster_size_hint = "";
5354 if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5355 cluster_size_hint = " (try using a larger cluster size)";
5357 error_setg(errp, "The image size is too large for file format '%s'"
5358 "%s", fmt, cluster_size_hint);
5359 error_free(local_err);
5360 local_err = NULL;
5363 out:
5364 free_option_parameters(create_options);
5365 free_option_parameters(param);
5367 if (local_err) {
5368 error_propagate(errp, local_err);
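/*
 * Illustrative usage (editorial, compiled out): creating a 1 GiB qcow2
 * image, roughly what "qemu-img create -f qcow2 test.qcow2 1G" boils down
 * to. The filename is hypothetical.
 */
#if 0
static void example_img_create(void)
{
    Error *err = NULL;

    bdrv_img_create("test.qcow2", "qcow2", NULL, NULL, NULL,
                    1024 * 1024 * 1024, 0, &err, false);
    if (err) {
        fprintf(stderr, "%s\n", error_get_pretty(err));
        error_free(err);
    }
}
#endif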
5372 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5374 /* Currently BlockDriverState always uses the main loop AioContext */
5375 return qemu_get_aio_context();
5378 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5379 NotifierWithReturn *notifier)
5381 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5384 int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5386 if (bs->drv->bdrv_amend_options == NULL) {
5387 return -ENOTSUP;
5389 return bs->drv->bdrv_amend_options(bs, options);
5392 /* Used to recurse on single-child block filters.
5393 * Single-child block filters store their child in bs->file.
5395 bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
5396 BlockDriverState *candidate)
5398 if (!bs->drv) {
5399 return false;
5402 if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
5403 if (bs == candidate) {
5404 return true;
5405 } else {
5406 return false;
5410 if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
5411 return false;
5414 if (!bs->file) {
5415 return false;
5418 return bdrv_recurse_is_first_non_filter(bs->file, candidate);
5421 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5422 BlockDriverState *candidate)
5424 if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
5425 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5428 return bdrv_generic_is_first_non_filter(bs, candidate);
5431 /* This function checks if the candidate is the first non-filter bs down its
5432 * bs chain. Since we don't have pointers to parents, it explores all bs
5433 * chains from the top. Some filters can choose not to pass down the recursion.
5435 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5437 BlockDriverState *bs;
5439 /* walk down the bs forest recursively */
5440 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5441 bool perm;
5443 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5445 /* candidate is the first non filter */
5446 if (perm) {
5447 return true;
5451 return false;
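/*
 * Editorial example: consider a chain filter -> qcow2 -> file where the top
 * node advertises both BS_IS_A_FILTER and BS_FILTER_PASS_DOWN. The walk
 * above enters at the filter, which forwards the recursion through
 * bs->file; qcow2 is not a filter, so it is the first non-filter node and
 * the only one that can match the candidate. A filter that clears
 * BS_FILTER_PASS_DOWN stops the recursion instead, so nothing below it can
 * be the candidate.
 */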