block: Reuse reference handling from bdrv_open()
[qemu/ar7.git] / block.c
/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif
struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif
/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* This function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already queued,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
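/*
 * Example (illustrative sketch, not from the original file): a caller that
 * wants to throttle a device enables limits first and then applies a
 * ThrottleConfig. The bucket/field names follow the throttle.h API assumed
 * by this file; the values are made up.
 *
 *     ThrottleConfig cfg;
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024;  // ~10 MB/s
 *     bdrv_io_limits_enable(bs);      // must precede bdrv_set_io_limits()
 *     bdrv_set_io_limits(bs, &cfg);
 */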
size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}
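/*
 * Example (illustrative): given a base of "/vm/base.qcow2" and a relative
 * backing name "snap.qcow2", path_combine() yields "/vm/snap.qcow2". A
 * protocol-prefixed backing file such as "nbd://host/export" is copied
 * through unchanged by bdrv_get_full_backing_filename() because
 * path_has_protocol() matches it.
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest), "/vm/base.qcow2", "snap.qcow2");
 *     // dest now holds "/vm/snap.qcow2"
 */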
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}
BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}
typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
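/*
 * Example (illustrative sketch, error handling abbreviated): creating a
 * 1 GiB qcow2 image through this interface, using the same option-parameter
 * helpers that the snapshot code in bdrv_open() uses below.
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *opts =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);
 *     ret = bdrv_create(drv, "/tmp/test.qcow2", opts, &errp);
 *     free_option_parameters(opts);
 */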
int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
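/*
 * Example (illustrative): this is exactly how the snapshot=on path in
 * bdrv_open() below obtains a name for its temporary qcow2 overlay.
 *
 *     char tmp[PATH_MAX];
 *     if (get_tmp_filename(tmp, sizeof(tmp)) == 0) {
 *         // tmp names an empty, unique file, e.g. "/tmp/vl.Ab3xZ9"
 *     }
 */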
/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}
/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
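/*
 * Example (illustrative): this is the translation behind -drive cache=...
 * "none" bypasses the host page cache but keeps guest-controlled writeback;
 * "writethrough" leaves both bits clear (the default).
 *
 *     int flags = 0;
 *     if (bdrv_parse_cache_flags("none", &flags) == 0) {
 *         // flags == BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *     }
 */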
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
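/*
 * Example (illustrative): because copy_on_read is a reference count, the
 * enable/disable calls must be paired, and two independent users can
 * overlap safely.
 *
 *     bdrv_enable_copy_on_read(bs);   // user A
 *     bdrv_enable_copy_on_read(bs);   // user B
 *     bdrv_disable_copy_on_read(bs);  // A done; still enabled for B
 *     bdrv_disable_copy_on_read(bs);  // B done; now disabled
 */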
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}
static int bdrv_assign_node_name(BlockDriverState *bs,
                                 const char *node_name,
                                 Error **errp)
{
    if (!node_name) {
        return 0;
    }

    /* empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return -EINVAL;
    }

    /* takes care of avoiding namespace collisions */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return -EINVAL;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return -EINVAL;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);

    return 0;
}
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    ret = bdrv_assign_node_name(bs, node_name, errp);
    if (ret < 0) {
        return ret;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                   ? "Driver '%s' can only be used for read-only devices"
                   : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert(bs->request_alignment != 0);

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
static int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                          QDict *options, int flags, Error **errp)
{
    BlockDriverState *bs = NULL;
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    Error *local_err = NULL;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(options, "filename");
    } else if (filename && !qdict_haskey(options, "filename")) {
        qdict_put(options, "filename", qstring_from_str(filename));
        allow_protocol_prefix = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(options, "filename");
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(&bs, filename, NULL, options, flags, drv, &local_err);
        options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
                   drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_unref(bs);
    return ret;
}
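/*
 * Example (illustrative): since the reference to @options is consumed even
 * on failure, a caller that wants to keep using its dictionary must take an
 * extra reference first, as the doc comment above requires.
 *
 *     QINCREF(options);                    // keep our own reference
 *     ret = bdrv_file_open(&bs, NULL, options, flags, &err);
 *     // options remains valid here thanks to the QINCREF
 */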
/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        return ret;
    }

    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

    return 0;
}
/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If force_raw is true, bdrv_file_open() will be used, thereby preventing any
 * image format auto-detection. If it is false and a filename is given,
 * bdrv_open() will be used for auto-detection.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool force_raw, bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    if (filename && !force_raw) {
        /* If a filename is given and the block driver should be detected
           automatically (instead of using none), use bdrv_open() in order to do
           that auto-detection. */
        if (reference) {
            error_setg(errp, "Cannot reference an existing block device while "
                       "giving a filename");
            ret = -EINVAL;
            goto done;
        }

        ret = bdrv_open(pbs, filename, NULL, image_options, flags, NULL, errp);
    } else {
        ret = bdrv_open(pbs, filename, reference, image_options,
                        flags | BDRV_O_PROTOCOL, NULL, errp);
    }

done:
    qdict_del(options, bdref_key);
    return ret;
}
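/*
 * Example (illustrative): with bdref_key = "file", a flattened options
 * QDict such as
 *
 *     { "file.driver": "file", "file.filename": "/tmp/test.img" }
 *
 * is split by qdict_extract_subqdict() into image_options of
 * { "driver": "file", "filename": "/tmp/test.img" }, while a plain string
 * reference like { "file": "my-drive" } makes bdrv_open() reuse the
 * existing device of that name instead of opening a new image.
 */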
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (flags & BDRV_O_PROTOCOL) {
        assert(!drv);
        return bdrv_file_open(pbs, filename, options, flags & ~BDRV_O_PROTOCOL,
                              errp);
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("");
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        QDict *snapshot_options;

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* Get the required size from the image */
        QINCREF(options);
        bs1 = NULL;
        ret = bdrv_open(&bs1, filename, NULL, options, BDRV_O_NO_BACKING,
                        drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_unref(bs1);

        /* Create the temporary image */
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
                             "'%s': %s", tmp_filename,
                             error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
            goto fail;
        }

        /* Prepare a new options QDict for the temporary file, where user
         * options refer to the backing file */
        if (filename) {
            qdict_put(options, "file.filename", qstring_from_str(filename));
        }
        if (drv) {
            qdict_put(options, "driver", qstring_from_str(drv->format_name));
        }

        snapshot_options = qdict_new();
        qdict_put(snapshot_options, "backing", options);
        qdict_flatten(snapshot_options);

        bs->options = snapshot_options;
        options = qdict_clone_shallow(bs->options);

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true,
                          &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                   "support the option '%s'", drv->format_name, bs->device_name,
                   entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    *pbs = bs;
    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
 * has already been performed, or alternatively may be NULL, in which case a
 * new BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags. All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
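/*
 * Example (illustrative): reopening two devices as one atomic set; if any
 * prepare fails, every prepared entry is rolled back via
 * bdrv_reopen_abort(). Note that bdrv_reopen_multiple() frees the queue.
 *
 *     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs_a, flags_a);
 *     queue = bdrv_reopen_queue(queue, bs_b, flags_b);
 *     ret = bdrv_reopen_multiple(queue, &errp);
 */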
/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}
/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs);
}
/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}
void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_close(bs);
    }
}
/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

static bool bdrv_requests_pending_all(void)
{
    BlockDriverState *bs;
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bdrv_requests_pending(bs)) {
            return true;
        }
    }
    return false;
}
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            bdrv_start_throttled_reqs(bs);
        }

        busy = bdrv_requests_pending_all();
        busy |= aio_poll(qemu_get_aio_context(), busy);
    }
}
/* make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists.
   Also, NUL-terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
    }
    bs->device_name[0] = '\0';
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}
static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->device_list = bs_src->device_list;

    /* keep the same entry in graph_bdrv_states
     * We do want to swap name but don't want to swap linked list entries
     */
    bs_dest->node_list = bs_src->node_list;
}
/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap! */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}
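/*
 * Example (illustrative): bdrv_append() is the primitive behind live
 * snapshots; a freshly opened, anonymous overlay is slipped on top of the
 * active layer, which then becomes its backing file.
 *
 *     // overlay_bs was opened over the new image with bdrv_new("") +
 *     // bdrv_open(), so it is still anonymous
 *     bdrv_append(overlay_bs, active_bs);
 *     // active_bs now presents the overlay; its backing_hd is the old top
 */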
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->guest_block_size = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
}
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               enum MonitorEvent ev,
                               BlockErrorAction action, bool is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(ev, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}
/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}
2126 #define COMMIT_BUF_SECTORS 2048
2128 /* commit COW file into the raw image */
2129 int bdrv_commit(BlockDriverState *bs)
2131 BlockDriver *drv = bs->drv;
2132 int64_t sector, total_sectors, length, backing_length;
2133 int n, ro, open_flags;
2134 int ret = 0;
2135 uint8_t *buf = NULL;
2136 char filename[PATH_MAX];
2138 if (!drv)
2139 return -ENOMEDIUM;
2141 if (!bs->backing_hd) {
2142 return -ENOTSUP;
2145 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2146 return -EBUSY;
2149 ro = bs->backing_hd->read_only;
2150 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2151 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2152 open_flags = bs->backing_hd->open_flags;
2154 if (ro) {
2155 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2156 return -EACCES;
2160 length = bdrv_getlength(bs);
2161 if (length < 0) {
2162 ret = length;
2163 goto ro_cleanup;
2166 backing_length = bdrv_getlength(bs->backing_hd);
2167 if (backing_length < 0) {
2168 ret = backing_length;
2169 goto ro_cleanup;
2172 /* If our top snapshot is larger than the backing file image,
2173 * grow the backing file image if possible. If not possible,
2174 * we must return an error */
2175 if (length > backing_length) {
2176 ret = bdrv_truncate(bs->backing_hd, length);
2177 if (ret < 0) {
2178 goto ro_cleanup;
2182 total_sectors = length >> BDRV_SECTOR_BITS;
2183 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2185 for (sector = 0; sector < total_sectors; sector += n) {
2186 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2187 if (ret < 0) {
2188 goto ro_cleanup;
2190 if (ret) {
2191 ret = bdrv_read(bs, sector, buf, n);
2192 if (ret < 0) {
2193 goto ro_cleanup;
2196 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2197 if (ret < 0) {
2198 goto ro_cleanup;
2203 if (drv->bdrv_make_empty) {
2204 ret = drv->bdrv_make_empty(bs);
2205 if (ret < 0) {
2206 goto ro_cleanup;
2208 bdrv_flush(bs);
2212 * Make sure all data we wrote to the backing device is actually
2213 * stable on disk.
2215 if (bs->backing_hd) {
2216 bdrv_flush(bs->backing_hd);
2219 ret = 0;
2220 ro_cleanup:
2221 g_free(buf);
2223 if (ro) {
2224 /* ignoring error return here */
2225 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2228 return ret;
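/* Usage sketch (illustrative only, not called anywhere): commit the COW
 * data of a named drive into its backing file.  The device name
 * "ide0-hd0" is hypothetical. */
static int __attribute__((unused)) bdrv_commit_example(void)
{
    BlockDriverState *bs = bdrv_find("ide0-hd0");

    if (!bs) {
        return -ENODEV;
    }
    /* bdrv_commit() returns -ENOTSUP without a backing file, -EBUSY if
     * either image is in use and -EACCES if a read-only backing file
     * cannot be reopened read-write. */
    return bdrv_commit(bs);
}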
2231 int bdrv_commit_all(void)
2233 BlockDriverState *bs;
2235 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2236 if (bs->drv && bs->backing_hd) {
2237 int ret = bdrv_commit(bs);
2238 if (ret < 0) {
2239 return ret;
2243 return 0;
2247 * Remove an active request from the tracked requests list
2249 * This function should be called when a tracked request is completing.
2251 static void tracked_request_end(BdrvTrackedRequest *req)
2253 if (req->serialising) {
2254 req->bs->serialising_in_flight--;
2257 QLIST_REMOVE(req, list);
2258 qemu_co_queue_restart_all(&req->wait_queue);
2262 * Add an active request to the tracked requests list
2264 static void tracked_request_begin(BdrvTrackedRequest *req,
2265 BlockDriverState *bs,
2266 int64_t offset,
2267 unsigned int bytes, bool is_write)
2269 *req = (BdrvTrackedRequest){
2270 .bs = bs,
2271 .offset = offset,
2272 .bytes = bytes,
2273 .is_write = is_write,
2274 .co = qemu_coroutine_self(),
2275 .serialising = false,
2276 .overlap_offset = offset,
2277 .overlap_bytes = bytes,
2280 qemu_co_queue_init(&req->wait_queue);
2282 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2285 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2287 int64_t overlap_offset = req->offset & ~(align - 1);
2288 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2289 - overlap_offset;
2291 if (!req->serialising) {
2292 req->bs->serialising_in_flight++;
2293 req->serialising = true;
2296 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2297 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2301 * Round a region to cluster boundaries
2303 void bdrv_round_to_clusters(BlockDriverState *bs,
2304 int64_t sector_num, int nb_sectors,
2305 int64_t *cluster_sector_num,
2306 int *cluster_nb_sectors)
2308 BlockDriverInfo bdi;
2310 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2311 *cluster_sector_num = sector_num;
2312 *cluster_nb_sectors = nb_sectors;
2313 } else {
2314 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2315 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2316 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2317 nb_sectors, c);
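/* Worked example: assuming bdrv_get_info() reports cluster_size == 65536
 * (128 sectors), a request covering sectors [130, 140) is widened to the
 * enclosing clusters, so *cluster_sector_num == 128 and
 * *cluster_nb_sectors == 128, i.e. the sector range [128, 256). */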
2321 static int bdrv_get_cluster_size(BlockDriverState *bs)
2323 BlockDriverInfo bdi;
2324 int ret;
2326 ret = bdrv_get_info(bs, &bdi);
2327 if (ret < 0 || bdi.cluster_size == 0) {
2328 return bs->request_alignment;
2329 } else {
2330 return bdi.cluster_size;
2334 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2335 int64_t offset, unsigned int bytes)
2337 /* aaaa bbbb */
2338 if (offset >= req->overlap_offset + req->overlap_bytes) {
2339 return false;
2341 /* bbbb aaaa */
2342 if (req->overlap_offset >= offset + bytes) {
2343 return false;
2345 return true;
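/* Minimal self-test sketch (illustrative only): a request serialising
 * over bytes [4096, 8192) overlaps [8000, 8512) but not [8192, 8704). */
static void __attribute__((unused)) tracked_request_overlaps_example(void)
{
    BdrvTrackedRequest req = {
        .overlap_offset = 4096,
        .overlap_bytes = 4096,
    };

    assert(tracked_request_overlaps(&req, 8000, 512));
    assert(!tracked_request_overlaps(&req, 8192, 512));
}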
2348 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2350 BlockDriverState *bs = self->bs;
2351 BdrvTrackedRequest *req;
2352 bool retry;
2353 bool waited = false;
2355 if (!bs->serialising_in_flight) {
2356 return false;
2359 do {
2360 retry = false;
2361 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2362 if (req == self || (!req->serialising && !self->serialising)) {
2363 continue;
2365 if (tracked_request_overlaps(req, self->overlap_offset,
2366 self->overlap_bytes))
2368 /* Hitting this means there was a reentrant request, for
2369 * example, a block driver issuing nested requests. This must
2370 * never happen since it means deadlock.
2372 assert(qemu_coroutine_self() != req->co);
2374 /* If the request is already (indirectly) waiting for us, or
2375 * will wait for us as soon as it wakes up, then just go on
2376 * (instead of producing a deadlock in the former case). */
2377 if (!req->waiting_for) {
2378 self->waiting_for = req;
2379 qemu_co_queue_wait(&req->wait_queue);
2380 self->waiting_for = NULL;
2381 retry = true;
2382 waited = true;
2383 break;
2387 } while (retry);
2389 return waited;
2393 * Return values:
2394 * 0 - success
2395 * -EINVAL - backing format specified, but no file
2396 * -ENOSPC - can't update the backing file because no space is left in the
2397 * image file header
2398 * -ENOTSUP - format driver doesn't support changing the backing file
2400 int bdrv_change_backing_file(BlockDriverState *bs,
2401 const char *backing_file, const char *backing_fmt)
2403 BlockDriver *drv = bs->drv;
2404 int ret;
2406 /* Backing file format doesn't make sense without a backing file */
2407 if (backing_fmt && !backing_file) {
2408 return -EINVAL;
2411 if (drv->bdrv_change_backing_file != NULL) {
2412 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2413 } else {
2414 ret = -ENOTSUP;
2417 if (ret == 0) {
2418 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2419 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2421 return ret;
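/* Usage sketch (path and format are hypothetical): rebase an image onto
 * a new backing file, observing the error codes documented above. */
static int __attribute__((unused)) change_backing_example(BlockDriverState *bs)
{
    int ret = bdrv_change_backing_file(bs, "/images/base.qcow2", "qcow2");

    if (ret == -ENOTSUP) {
        /* the format driver cannot rewrite its backing file header */
    }
    return ret;
}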
2425 * Finds the image layer in the chain that has 'bs' as its backing file.
2427 * active is the current topmost image.
2429 * Returns NULL if bs is not found in active's image chain,
2430 * or if active == bs.
2432 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2433 BlockDriverState *bs)
2435 BlockDriverState *overlay = NULL;
2436 BlockDriverState *intermediate;
2438 assert(active != NULL);
2439 assert(bs != NULL);
2441 /* if bs is the same as active, then by definition it has no overlay
2443 if (active == bs) {
2444 return NULL;
2447 intermediate = active;
2448 while (intermediate->backing_hd) {
2449 if (intermediate->backing_hd == bs) {
2450 overlay = intermediate;
2451 break;
2453 intermediate = intermediate->backing_hd;
2456 return overlay;
2459 typedef struct BlkIntermediateStates {
2460 BlockDriverState *bs;
2461 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2462 } BlkIntermediateStates;
2466 * Drops images above 'base' up to and including 'top', and sets the image
2467 * above 'top' to have base as its backing file.
2469 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2470 * information in 'bs' can be properly updated.
2472 * E.g., this will convert the following chain:
2473 * bottom <- base <- intermediate <- top <- active
2475 * to
2477 * bottom <- base <- active
2479 * It is allowed for bottom==base, in which case it converts:
2481 * base <- intermediate <- top <- active
2483 * to
2485 * base <- active
2487 * Error conditions:
2488 * if active == top, that is considered an error
2491 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2492 BlockDriverState *base)
2494 BlockDriverState *intermediate;
2495 BlockDriverState *base_bs = NULL;
2496 BlockDriverState *new_top_bs = NULL;
2497 BlkIntermediateStates *intermediate_state, *next;
2498 int ret = -EIO;
2500 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2501 QSIMPLEQ_INIT(&states_to_delete);
2503 if (!top->drv || !base->drv) {
2504 goto exit;
2507 new_top_bs = bdrv_find_overlay(active, top);
2509 if (new_top_bs == NULL) {
2510 /* we could not find the image above 'top', this is an error */
2511 goto exit;
2514 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2515 * to do, no intermediate images */
2516 if (new_top_bs->backing_hd == base) {
2517 ret = 0;
2518 goto exit;
2521 intermediate = top;
2523 /* now we will go down through the list, and add each BDS we find
2524 * into our deletion queue, until we hit the 'base'
2526 while (intermediate) {
2527 intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2528 intermediate_state->bs = intermediate;
2529 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2531 if (intermediate->backing_hd == base) {
2532 base_bs = intermediate->backing_hd;
2533 break;
2535 intermediate = intermediate->backing_hd;
2537 if (base_bs == NULL) {
2538 /* Something went wrong: we did not end at the base. Safely
2539 * unravel everything and exit with an error */
2540 goto exit;
2543 /* success - we can delete the intermediate states, and link top->base */
2544 ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2545 base_bs->drv ? base_bs->drv->format_name : "");
2546 if (ret) {
2547 goto exit;
2549 new_top_bs->backing_hd = base_bs;
2551 bdrv_refresh_limits(new_top_bs);
2553 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2554 /* so that bdrv_close() does not recursively close the chain */
2555 intermediate_state->bs->backing_hd = NULL;
2556 bdrv_unref(intermediate_state->bs);
2558 ret = 0;
2560 exit:
2561 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2562 g_free(intermediate_state);
2564 return ret;
2568 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2569 size_t size)
2571 int64_t len;
2573 if (!bdrv_is_inserted(bs))
2574 return -ENOMEDIUM;
2576 if (bs->growable)
2577 return 0;
2579 len = bdrv_getlength(bs);
2581 if (offset < 0)
2582 return -EIO;
2584 if ((offset > len) || (len - offset < size))
2585 return -EIO;
2587 return 0;
2590 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2591 int nb_sectors)
2593 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2594 nb_sectors * BDRV_SECTOR_SIZE);
2597 typedef struct RwCo {
2598 BlockDriverState *bs;
2599 int64_t offset;
2600 QEMUIOVector *qiov;
2601 bool is_write;
2602 int ret;
2603 BdrvRequestFlags flags;
2604 } RwCo;
2606 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2608 RwCo *rwco = opaque;
2610 if (!rwco->is_write) {
2611 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2612 rwco->qiov->size, rwco->qiov,
2613 rwco->flags);
2614 } else {
2615 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2616 rwco->qiov->size, rwco->qiov,
2617 rwco->flags);
2622 * Process a vectored synchronous request using coroutines
2624 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2625 QEMUIOVector *qiov, bool is_write,
2626 BdrvRequestFlags flags)
2628 Coroutine *co;
2629 RwCo rwco = {
2630 .bs = bs,
2631 .offset = offset,
2632 .qiov = qiov,
2633 .is_write = is_write,
2634 .ret = NOT_DONE,
2635 .flags = flags,
2639 * In a synchronous call context, while the vCPU is blocked, the
2640 * throttling timers will not fire; so I/O throttling has to be
2641 * disabled here if it has been enabled.
2643 if (bs->io_limits_enabled) {
2644 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2645 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2646 bdrv_io_limits_disable(bs);
2649 if (qemu_in_coroutine()) {
2650 /* Fast-path if already in coroutine context */
2651 bdrv_rw_co_entry(&rwco);
2652 } else {
2653 co = qemu_coroutine_create(bdrv_rw_co_entry);
2654 qemu_coroutine_enter(co, &rwco);
2655 while (rwco.ret == NOT_DONE) {
2656 qemu_aio_wait();
2659 return rwco.ret;
2663 * Process a synchronous request using coroutines
2665 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2666 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2668 QEMUIOVector qiov;
2669 struct iovec iov = {
2670 .iov_base = (void *)buf,
2671 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2674 qemu_iovec_init_external(&qiov, &iov, 1);
2675 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2676 &qiov, is_write, flags);
2679 /* return < 0 if error. See bdrv_write() for the return codes */
2680 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2681 uint8_t *buf, int nb_sectors)
2683 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2686 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2687 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2688 uint8_t *buf, int nb_sectors)
2690 bool enabled;
2691 int ret;
2693 enabled = bs->io_limits_enabled;
2694 bs->io_limits_enabled = false;
2695 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2696 bs->io_limits_enabled = enabled;
2697 return ret;
2700 /* Return < 0 if error. Important errors are:
2701 -EIO generic I/O error (may happen for all errors)
2702 -ENOMEDIUM No media inserted.
2703 -EINVAL Invalid sector number or nb_sectors
2704 -EACCES Trying to write a read-only device
2706 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2707 const uint8_t *buf, int nb_sectors)
2709 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2712 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2713 int nb_sectors, BdrvRequestFlags flags)
2715 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2716 BDRV_REQ_ZERO_WRITE | flags);
2720 * Completely zero out a block device with the help of bdrv_write_zeroes.
2721 * The operation is sped up by checking the block status and only writing
2722 * zeroes to the device if they currently do not return zeroes. Optional
2723 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2725 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2727 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2729 int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
2730 int64_t ret, nb_sectors, sector_num = 0;
2731 int n;
2733 for (;;) {
2734 nb_sectors = target_size - sector_num;
2735 if (nb_sectors <= 0) {
2736 return 0;
2738 if (nb_sectors > INT_MAX) {
2739 nb_sectors = INT_MAX;
2741 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2742 if (ret < 0) {
2743 error_report("error getting block status at sector %" PRId64 ": %s",
2744 sector_num, strerror(-ret));
2745 return ret;
2747 if (ret & BDRV_BLOCK_ZERO) {
2748 sector_num += n;
2749 continue;
2751 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2752 if (ret < 0) {
2753 error_report("error writing zeroes at sector %" PRId64 ": %s",
2754 sector_num, strerror(-ret));
2755 return ret;
2757 sector_num += n;
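/* Usage sketch: zero an entire scratch device, allowing the driver to
 * unmap ranges instead of writing literal zeroes where it supports it. */
static int __attribute__((unused)) wipe_device_example(BlockDriverState *bs)
{
    return bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
}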
2761 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2763 QEMUIOVector qiov;
2764 struct iovec iov = {
2765 .iov_base = (void *)buf,
2766 .iov_len = bytes,
2768 int ret;
2770 if (bytes < 0) {
2771 return -EINVAL;
2774 qemu_iovec_init_external(&qiov, &iov, 1);
2775 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2776 if (ret < 0) {
2777 return ret;
2780 return bytes;
2783 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2785 int ret;
2787 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2788 if (ret < 0) {
2789 return ret;
2792 return qiov->size;
2795 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2796 const void *buf, int bytes)
2798 QEMUIOVector qiov;
2799 struct iovec iov = {
2800 .iov_base = (void *) buf,
2801 .iov_len = bytes,
2804 if (bytes < 0) {
2805 return -EINVAL;
2808 qemu_iovec_init_external(&qiov, &iov, 1);
2809 return bdrv_pwritev(bs, offset, &qiov);
2813 * Writes to the file and ensures that no writes are reordered across this
2814 * request (acts as a barrier)
2816 * Returns 0 on success, -errno in error cases.
2818 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2819 const void *buf, int count)
2821 int ret;
2823 ret = bdrv_pwrite(bs, offset, buf, count);
2824 if (ret < 0) {
2825 return ret;
2828 /* No flush needed for cache modes that already do it */
2829 if (bs->enable_write_cache) {
2830 bdrv_flush(bs);
2833 return 0;
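/* Usage sketch: persist a hypothetical little-endian header field at byte
 * offset 8 with barrier semantics, so that no later write can be
 * reordered ahead of it. */
static int __attribute__((unused)) update_header_example(BlockDriverState *bs,
                                                         uint32_t value)
{
    uint32_t le = cpu_to_le32(value);

    return bdrv_pwrite_sync(bs->file, 8, &le, sizeof(le));
}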
2836 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2837 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2839 /* Perform I/O through a temporary buffer so that users who scribble over
2840 * their read buffer while the operation is in progress do not end up
2841 * modifying the image file. This is critical for zero-copy guest I/O
2842 * where anything might happen inside guest memory.
2844 void *bounce_buffer;
2846 BlockDriver *drv = bs->drv;
2847 struct iovec iov;
2848 QEMUIOVector bounce_qiov;
2849 int64_t cluster_sector_num;
2850 int cluster_nb_sectors;
2851 size_t skip_bytes;
2852 int ret;
2854 /* Cover entire cluster so no additional backing file I/O is required when
2855 * allocating cluster in the image file.
2857 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2858 &cluster_sector_num, &cluster_nb_sectors);
2860 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2861 cluster_sector_num, cluster_nb_sectors);
2863 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2864 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2865 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2867 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2868 &bounce_qiov);
2869 if (ret < 0) {
2870 goto err;
2873 if (drv->bdrv_co_write_zeroes &&
2874 buffer_is_zero(bounce_buffer, iov.iov_len)) {
2875 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2876 cluster_nb_sectors, 0);
2877 } else {
2878 /* This does not change the data on the disk, it is not necessary
2879 * to flush even in cache=writethrough mode.
2881 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2882 &bounce_qiov);
2885 if (ret < 0) {
2886 /* It might be okay to ignore write errors for guest requests. If this
2887 * is a deliberate copy-on-read then we don't want to ignore the error.
2888 * Simply report it in all cases.
2890 goto err;
2893 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2894 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2895 nb_sectors * BDRV_SECTOR_SIZE);
2897 err:
2898 qemu_vfree(bounce_buffer);
2899 return ret;
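/* Worked example of the bounce layout above, assuming 64 KiB clusters
 * (128 sectors): a guest read of sectors [130, 140) is widened to the
 * cluster range [128, 256); after the cluster has been read (and written
 * back), skip_bytes = (130 - 128) * 512 = 1024, so the guest qiov is
 * filled from bounce_buffer + 1024 for 10 * 512 bytes. */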
2903 * Forwards an already correctly aligned request to the BlockDriver. This
2904 * handles copy on read and zeroing after EOF; any other features must be
2905 * implemented by the caller.
2907 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2908 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2909 int64_t align, QEMUIOVector *qiov, int flags)
2911 BlockDriver *drv = bs->drv;
2912 int ret;
2914 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2915 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2917 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2918 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2920 /* Handle Copy on Read and associated serialisation */
2921 if (flags & BDRV_REQ_COPY_ON_READ) {
2922 /* If we touch the same cluster it counts as an overlap. This
2923 * guarantees that allocating writes will be serialized and not race
2924 * with each other for the same cluster. For example, in copy-on-read
2925 * it ensures that the CoR read and write operations are atomic and
2926 * guest writes cannot interleave between them. */
2927 mark_request_serialising(req, bdrv_get_cluster_size(bs));
2930 wait_serialising_requests(req);
2932 if (flags & BDRV_REQ_COPY_ON_READ) {
2933 int pnum;
2935 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2936 if (ret < 0) {
2937 goto out;
2940 if (!ret || pnum != nb_sectors) {
2941 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2942 goto out;
2946 /* Forward the request to the BlockDriver */
2947 if (!(bs->zero_beyond_eof && bs->growable)) {
2948 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2949 } else {
2950 /* Read zeros after EOF of growable BDSes */
2951 int64_t len, total_sectors, max_nb_sectors;
2953 len = bdrv_getlength(bs);
2954 if (len < 0) {
2955 ret = len;
2956 goto out;
2959 total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
2960 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
2961 align >> BDRV_SECTOR_BITS);
2962 if (max_nb_sectors > 0) {
2963 ret = drv->bdrv_co_readv(bs, sector_num,
2964 MIN(nb_sectors, max_nb_sectors), qiov);
2965 } else {
2966 ret = 0;
2969 /* Reading beyond end of file is supposed to produce zeroes */
2970 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2971 uint64_t offset = MAX(0, total_sectors - sector_num);
2972 uint64_t bytes = (sector_num + nb_sectors - offset) *
2973 BDRV_SECTOR_SIZE;
2974 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
2978 out:
2979 return ret;
2983 * Handle a read request in coroutine context
2985 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
2986 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
2987 BdrvRequestFlags flags)
2989 BlockDriver *drv = bs->drv;
2990 BdrvTrackedRequest req;
2992 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
2993 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
2994 uint8_t *head_buf = NULL;
2995 uint8_t *tail_buf = NULL;
2996 QEMUIOVector local_qiov;
2997 bool use_local_qiov = false;
2998 int ret;
3000 if (!drv) {
3001 return -ENOMEDIUM;
3003 if (bdrv_check_byte_request(bs, offset, bytes)) {
3004 return -EIO;
3007 if (bs->copy_on_read) {
3008 flags |= BDRV_REQ_COPY_ON_READ;
3011 /* throttling disk I/O */
3012 if (bs->io_limits_enabled) {
3013 bdrv_io_limits_intercept(bs, bytes, false);
3016 /* Align read if necessary by padding qiov */
3017 if (offset & (align - 1)) {
3018 head_buf = qemu_blockalign(bs, align);
3019 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3020 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3021 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3022 use_local_qiov = true;
3024 bytes += offset & (align - 1);
3025 offset = offset & ~(align - 1);
3028 if ((offset + bytes) & (align - 1)) {
3029 if (!use_local_qiov) {
3030 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3031 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3032 use_local_qiov = true;
3034 tail_buf = qemu_blockalign(bs, align);
3035 qemu_iovec_add(&local_qiov, tail_buf,
3036 align - ((offset + bytes) & (align - 1)));
3038 bytes = ROUND_UP(bytes, align);
3041 tracked_request_begin(&req, bs, offset, bytes, false);
3042 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3043 use_local_qiov ? &local_qiov : qiov,
3044 flags);
3045 tracked_request_end(&req);
3047 if (use_local_qiov) {
3048 qemu_iovec_destroy(&local_qiov);
3049 qemu_vfree(head_buf);
3050 qemu_vfree(tail_buf);
3053 return ret;
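/* Worked example of the padding above, with align == 4096: a read of
 * bytes [1000, 4000) grows a 1000-byte head pad and a 96-byte tail pad,
 * becoming a single aligned read of [0, 4096) whose padding lands in
 * head_buf/tail_buf and is discarded. */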
3056 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3057 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3058 BdrvRequestFlags flags)
3060 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3061 return -EINVAL;
3064 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3065 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3068 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3069 int nb_sectors, QEMUIOVector *qiov)
3071 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3073 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3076 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3077 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3079 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3081 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3082 BDRV_REQ_COPY_ON_READ);
3085 /* if no limit is specified in the BlockLimits use a default
3086 * of 32768 512-byte sectors (16 MiB) per request.
3088 #define MAX_WRITE_ZEROES_DEFAULT 32768
3090 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3091 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3093 BlockDriver *drv = bs->drv;
3094 QEMUIOVector qiov;
3095 struct iovec iov = {0};
3096 int ret = 0;
3098 int max_write_zeroes = bs->bl.max_write_zeroes ?
3099 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3101 while (nb_sectors > 0 && !ret) {
3102 int num = nb_sectors;
3104 /* Align request. Block drivers can expect the "bulk" of the request
3105 * to be aligned.
3107 if (bs->bl.write_zeroes_alignment
3108 && num > bs->bl.write_zeroes_alignment) {
3109 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3110 /* Make a small request up to the first aligned sector. */
3111 num = bs->bl.write_zeroes_alignment;
3112 num -= sector_num % bs->bl.write_zeroes_alignment;
3113 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3114 /* Shorten the request to the last aligned sector. num cannot
3115 * underflow because num > bs->bl.write_zeroes_alignment.
3117 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3121 /* limit request size */
3122 if (num > max_write_zeroes) {
3123 num = max_write_zeroes;
3126 ret = -ENOTSUP;
3127 /* First try the efficient write zeroes operation */
3128 if (drv->bdrv_co_write_zeroes) {
3129 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3132 if (ret == -ENOTSUP) {
3133 /* Fall back to bounce buffer if write zeroes is unsupported */
3134 iov.iov_len = num * BDRV_SECTOR_SIZE;
3135 if (iov.iov_base == NULL) {
3136 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3137 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3139 qemu_iovec_init_external(&qiov, &iov, 1);
3141 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3143 /* Keep bounce buffer around if it is big enough for all
3144 * future requests.
3146 if (num < max_write_zeroes) {
3147 qemu_vfree(iov.iov_base);
3148 iov.iov_base = NULL;
3152 sector_num += num;
3153 nb_sectors -= num;
3156 qemu_vfree(iov.iov_base);
3157 return ret;
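/* Worked example of the alignment above, with write_zeroes_alignment == 8:
 * zeroing sectors [5, 105) is issued as [5, 8) (up to the first aligned
 * sector), then [8, 104) in aligned chunks capped at max_write_zeroes,
 * and finally the unaligned tail [104, 105). */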
3161 * Forwards an already correctly aligned write request to the BlockDriver.
3163 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3164 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3165 QEMUIOVector *qiov, int flags)
3167 BlockDriver *drv = bs->drv;
3168 bool waited;
3169 int ret;
3171 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3172 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3174 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3175 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3177 waited = wait_serialising_requests(req);
3178 assert(!waited || !req->serialising);
3179 assert(req->overlap_offset <= offset);
3180 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3182 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3184 if (ret < 0) {
3185 /* Do nothing, write notifier decided to fail this request */
3186 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3187 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3188 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3189 } else {
3190 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3191 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3193 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3195 if (ret == 0 && !bs->enable_write_cache) {
3196 ret = bdrv_co_flush(bs);
3199 bdrv_set_dirty(bs, sector_num, nb_sectors);
3201 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3202 bs->wr_highest_sector = sector_num + nb_sectors - 1;
3204 if (bs->growable && ret >= 0) {
3205 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3208 return ret;
3212 * Handle a write request in coroutine context
3214 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3215 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3216 BdrvRequestFlags flags)
3218 BdrvTrackedRequest req;
3219 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3220 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3221 uint8_t *head_buf = NULL;
3222 uint8_t *tail_buf = NULL;
3223 QEMUIOVector local_qiov;
3224 bool use_local_qiov = false;
3225 int ret;
3227 if (!bs->drv) {
3228 return -ENOMEDIUM;
3230 if (bs->read_only) {
3231 return -EACCES;
3233 if (bdrv_check_byte_request(bs, offset, bytes)) {
3234 return -EIO;
3237 /* throttling disk I/O */
3238 if (bs->io_limits_enabled) {
3239 bdrv_io_limits_intercept(bs, bytes, true);
3243 * Align write if necessary by performing a read-modify-write cycle.
3244 * Pad qiov with the read parts and be sure to have a tracked request not
3245 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3247 tracked_request_begin(&req, bs, offset, bytes, true);
3249 if (offset & (align - 1)) {
3250 QEMUIOVector head_qiov;
3251 struct iovec head_iov;
3253 mark_request_serialising(&req, align);
3254 wait_serialising_requests(&req);
3256 head_buf = qemu_blockalign(bs, align);
3257 head_iov = (struct iovec) {
3258 .iov_base = head_buf,
3259 .iov_len = align,
3261 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3263 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3264 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3265 align, &head_qiov, 0);
3266 if (ret < 0) {
3267 goto fail;
3269 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3271 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3272 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3273 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3274 use_local_qiov = true;
3276 bytes += offset & (align - 1);
3277 offset = offset & ~(align - 1);
3280 if ((offset + bytes) & (align - 1)) {
3281 QEMUIOVector tail_qiov;
3282 struct iovec tail_iov;
3283 size_t tail_bytes;
3284 bool waited;
3286 mark_request_serialising(&req, align);
3287 waited = wait_serialising_requests(&req);
3288 assert(!waited || !use_local_qiov);
3290 tail_buf = qemu_blockalign(bs, align);
3291 tail_iov = (struct iovec) {
3292 .iov_base = tail_buf,
3293 .iov_len = align,
3295 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3297 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3298 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3299 align, &tail_qiov, 0);
3300 if (ret < 0) {
3301 goto fail;
3303 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3305 if (!use_local_qiov) {
3306 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3307 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3308 use_local_qiov = true;
3311 tail_bytes = (offset + bytes) & (align - 1);
3312 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3314 bytes = ROUND_UP(bytes, align);
3317 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3318 use_local_qiov ? &local_qiov : qiov,
3319 flags);
3321 fail:
3322 tracked_request_end(&req);
3324 if (use_local_qiov) {
3325 qemu_iovec_destroy(&local_qiov);
3327 qemu_vfree(head_buf);
3328 qemu_vfree(tail_buf);
3330 return ret;
3333 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3334 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3335 BdrvRequestFlags flags)
3337 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3338 return -EINVAL;
3341 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3342 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3345 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3346 int nb_sectors, QEMUIOVector *qiov)
3348 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3350 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3353 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3354 int64_t sector_num, int nb_sectors,
3355 BdrvRequestFlags flags)
3357 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3359 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3360 flags &= ~BDRV_REQ_MAY_UNMAP;
3363 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3364 BDRV_REQ_ZERO_WRITE | flags);
3368 * Truncate file to 'offset' bytes (needed only for file protocols)
3370 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3372 BlockDriver *drv = bs->drv;
3373 int ret;
3374 if (!drv)
3375 return -ENOMEDIUM;
3376 if (!drv->bdrv_truncate)
3377 return -ENOTSUP;
3378 if (bs->read_only)
3379 return -EACCES;
3380 if (bdrv_in_use(bs))
3381 return -EBUSY;
3382 ret = drv->bdrv_truncate(bs, offset);
3383 if (ret == 0) {
3384 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3385 bdrv_dev_resize_cb(bs);
3387 return ret;
3391 * Length of an allocated file in bytes. Sparse files are counted by actual
3392 * allocated space. Return < 0 on error or if unknown.
3394 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3396 BlockDriver *drv = bs->drv;
3397 if (!drv) {
3398 return -ENOMEDIUM;
3400 if (drv->bdrv_get_allocated_file_size) {
3401 return drv->bdrv_get_allocated_file_size(bs);
3403 if (bs->file) {
3404 return bdrv_get_allocated_file_size(bs->file);
3406 return -ENOTSUP;
3410 * Length of a file in bytes. Return < 0 if error or unknown.
3412 int64_t bdrv_getlength(BlockDriverState *bs)
3414 BlockDriver *drv = bs->drv;
3415 if (!drv)
3416 return -ENOMEDIUM;
3418 if (drv->has_variable_length) {
3419 int ret = refresh_total_sectors(bs, bs->total_sectors);
3420 if (ret < 0) {
3421 return ret;
3424 return bs->total_sectors * BDRV_SECTOR_SIZE;
3427 /* return 0 as the number of sectors if no device is present or on error */
3428 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3430 int64_t length;
3431 length = bdrv_getlength(bs);
3432 if (length < 0)
3433 length = 0;
3434 else
3435 length = length >> BDRV_SECTOR_BITS;
3436 *nb_sectors_ptr = length;
3439 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3440 BlockdevOnError on_write_error)
3442 bs->on_read_error = on_read_error;
3443 bs->on_write_error = on_write_error;
3446 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3448 return is_read ? bs->on_read_error : bs->on_write_error;
3451 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3453 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3455 switch (on_err) {
3456 case BLOCKDEV_ON_ERROR_ENOSPC:
3457 return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3458 case BLOCKDEV_ON_ERROR_STOP:
3459 return BDRV_ACTION_STOP;
3460 case BLOCKDEV_ON_ERROR_REPORT:
3461 return BDRV_ACTION_REPORT;
3462 case BLOCKDEV_ON_ERROR_IGNORE:
3463 return BDRV_ACTION_IGNORE;
3464 default:
3465 abort();
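/* Example: with bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC, a write
 * failing with ENOSPC yields BDRV_ACTION_STOP (pause the VM) while any
 * other errno yields BDRV_ACTION_REPORT (report it to the guest). */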
3469 /* This is done by device models because, while the block layer knows
3470 * about the error, it does not know whether an operation comes from
3471 * the device or the block layer (from a job, for example).
3473 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3474 bool is_read, int error)
3476 assert(error >= 0);
3477 bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3478 if (action == BDRV_ACTION_STOP) {
3479 vm_stop(RUN_STATE_IO_ERROR);
3480 bdrv_iostatus_set_err(bs, error);
3484 int bdrv_is_read_only(BlockDriverState *bs)
3486 return bs->read_only;
3489 int bdrv_is_sg(BlockDriverState *bs)
3491 return bs->sg;
3494 int bdrv_enable_write_cache(BlockDriverState *bs)
3496 return bs->enable_write_cache;
3499 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3501 bs->enable_write_cache = wce;
3503 /* so a reopen() will preserve wce */
3504 if (wce) {
3505 bs->open_flags |= BDRV_O_CACHE_WB;
3506 } else {
3507 bs->open_flags &= ~BDRV_O_CACHE_WB;
3511 int bdrv_is_encrypted(BlockDriverState *bs)
3513 if (bs->backing_hd && bs->backing_hd->encrypted)
3514 return 1;
3515 return bs->encrypted;
3518 int bdrv_key_required(BlockDriverState *bs)
3520 BlockDriverState *backing_hd = bs->backing_hd;
3522 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3523 return 1;
3524 return (bs->encrypted && !bs->valid_key);
3527 int bdrv_set_key(BlockDriverState *bs, const char *key)
3529 int ret;
3530 if (bs->backing_hd && bs->backing_hd->encrypted) {
3531 ret = bdrv_set_key(bs->backing_hd, key);
3532 if (ret < 0)
3533 return ret;
3534 if (!bs->encrypted)
3535 return 0;
3537 if (!bs->encrypted) {
3538 return -EINVAL;
3539 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3540 return -ENOMEDIUM;
3542 ret = bs->drv->bdrv_set_key(bs, key);
3543 if (ret < 0) {
3544 bs->valid_key = 0;
3545 } else if (!bs->valid_key) {
3546 bs->valid_key = 1;
3547 /* call the change callback now, we skipped it on open */
3548 bdrv_dev_change_media_cb(bs, true);
3550 return ret;
3553 const char *bdrv_get_format_name(BlockDriverState *bs)
3555 return bs->drv ? bs->drv->format_name : NULL;
3558 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3559 void *opaque)
3561 BlockDriver *drv;
3563 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3564 it(opaque, drv->format_name);
3568 /* Find the block backend with the given device name */
3569 BlockDriverState *bdrv_find(const char *name)
3571 BlockDriverState *bs;
3573 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3574 if (!strcmp(name, bs->device_name)) {
3575 return bs;
3578 return NULL;
3581 /* Find a node in the BDS graph by its node name */
3582 BlockDriverState *bdrv_find_node(const char *node_name)
3584 BlockDriverState *bs;
3586 assert(node_name);
3588 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3589 if (!strcmp(node_name, bs->node_name)) {
3590 return bs;
3593 return NULL;
3596 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3597 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3599 BlockDeviceInfoList *list, *entry;
3600 BlockDriverState *bs;
3602 list = NULL;
3603 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3604 entry = g_malloc0(sizeof(*entry));
3605 entry->value = bdrv_block_device_info(bs);
3606 entry->next = list;
3607 list = entry;
3610 return list;
3613 BlockDriverState *bdrv_lookup_bs(const char *device,
3614 const char *node_name,
3615 Error **errp)
3617 BlockDriverState *bs = NULL;
3619 if (device) {
3620 bs = bdrv_find(device);
3622 if (bs) {
3623 return bs;
3627 if (node_name) {
3628 bs = bdrv_find_node(node_name);
3630 if (bs) {
3631 return bs;
3635 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3636 device ? device : "",
3637 node_name ? node_name : "");
3638 return NULL;
3641 BlockDriverState *bdrv_next(BlockDriverState *bs)
3643 if (!bs) {
3644 return QTAILQ_FIRST(&bdrv_states);
3646 return QTAILQ_NEXT(bs, device_list);
3649 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3651 BlockDriverState *bs;
3653 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3654 it(opaque, bs);
3658 const char *bdrv_get_device_name(BlockDriverState *bs)
3660 return bs->device_name;
3663 int bdrv_get_flags(BlockDriverState *bs)
3665 return bs->open_flags;
3668 int bdrv_flush_all(void)
3670 BlockDriverState *bs;
3671 int result = 0;
3673 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3674 int ret = bdrv_flush(bs);
3675 if (ret < 0 && !result) {
3676 result = ret;
3680 return result;
3683 int bdrv_has_zero_init_1(BlockDriverState *bs)
3685 return 1;
3688 int bdrv_has_zero_init(BlockDriverState *bs)
3690 assert(bs->drv);
3692 /* If BS is a copy on write image, it is initialized to
3693 the contents of the base image, which may not be zeroes. */
3694 if (bs->backing_hd) {
3695 return 0;
3697 if (bs->drv->bdrv_has_zero_init) {
3698 return bs->drv->bdrv_has_zero_init(bs);
3701 /* safe default */
3702 return 0;
3705 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3707 BlockDriverInfo bdi;
3709 if (bs->backing_hd) {
3710 return false;
3713 if (bdrv_get_info(bs, &bdi) == 0) {
3714 return bdi.unallocated_blocks_are_zero;
3717 return false;
3720 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3722 BlockDriverInfo bdi;
3724 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3725 return false;
3728 if (bdrv_get_info(bs, &bdi) == 0) {
3729 return bdi.can_write_zeroes_with_unmap;
3732 return false;
3735 typedef struct BdrvCoGetBlockStatusData {
3736 BlockDriverState *bs;
3737 BlockDriverState *base;
3738 int64_t sector_num;
3739 int nb_sectors;
3740 int *pnum;
3741 int64_t ret;
3742 bool done;
3743 } BdrvCoGetBlockStatusData;
3746 * Returns true iff the specified sector is present in the disk image. Drivers
3747 * not implementing the functionality are assumed to not support backing files,
3748 * hence all their sectors are reported as allocated.
3750 * If 'sector_num' is beyond the end of the disk image the return value is 0
3751 * and 'pnum' is set to 0.
3753 * 'pnum' is set to the number of sectors (including and immediately following
3754 * the specified sector) that are known to be in the same
3755 * allocated/unallocated state.
3757 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3758 * beyond the end of the disk image it will be clamped.
3760 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3761 int64_t sector_num,
3762 int nb_sectors, int *pnum)
3764 int64_t length;
3765 int64_t n;
3766 int64_t ret, ret2;
3768 length = bdrv_getlength(bs);
3769 if (length < 0) {
3770 return length;
3773 if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3774 *pnum = 0;
3775 return 0;
3778 n = bs->total_sectors - sector_num;
3779 if (n < nb_sectors) {
3780 nb_sectors = n;
3783 if (!bs->drv->bdrv_co_get_block_status) {
3784 *pnum = nb_sectors;
3785 ret = BDRV_BLOCK_DATA;
3786 if (bs->drv->protocol_name) {
3787 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3789 return ret;
3792 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3793 if (ret < 0) {
3794 *pnum = 0;
3795 return ret;
3798 if (ret & BDRV_BLOCK_RAW) {
3799 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3800 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3801 *pnum, pnum);
3804 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3805 if (bdrv_unallocated_blocks_are_zero(bs)) {
3806 ret |= BDRV_BLOCK_ZERO;
3807 } else if (bs->backing_hd) {
3808 BlockDriverState *bs2 = bs->backing_hd;
3809 int64_t length2 = bdrv_getlength(bs2);
3810 if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3811 ret |= BDRV_BLOCK_ZERO;
3816 if (bs->file &&
3817 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3818 (ret & BDRV_BLOCK_OFFSET_VALID)) {
3819 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3820 *pnum, pnum);
3821 if (ret2 >= 0) {
3822 /* Ignore errors. This just provides extra information; it
3823 * is useful but not necessary.
3825 ret |= (ret2 & BDRV_BLOCK_ZERO);
3829 return ret;
3832 /* Coroutine wrapper for bdrv_get_block_status() */
3833 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3835 BdrvCoGetBlockStatusData *data = opaque;
3836 BlockDriverState *bs = data->bs;
3838 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3839 data->pnum);
3840 data->done = true;
3844 * Synchronous wrapper around bdrv_co_get_block_status().
3846 * See bdrv_co_get_block_status() for details.
3848 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3849 int nb_sectors, int *pnum)
3851 Coroutine *co;
3852 BdrvCoGetBlockStatusData data = {
3853 .bs = bs,
3854 .sector_num = sector_num,
3855 .nb_sectors = nb_sectors,
3856 .pnum = pnum,
3857 .done = false,
3860 if (qemu_in_coroutine()) {
3861 /* Fast-path if already in coroutine context */
3862 bdrv_get_block_status_co_entry(&data);
3863 } else {
3864 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3865 qemu_coroutine_enter(co, &data);
3866 while (!data.done) {
3867 qemu_aio_wait();
3870 return data.ret;
3873 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3874 int nb_sectors, int *pnum)
3876 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3877 if (ret < 0) {
3878 return ret;
3880 return
3881 (ret & BDRV_BLOCK_DATA) ||
3882 ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
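/* Usage sketch: walk an image's allocation map one extent at a time.
 * Coroutine context is assumed, as for bdrv_is_allocated() itself. */
static void coroutine_fn __attribute__((unused))
dump_allocation_example(BlockDriverState *bs)
{
    int64_t sector = 0;
    int64_t total = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
    int n;

    while (sector < total) {
        int ret = bdrv_is_allocated(bs, sector, MIN(total - sector, INT_MAX),
                                    &n);
        if (ret < 0) {
            return;
        }
        /* ret != 0: sectors [sector, sector + n) are allocated here */
        sector += n;
    }
}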
3886 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3888 * Return true if the given sector is allocated in any image between
3889 * BASE and TOP (inclusive). BASE can be NULL to check if the given
3890 * sector is allocated in any image of the chain. Return false otherwise.
3892 * 'pnum' is set to the number of sectors (including and immediately following
3893 * the specified sector) that are known to be in the same
3894 * allocated/unallocated state.
3897 int bdrv_is_allocated_above(BlockDriverState *top,
3898 BlockDriverState *base,
3899 int64_t sector_num,
3900 int nb_sectors, int *pnum)
3902 BlockDriverState *intermediate;
3903 int ret, n = nb_sectors;
3905 intermediate = top;
3906 while (intermediate && intermediate != base) {
3907 int pnum_inter;
3908 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3909 &pnum_inter);
3910 if (ret < 0) {
3911 return ret;
3912 } else if (ret) {
3913 *pnum = pnum_inter;
3914 return 1;
3918 * [sector_num, nb_sectors] is unallocated on top but intermediate
3919 * might have [sector_num+x, nb_sectors] allocated.
3923 if (n > pnum_inter &&
3924 (intermediate == top ||
3925 sector_num + pnum_inter < intermediate->total_sectors)) {
3926 n = pnum_inter;
3929 intermediate = intermediate->backing_hd;
3932 *pnum = n;
3933 return 0;
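/* Worked example: in the chain base <- mid <- top, a sector written only
 * in mid makes bdrv_is_allocated_above(top, base, ...) return 1, with
 * *pnum clamped to the extent that is in the same state; 0 is returned
 * only when the sector is unallocated in both top and mid. */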
3936 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
3938 if (bs->backing_hd && bs->backing_hd->encrypted)
3939 return bs->backing_file;
3940 else if (bs->encrypted)
3941 return bs->filename;
3942 else
3943 return NULL;
3946 void bdrv_get_backing_filename(BlockDriverState *bs,
3947 char *filename, int filename_size)
3949 pstrcpy(filename, filename_size, bs->backing_file);
3952 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
3953 const uint8_t *buf, int nb_sectors)
3955 BlockDriver *drv = bs->drv;
3956 if (!drv)
3957 return -ENOMEDIUM;
3958 if (!drv->bdrv_write_compressed)
3959 return -ENOTSUP;
3960 if (bdrv_check_request(bs, sector_num, nb_sectors))
3961 return -EIO;
3963 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
3965 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
3968 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3970 BlockDriver *drv = bs->drv;
3971 if (!drv)
3972 return -ENOMEDIUM;
3973 if (!drv->bdrv_get_info)
3974 return -ENOTSUP;
3975 memset(bdi, 0, sizeof(*bdi));
3976 return drv->bdrv_get_info(bs, bdi);
3979 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
3981 BlockDriver *drv = bs->drv;
3982 if (drv && drv->bdrv_get_specific_info) {
3983 return drv->bdrv_get_specific_info(bs);
3985 return NULL;
3988 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
3989 int64_t pos, int size)
3991 QEMUIOVector qiov;
3992 struct iovec iov = {
3993 .iov_base = (void *) buf,
3994 .iov_len = size,
3997 qemu_iovec_init_external(&qiov, &iov, 1);
3998 return bdrv_writev_vmstate(bs, &qiov, pos);
4001 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4003 BlockDriver *drv = bs->drv;
4005 if (!drv) {
4006 return -ENOMEDIUM;
4007 } else if (drv->bdrv_save_vmstate) {
4008 return drv->bdrv_save_vmstate(bs, qiov, pos);
4009 } else if (bs->file) {
4010 return bdrv_writev_vmstate(bs->file, qiov, pos);
4013 return -ENOTSUP;
4016 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4017 int64_t pos, int size)
4019 BlockDriver *drv = bs->drv;
4020 if (!drv)
4021 return -ENOMEDIUM;
4022 if (drv->bdrv_load_vmstate)
4023 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4024 if (bs->file)
4025 return bdrv_load_vmstate(bs->file, buf, pos, size);
4026 return -ENOTSUP;
4029 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4031 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4032 return;
4035 bs->drv->bdrv_debug_event(bs, event);
4038 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4039 const char *tag)
4041 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4042 bs = bs->file;
4045 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4046 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4049 return -ENOTSUP;
4052 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4054 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4055 bs = bs->file;
4058 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4059 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4062 return -ENOTSUP;
4065 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4067 while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
4068 bs = bs->file;
4071 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4072 return bs->drv->bdrv_debug_resume(bs, tag);
4075 return -ENOTSUP;
4078 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4080 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4081 bs = bs->file;
4084 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4085 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4088 return false;
4091 int bdrv_is_snapshot(BlockDriverState *bs)
4093 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4096 /* backing_file can either be relative, or absolute, or a protocol. If it is
4097 * relative, it must be relative to the chain. So, passing in bs->filename
4098 * from a BDS as backing_file should not be done, as that may be relative to
4099 * the CWD rather than the chain. */
4100 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4101 const char *backing_file)
4103 char *filename_full = NULL;
4104 char *backing_file_full = NULL;
4105 char *filename_tmp = NULL;
4106 int is_protocol = 0;
4107 BlockDriverState *curr_bs = NULL;
4108 BlockDriverState *retval = NULL;
4110 if (!bs || !bs->drv || !backing_file) {
4111 return NULL;
4114 filename_full = g_malloc(PATH_MAX);
4115 backing_file_full = g_malloc(PATH_MAX);
4116 filename_tmp = g_malloc(PATH_MAX);
4118 is_protocol = path_has_protocol(backing_file);
4120 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4122 /* If either of the filename paths is actually a protocol, then
4123 * compare unmodified paths; otherwise make paths relative */
4124 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4125 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4126 retval = curr_bs->backing_hd;
4127 break;
4129 } else {
4130 /* If not an absolute filename path, make it relative to the current
4131 * image's filename path */
4132 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4133 backing_file);
4135 /* We are going to compare absolute pathnames */
4136 if (!realpath(filename_tmp, filename_full)) {
4137 continue;
4140 /* We need to make sure the backing filename we are comparing against
4141 * is relative to the current image filename (or absolute) */
4142 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4143 curr_bs->backing_file);
4145 if (!realpath(filename_tmp, backing_file_full)) {
4146 continue;
4149 if (strcmp(backing_file_full, filename_full) == 0) {
4150 retval = curr_bs->backing_hd;
4151 break;
4156 g_free(filename_full);
4157 g_free(backing_file_full);
4158 g_free(filename_tmp);
4159 return retval;
4162 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4164 if (!bs->drv) {
4165 return 0;
4168 if (!bs->backing_hd) {
4169 return 0;
4172 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4175 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4177 BlockDriverState *curr_bs = NULL;
4179 if (!bs) {
4180 return NULL;
4183 curr_bs = bs;
4185 while (curr_bs->backing_hd) {
4186 curr_bs = curr_bs->backing_hd;
4188 return curr_bs;
4191 /**************************************************************/
4192 /* async I/Os */
4194 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4195 QEMUIOVector *qiov, int nb_sectors,
4196 BlockDriverCompletionFunc *cb, void *opaque)
4198 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4200 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4201 cb, opaque, false);
4204 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4205 QEMUIOVector *qiov, int nb_sectors,
4206 BlockDriverCompletionFunc *cb, void *opaque)
4208 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4210 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4211 cb, opaque, true);
4214 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4215 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4216 BlockDriverCompletionFunc *cb, void *opaque)
4218 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4220 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4221 BDRV_REQ_ZERO_WRITE | flags,
4222 cb, opaque, true);
4226 typedef struct MultiwriteCB {
4227 int error;
4228 int num_requests;
4229 int num_callbacks;
4230 struct {
4231 BlockDriverCompletionFunc *cb;
4232 void *opaque;
4233 QEMUIOVector *free_qiov;
4234 } callbacks[];
4235 } MultiwriteCB;
4237 static void multiwrite_user_cb(MultiwriteCB *mcb)
4239 int i;
4241 for (i = 0; i < mcb->num_callbacks; i++) {
4242 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4243 if (mcb->callbacks[i].free_qiov) {
4244 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4246 g_free(mcb->callbacks[i].free_qiov);
4250 static void multiwrite_cb(void *opaque, int ret)
4252 MultiwriteCB *mcb = opaque;
4254 trace_multiwrite_cb(mcb, ret);
4256 if (ret < 0 && !mcb->error) {
4257 mcb->error = ret;
4260 mcb->num_requests--;
4261 if (mcb->num_requests == 0) {
4262 multiwrite_user_cb(mcb);
4263 g_free(mcb);
4267 static int multiwrite_req_compare(const void *a, const void *b)
4269 const BlockRequest *req1 = a, *req2 = b;
4272 * Note that we can't simply subtract req2->sector from req1->sector
4273 * here as that could overflow the return value.
4275 if (req1->sector > req2->sector) {
4276 return 1;
4277 } else if (req1->sector < req2->sector) {
4278 return -1;
4279 } else {
4280 return 0;
4285 * Takes a bunch of requests and tries to merge them. Returns the number of
4286 * requests that remain after merging.
4288 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4289 int num_reqs, MultiwriteCB *mcb)
4291 int i, outidx;
4293 // Sort requests by start sector
4294 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4296 // Check if adjacent requests touch the same clusters. If so, combine them,
4297 // filling up gaps with zero sectors.
4298 outidx = 0;
4299 for (i = 1; i < num_reqs; i++) {
4300 int merge = 0;
4301 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4303 // Handle exactly sequential writes and overlapping writes.
4304 if (reqs[i].sector <= oldreq_last) {
4305 merge = 1;
4308 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4309 merge = 0;
4312 if (merge) {
4313 size_t size;
4314 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4315 qemu_iovec_init(qiov,
4316 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4318 // Add the first request to the merged one. If the requests are
4319 // overlapping, drop the last sectors of the first request.
4320 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4321 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4323 // We shouldn't need to add any zeros between the two requests
4324 assert (reqs[i].sector <= oldreq_last);
4326 // Add the second request
4327 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4329 reqs[outidx].nb_sectors = qiov->size >> 9;
4330 reqs[outidx].qiov = qiov;
4332 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4333 } else {
4334 outidx++;
4335 reqs[outidx].sector = reqs[i].sector;
4336 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4337 reqs[outidx].qiov = reqs[i].qiov;
4341 return outidx + 1;
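// Worked example of the merge above: after sorting, requests for sectors
// [0, 8) and [4, 12) touch (4 <= oldreq_last == 8), so they are combined
// into one request for [0, 12): the first 4 sectors come from request 0's
// qiov and all 8 sectors of request 1 are appended after them.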
4345 * Submit multiple AIO write requests at once.
4347 * On success, the function returns 0 and all requests in the reqs array have
4348 * been submitted. On error this function returns -1, and each of the
4349 * requests may or may not have been submitted yet. In particular, this means
4350 * that the callback will be called for some requests and not for others. The
4351 * caller must check the error field of the BlockRequest to wait for the right
4352 * callbacks (if error != 0, no callback will be called).
4354 * The implementation may modify the contents of the reqs array, e.g. to merge
4355 * requests. However, the fields opaque and error are left unmodified as they
4356 * are used to signal failure for a single request to the caller.
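 *
 * A minimal caller-side sketch (illustrative only; my_write_cb, dev and the
 * qiov variables are hypothetical, not part of this file):
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = my_write_cb, .opaque = dev },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = my_write_cb, .opaque = dev },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         ... only requests with reqs[i].error == 0 will get a callback ...
 *     }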
4358 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4360 MultiwriteCB *mcb;
4361 int i;
4363 /* don't submit writes if we don't have a medium */
4364 if (bs->drv == NULL) {
4365 for (i = 0; i < num_reqs; i++) {
4366 reqs[i].error = -ENOMEDIUM;
4368 return -1;
4371 if (num_reqs == 0) {
4372 return 0;
4375 // Create MultiwriteCB structure
4376 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4377 mcb->num_requests = 0;
4378 mcb->num_callbacks = num_reqs;
4380 for (i = 0; i < num_reqs; i++) {
4381 mcb->callbacks[i].cb = reqs[i].cb;
4382 mcb->callbacks[i].opaque = reqs[i].opaque;
4385 // Check for mergeable requests
4386 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4388 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4390 /* Run the aio requests. */
4391 mcb->num_requests = num_reqs;
4392 for (i = 0; i < num_reqs; i++) {
4393 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4394 reqs[i].nb_sectors, reqs[i].flags,
4395 multiwrite_cb, mcb,
4396 true);
4399 return 0;
4402 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4404 acb->aiocb_info->cancel(acb);
4407 /**************************************************************/
4408 /* async block device emulation */
4410 typedef struct BlockDriverAIOCBSync {
4411 BlockDriverAIOCB common;
4412 QEMUBH *bh;
4413 int ret;
4414 /* vector translation state */
4415 QEMUIOVector *qiov;
4416 uint8_t *bounce;
4417 int is_write;
4418 } BlockDriverAIOCBSync;
4420 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4422 BlockDriverAIOCBSync *acb =
4423 container_of(blockacb, BlockDriverAIOCBSync, common);
4424 qemu_bh_delete(acb->bh);
4425 acb->bh = NULL;
4426 qemu_aio_release(acb);
4429 static const AIOCBInfo bdrv_em_aiocb_info = {
4430 .aiocb_size = sizeof(BlockDriverAIOCBSync),
4431 .cancel = bdrv_aio_cancel_em,
4434 static void bdrv_aio_bh_cb(void *opaque)
4436 BlockDriverAIOCBSync *acb = opaque;
4438 if (!acb->is_write)
4439 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4440 qemu_vfree(acb->bounce);
4441 acb->common.cb(acb->common.opaque, acb->ret);
4442 qemu_bh_delete(acb->bh);
4443 acb->bh = NULL;
4444 qemu_aio_release(acb);
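/* Emulate AIO on top of the driver's synchronous bdrv_read/bdrv_write:
 * the request is executed immediately into a bounce buffer, and the
 * completion callback is deferred to a bottom half so that the caller
 * still sees asynchronous completion semantics. */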
4447 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4448 int64_t sector_num,
4449 QEMUIOVector *qiov,
4450 int nb_sectors,
4451 BlockDriverCompletionFunc *cb,
4452 void *opaque,
4453 int is_write)
4456 BlockDriverAIOCBSync *acb;
4458 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4459 acb->is_write = is_write;
4460 acb->qiov = qiov;
4461 acb->bounce = qemu_blockalign(bs, qiov->size);
4462 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4464 if (is_write) {
4465 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4466 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4467 } else {
4468 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4471 qemu_bh_schedule(acb->bh);
4473 return &acb->common;
4476 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4477 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4478 BlockDriverCompletionFunc *cb, void *opaque)
4480 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4483 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4484 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4485 BlockDriverCompletionFunc *cb, void *opaque)
4487 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4491 typedef struct BlockDriverAIOCBCoroutine {
4492 BlockDriverAIOCB common;
4493 BlockRequest req;
4494 bool is_write;
4495 bool *done;
4496 QEMUBH* bh;
4497 } BlockDriverAIOCBCoroutine;
4499 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4501 BlockDriverAIOCBCoroutine *acb =
4502 container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4503 bool done = false;
4505 acb->done = &done;
4506 while (!done) {
4507 qemu_aio_wait();
4511 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4512 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
4513 .cancel = bdrv_aio_co_cancel_em,
4516 static void bdrv_co_em_bh(void *opaque)
4518 BlockDriverAIOCBCoroutine *acb = opaque;
4520 acb->common.cb(acb->common.opaque, acb->req.error);
4522 if (acb->done) {
4523 *acb->done = true;
4526 qemu_bh_delete(acb->bh);
4527 qemu_aio_release(acb);
4530 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4531 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4533 BlockDriverAIOCBCoroutine *acb = opaque;
4534 BlockDriverState *bs = acb->common.bs;
4536 if (!acb->is_write) {
4537 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4538 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4539 } else {
4540 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4541 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4544 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4545 qemu_bh_schedule(acb->bh);
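/* Emulate AIO on top of the coroutine path: a new coroutine runs
 * bdrv_co_do_rw(), which performs the request and then schedules a
 * bottom half (bdrv_co_em_bh) that delivers the completion callback
 * outside of coroutine context. */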
4548 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4549 int64_t sector_num,
4550 QEMUIOVector *qiov,
4551 int nb_sectors,
4552 BdrvRequestFlags flags,
4553 BlockDriverCompletionFunc *cb,
4554 void *opaque,
4555 bool is_write)
4557 Coroutine *co;
4558 BlockDriverAIOCBCoroutine *acb;
4560 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4561 acb->req.sector = sector_num;
4562 acb->req.nb_sectors = nb_sectors;
4563 acb->req.qiov = qiov;
4564 acb->req.flags = flags;
4565 acb->is_write = is_write;
4566 acb->done = NULL;
4568 co = qemu_coroutine_create(bdrv_co_do_rw);
4569 qemu_coroutine_enter(co, acb);
4571 return &acb->common;
4574 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4576 BlockDriverAIOCBCoroutine *acb = opaque;
4577 BlockDriverState *bs = acb->common.bs;
4579 acb->req.error = bdrv_co_flush(bs);
4580 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4581 qemu_bh_schedule(acb->bh);
4584 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4585 BlockDriverCompletionFunc *cb, void *opaque)
4587 trace_bdrv_aio_flush(bs, opaque);
4589 Coroutine *co;
4590 BlockDriverAIOCBCoroutine *acb;
4592 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4593 acb->done = NULL;
4595 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4596 qemu_coroutine_enter(co, acb);
4598 return &acb->common;
4601 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4603 BlockDriverAIOCBCoroutine *acb = opaque;
4604 BlockDriverState *bs = acb->common.bs;
4606 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4607 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4608 qemu_bh_schedule(acb->bh);
4611 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4612 int64_t sector_num, int nb_sectors,
4613 BlockDriverCompletionFunc *cb, void *opaque)
4615 Coroutine *co;
4616 BlockDriverAIOCBCoroutine *acb;
4618 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4620 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4621 acb->req.sector = sector_num;
4622 acb->req.nb_sectors = nb_sectors;
4623 acb->done = NULL;
4624 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4625 qemu_coroutine_enter(co, acb);
4627 return &acb->common;
4630 void bdrv_init(void)
4632 module_call_init(MODULE_INIT_BLOCK);
4635 void bdrv_init_with_whitelist(void)
4637 use_bdrv_whitelist = 1;
4638 bdrv_init();
4641 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4642 BlockDriverCompletionFunc *cb, void *opaque)
4644 BlockDriverAIOCB *acb;
4646 acb = g_slice_alloc(aiocb_info->aiocb_size);
4647 acb->aiocb_info = aiocb_info;
4648 acb->bs = bs;
4649 acb->cb = cb;
4650 acb->opaque = opaque;
4651 return acb;
4654 void qemu_aio_release(void *p)
4656 BlockDriverAIOCB *acb = p;
4657 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4660 /**************************************************************/
4661 /* Coroutine block device emulation */
4663 typedef struct CoroutineIOCompletion {
4664 Coroutine *coroutine;
4665 int ret;
4666 } CoroutineIOCompletion;
4668 static void bdrv_co_io_em_complete(void *opaque, int ret)
4670 CoroutineIOCompletion *co = opaque;
4672 co->ret = ret;
4673 qemu_coroutine_enter(co->coroutine, NULL);
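/* Bridge a coroutine request onto a driver's AIO interface: submit the
 * request with bdrv_co_io_em_complete() as its callback and yield; the
 * callback stores the return value in the CoroutineIOCompletion and
 * re-enters the coroutine. */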
4676 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4677 int nb_sectors, QEMUIOVector *iov,
4678 bool is_write)
4680 CoroutineIOCompletion co = {
4681 .coroutine = qemu_coroutine_self(),
4683 BlockDriverAIOCB *acb;
4685 if (is_write) {
4686 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4687 bdrv_co_io_em_complete, &co);
4688 } else {
4689 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4690 bdrv_co_io_em_complete, &co);
4693 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4694 if (!acb) {
4695 return -EIO;
4697 qemu_coroutine_yield();
4699 return co.ret;
4702 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4703 int64_t sector_num, int nb_sectors,
4704 QEMUIOVector *iov)
4706 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4709 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4710 int64_t sector_num, int nb_sectors,
4711 QEMUIOVector *iov)
4713 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4716 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4718 RwCo *rwco = opaque;
4720 rwco->ret = bdrv_co_flush(rwco->bs);
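/* Flush cascade: first let the format driver flush its internal caches to
 * the OS (bdrv_co_flush_to_os); then, unless BDRV_O_NO_FLUSH is set, force
 * the data to stable storage (bdrv_co_flush_to_disk or bdrv_aio_flush);
 * finally recurse into bs->file so the protocol layer is flushed as well. */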
4723 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4725 int ret;
4727 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4728 return 0;
4731 /* Write back cached data to the OS even with cache=unsafe */
4732 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4733 if (bs->drv->bdrv_co_flush_to_os) {
4734 ret = bs->drv->bdrv_co_flush_to_os(bs);
4735 if (ret < 0) {
4736 return ret;
4740 /* But don't actually force it to the disk with cache=unsafe */
4741 if (bs->open_flags & BDRV_O_NO_FLUSH) {
4742 goto flush_parent;
4745 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4746 if (bs->drv->bdrv_co_flush_to_disk) {
4747 ret = bs->drv->bdrv_co_flush_to_disk(bs);
4748 } else if (bs->drv->bdrv_aio_flush) {
4749 BlockDriverAIOCB *acb;
4750 CoroutineIOCompletion co = {
4751 .coroutine = qemu_coroutine_self(),
4754 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4755 if (acb == NULL) {
4756 ret = -EIO;
4757 } else {
4758 qemu_coroutine_yield();
4759 ret = co.ret;
4761 } else {
4763 * Some block drivers always operate in either writethrough or unsafe
4764 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4765 * know how the server works (because the behaviour is hardcoded or
4766 * depends on server-side configuration), so we can't ensure that
4767 * everything is safe on disk. Returning an error doesn't work because
4768 * that would break guests even if the server operates in writethrough
4769 * mode.
4771 * Let's hope the user knows what they're doing.
4773 ret = 0;
4775 if (ret < 0) {
4776 return ret;
4779 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4780 * set in the case of cache=unsafe, so there are no useless flushes.
4782 flush_parent:
4783 return bdrv_co_flush(bs->file);
4786 void bdrv_invalidate_cache(BlockDriverState *bs)
4788 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
4789 bs->drv->bdrv_invalidate_cache(bs);
4793 void bdrv_invalidate_cache_all(void)
4795 BlockDriverState *bs;
4797 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4798 bdrv_invalidate_cache(bs);
4802 void bdrv_clear_incoming_migration_all(void)
4804 BlockDriverState *bs;
4806 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4807 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
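/* Synchronous wrapper pattern: when already in coroutine context, call the
 * coroutine entry directly; otherwise spawn a coroutine and pump
 * qemu_aio_wait() until rwco.ret changes from NOT_DONE. bdrv_discard()
 * below follows the same pattern. */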
4811 int bdrv_flush(BlockDriverState *bs)
4813 Coroutine *co;
4814 RwCo rwco = {
4815 .bs = bs,
4816 .ret = NOT_DONE,
4819 if (qemu_in_coroutine()) {
4820 /* Fast-path if already in coroutine context */
4821 bdrv_flush_co_entry(&rwco);
4822 } else {
4823 co = qemu_coroutine_create(bdrv_flush_co_entry);
4824 qemu_coroutine_enter(co, &rwco);
4825 while (rwco.ret == NOT_DONE) {
4826 qemu_aio_wait();
4830 return rwco.ret;
4833 typedef struct DiscardCo {
4834 BlockDriverState *bs;
4835 int64_t sector_num;
4836 int nb_sectors;
4837 int ret;
4838 } DiscardCo;
4839 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4841 DiscardCo *rwco = opaque;
4843 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4846 /* If no limit is specified in the BlockLimits, use a default
4847 * of 32768 512-byte sectors (16 MiB) per request.
4849 #define MAX_DISCARD_DEFAULT 32768
4851 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4852 int nb_sectors)
4854 int max_discard;
4856 if (!bs->drv) {
4857 return -ENOMEDIUM;
4858 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4859 return -EIO;
4860 } else if (bs->read_only) {
4861 return -EROFS;
4864 bdrv_reset_dirty(bs, sector_num, nb_sectors);
4866 /* Do nothing if disabled. */
4867 if (!(bs->open_flags & BDRV_O_UNMAP)) {
4868 return 0;
4871 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4872 return 0;
4875 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4876 while (nb_sectors > 0) {
4877 int ret;
4878 int num = nb_sectors;
4880 /* align request */
4881 if (bs->bl.discard_alignment &&
4882 num >= bs->bl.discard_alignment &&
4883 sector_num % bs->bl.discard_alignment) {
4884 if (num > bs->bl.discard_alignment) {
4885 num = bs->bl.discard_alignment;
4887 num -= sector_num % bs->bl.discard_alignment;
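/* Example: with discard_alignment == 8, sector_num == 3 and num == 20,
 * num is first clamped to 8 and then reduced by 3 % 8 == 3, so the first
 * chunk covers sectors [3, 8) and every later chunk starts aligned. */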
4890 /* limit request size */
4891 if (num > max_discard) {
4892 num = max_discard;
4895 if (bs->drv->bdrv_co_discard) {
4896 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
4897 } else {
4898 BlockDriverAIOCB *acb;
4899 CoroutineIOCompletion co = {
4900 .coroutine = qemu_coroutine_self(),
4903 acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
4904 bdrv_co_io_em_complete, &co);
4905 if (acb == NULL) {
4906 return -EIO;
4907 } else {
4908 qemu_coroutine_yield();
4909 ret = co.ret;
4912 if (ret && ret != -ENOTSUP) {
4913 return ret;
4916 sector_num += num;
4917 nb_sectors -= num;
4919 return 0;
4922 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
4924 Coroutine *co;
4925 DiscardCo rwco = {
4926 .bs = bs,
4927 .sector_num = sector_num,
4928 .nb_sectors = nb_sectors,
4929 .ret = NOT_DONE,
4932 if (qemu_in_coroutine()) {
4933 /* Fast-path if already in coroutine context */
4934 bdrv_discard_co_entry(&rwco);
4935 } else {
4936 co = qemu_coroutine_create(bdrv_discard_co_entry);
4937 qemu_coroutine_enter(co, &rwco);
4938 while (rwco.ret == NOT_DONE) {
4939 qemu_aio_wait();
4943 return rwco.ret;
4946 /**************************************************************/
4947 /* removable device support */
4950 * Return TRUE if the media is present
4952 int bdrv_is_inserted(BlockDriverState *bs)
4954 BlockDriver *drv = bs->drv;
4956 if (!drv)
4957 return 0;
4958 if (!drv->bdrv_is_inserted)
4959 return 1;
4960 return drv->bdrv_is_inserted(bs);
4964 * Return whether the media changed since the last call to this
4965 * function, or -ENOTSUP if we don't know. Most drivers don't know.
4967 int bdrv_media_changed(BlockDriverState *bs)
4969 BlockDriver *drv = bs->drv;
4971 if (drv && drv->bdrv_media_changed) {
4972 return drv->bdrv_media_changed(bs);
4974 return -ENOTSUP;
4978 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
4980 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4982 BlockDriver *drv = bs->drv;
4984 if (drv && drv->bdrv_eject) {
4985 drv->bdrv_eject(bs, eject_flag);
4988 if (bs->device_name[0] != '\0') {
4989 bdrv_emit_qmp_eject_event(bs, eject_flag);
4994 * Lock or unlock the media (if it is locked, the user won't be able
4995 * to eject it manually).
4997 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4999 BlockDriver *drv = bs->drv;
5001 trace_bdrv_lock_medium(bs, locked);
5003 if (drv && drv->bdrv_lock_medium) {
5004 drv->bdrv_lock_medium(bs, locked);
5008 /* needed for generic scsi interface */
5010 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5012 BlockDriver *drv = bs->drv;
5014 if (drv && drv->bdrv_ioctl)
5015 return drv->bdrv_ioctl(bs, req, buf);
5016 return -ENOTSUP;
5019 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5020 unsigned long int req, void *buf,
5021 BlockDriverCompletionFunc *cb, void *opaque)
5023 BlockDriver *drv = bs->drv;
5025 if (drv && drv->bdrv_aio_ioctl)
5026 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5027 return NULL;
5030 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5032 bs->guest_block_size = align;
5035 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5037 return qemu_memalign(bdrv_opt_mem_align(bs), size);
5041 * Check if all memory in this vector meets the device's memory alignment.
5043 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5045 int i;
5046 size_t alignment = bdrv_opt_mem_align(bs);
5048 for (i = 0; i < qiov->niov; i++) {
5049 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5050 return false;
5052 if (qiov->iov[i].iov_len % alignment) {
5053 return false;
5057 return true;
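/* Worked example for the granularity conversion below: a byte granularity
 * of 65536 becomes 128 sectors after the shift, and ffs(128) - 1 == 7, so
 * hbitmap_alloc() makes each bitmap bit cover 2^7 = 128 sectors. */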
5060 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
5062 int64_t bitmap_size;
5063 BdrvDirtyBitmap *bitmap;
5065 assert((granularity & (granularity - 1)) == 0);
5067 granularity >>= BDRV_SECTOR_BITS;
5068 assert(granularity);
5069 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
5070 bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
5071 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5072 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5073 return bitmap;
5076 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5078 BdrvDirtyBitmap *bm, *next;
5079 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5080 if (bm == bitmap) {
5081 QLIST_REMOVE(bitmap, list);
5082 hbitmap_free(bitmap->bitmap);
5083 g_free(bitmap);
5084 return;
5089 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5091 BdrvDirtyBitmap *bm;
5092 BlockDirtyInfoList *list = NULL;
5093 BlockDirtyInfoList **plist = &list;
5095 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5096 BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5097 BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5098 info->count = bdrv_get_dirty_count(bs, bm);
5099 info->granularity =
5100 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5101 entry->value = info;
5102 *plist = entry;
5103 plist = &entry->next;
5106 return list;
5109 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5111 if (bitmap) {
5112 return hbitmap_get(bitmap->bitmap, sector);
5113 } else {
5114 return 0;
5118 void bdrv_dirty_iter_init(BlockDriverState *bs,
5119 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5121 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5124 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5125 int nr_sectors)
5127 BdrvDirtyBitmap *bitmap;
5128 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5129 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5133 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5135 BdrvDirtyBitmap *bitmap;
5136 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5137 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5141 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5143 return hbitmap_count(bitmap->bitmap);
5146 /* Get a reference to bs */
5147 void bdrv_ref(BlockDriverState *bs)
5149 bs->refcnt++;
5152 /* Release a previously grabbed reference to bs.
5153 * If the reference count drops to zero, the BlockDriverState is
5154 * deleted. */
5155 void bdrv_unref(BlockDriverState *bs)
5157 assert(bs->refcnt > 0);
5158 if (--bs->refcnt == 0) {
5159 bdrv_delete(bs);
5163 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
5165 assert(bs->in_use != in_use);
5166 bs->in_use = in_use;
5169 int bdrv_in_use(BlockDriverState *bs)
5171 return bs->in_use;
5174 void bdrv_iostatus_enable(BlockDriverState *bs)
5176 bs->iostatus_enabled = true;
5177 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5180 /* The I/O status is only enabled if the drive explicitly
5181 * enables it _and_ the VM is configured to stop on errors */
5182 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5184 return (bs->iostatus_enabled &&
5185 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5186 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5187 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5190 void bdrv_iostatus_disable(BlockDriverState *bs)
5192 bs->iostatus_enabled = false;
5195 void bdrv_iostatus_reset(BlockDriverState *bs)
5197 if (bdrv_iostatus_is_enabled(bs)) {
5198 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5199 if (bs->job) {
5200 block_job_iostatus_reset(bs->job);
5205 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5207 assert(bdrv_iostatus_is_enabled(bs));
5208 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5209 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5210 BLOCK_DEVICE_IO_STATUS_FAILED;
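/* Accounting usage sketch (illustrative; the qiov and the completion hook
 * are hypothetical):
 *
 *     BlockAcctCookie cookie;
 *     bdrv_acct_start(bs, &cookie, qiov->size, BDRV_ACCT_WRITE);
 *     ... submit the request, then from its completion callback: ...
 *     bdrv_acct_done(bs, &cookie);
 */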
5214 void
5215 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5216 enum BlockAcctType type)
5218 assert(type < BDRV_MAX_IOTYPE);
5220 cookie->bytes = bytes;
5221 cookie->start_time_ns = get_clock();
5222 cookie->type = type;
5225 void
5226 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5228 assert(cookie->type < BDRV_MAX_IOTYPE);
5230 bs->nr_bytes[cookie->type] += cookie->bytes;
5231 bs->nr_ops[cookie->type]++;
5232 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5235 void bdrv_img_create(const char *filename, const char *fmt,
5236 const char *base_filename, const char *base_fmt,
5237 char *options, uint64_t img_size, int flags,
5238 Error **errp, bool quiet)
5240 QEMUOptionParameter *param = NULL, *create_options = NULL;
5241 QEMUOptionParameter *backing_fmt, *backing_file, *size;
5242 BlockDriver *drv, *proto_drv;
5243 BlockDriver *backing_drv = NULL;
5244 Error *local_err = NULL;
5245 int ret = 0;
5247 /* Find driver and parse its options */
5248 drv = bdrv_find_format(fmt);
5249 if (!drv) {
5250 error_setg(errp, "Unknown file format '%s'", fmt);
5251 return;
5254 proto_drv = bdrv_find_protocol(filename, true);
5255 if (!proto_drv) {
5256 error_setg(errp, "Unknown protocol '%s'", filename);
5257 return;
5260 create_options = append_option_parameters(create_options,
5261 drv->create_options);
5262 create_options = append_option_parameters(create_options,
5263 proto_drv->create_options);
5265 /* Create parameter list with default values */
5266 param = parse_option_parameters("", create_options, param);
5268 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5270 /* Parse -o options */
5271 if (options) {
5272 param = parse_option_parameters(options, create_options, param);
5273 if (param == NULL) {
5274 error_setg(errp, "Invalid options for file format '%s'.", fmt);
5275 goto out;
5279 if (base_filename) {
5280 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5281 base_filename)) {
5282 error_setg(errp, "Backing file not supported for file format '%s'",
5283 fmt);
5284 goto out;
5288 if (base_fmt) {
5289 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5290 error_setg(errp, "Backing file format not supported for file "
5291 "format '%s'", fmt);
5292 goto out;
5296 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5297 if (backing_file && backing_file->value.s) {
5298 if (!strcmp(filename, backing_file->value.s)) {
5299 error_setg(errp, "Error: Trying to create an image with the "
5300 "same filename as the backing file");
5301 goto out;
5305 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5306 if (backing_fmt && backing_fmt->value.s) {
5307 backing_drv = bdrv_find_format(backing_fmt->value.s);
5308 if (!backing_drv) {
5309 error_setg(errp, "Unknown backing file format '%s'",
5310 backing_fmt->value.s);
5311 goto out;
5315 // The size for the image must always be specified, with one exception:
5316 // If we are using a backing file, we can obtain the size from there
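// For example, "qemu-img create -f qcow2 -b base.qcow2 overlay.qcow2"
// omits the size, which is then read from base.qcow2 by the code below.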
5317 size = get_option_parameter(param, BLOCK_OPT_SIZE);
5318 if (size && size->value.n == -1) {
5319 if (backing_file && backing_file->value.s) {
5320 BlockDriverState *bs;
5321 uint64_t size;
5322 char buf[32];
5323 int back_flags;
5325 /* backing files always opened read-only */
5326 back_flags =
5327 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5329 bs = NULL;
5330 ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
5331 backing_drv, &local_err);
5332 if (ret < 0) {
5333 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5334 backing_file->value.s,
5335 error_get_pretty(local_err));
5336 error_free(local_err);
5337 local_err = NULL;
5338 goto out;
5340 bdrv_get_geometry(bs, &size);
5341 size *= 512;
5343 snprintf(buf, sizeof(buf), "%" PRId64, size);
5344 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
5346 bdrv_unref(bs);
5347 } else {
5348 error_setg(errp, "Image creation needs a size parameter");
5349 goto out;
5353 if (!quiet) {
5354 printf("Formatting '%s', fmt=%s ", filename, fmt);
5355 print_option_parameters(param);
5356 puts("");
5358 ret = bdrv_create(drv, filename, param, &local_err);
5359 if (ret == -EFBIG) {
5360 /* This is generally a better message than whatever the driver would
5361 * deliver (especially because of the cluster_size_hint), since that
5362 * is most probably not much different from "image too large". */
5363 const char *cluster_size_hint = "";
5364 if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5365 cluster_size_hint = " (try using a larger cluster size)";
5367 error_setg(errp, "The image size is too large for file format '%s'"
5368 "%s", fmt, cluster_size_hint);
5369 error_free(local_err);
5370 local_err = NULL;
5373 out:
5374 free_option_parameters(create_options);
5375 free_option_parameters(param);
5377 if (local_err) {
5378 error_propagate(errp, local_err);
5382 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5384 /* Currently BlockDriverState always uses the main loop AioContext */
5385 return qemu_get_aio_context();
5388 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5389 NotifierWithReturn *notifier)
5391 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5394 int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5396 if (bs->drv->bdrv_amend_options == NULL) {
5397 return -ENOTSUP;
5399 return bs->drv->bdrv_amend_options(bs, options);
5402 /* Used to recurse on single child block filters.
5403 * Single child block filters store their child in bs->file.
5405 bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
5406 BlockDriverState *candidate)
5408 if (!bs->drv) {
5409 return false;
5412 if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
5413 if (bs == candidate) {
5414 return true;
5415 } else {
5416 return false;
5420 if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
5421 return false;
5424 if (!bs->file) {
5425 return false;
5428 return bdrv_recurse_is_first_non_filter(bs->file, candidate);
5431 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5432 BlockDriverState *candidate)
5434 if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
5435 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5438 return bdrv_generic_is_first_non_filter(bs, candidate);
5441 /* This function checks if the candidate is the first non-filter bs down its
5442 * bs chain. Since we don't have pointers to parents, it explores all bs
5443 * chains from the top. Some filters can choose not to pass down the recursion.
5445 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5447 BlockDriverState *bs;
5449 /* walk down the bs forest recursively */
5450 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5451 bool perm;
5453 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5455 /* candidate is the first non filter */
5456 if (perm) {
5457 return true;
5461 return false;