/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif
struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif
/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
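
/*
 * Illustrative usage sketch (not part of the original file): per the comment
 * above, a caller enables throttling before configuring the limits.  The
 * ThrottleConfig bucket layout below is an assumption for illustration only.
 */
#if 0
static void example_set_iops_limit(BlockDriverState *bs)
{
    ThrottleConfig cfg = {};

    cfg.buckets[THROTTLE_OPS_TOTAL].avg = 1000; /* cap at roughly 1000 IOPS */

    if (!bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);   /* sets up timers and throttle_state */
    }
    bdrv_set_io_limits(bs, &cfg);    /* applies cfg and restarts queued I/O */
}
#endif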
/* This function makes an IO wait if needed
 *
 * @bytes:    the size of the IO in bytes
 * @is_write: is the IO a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or if any request of this type is already being
     * throttled, queue the I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
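
/*
 * Example (illustrative, not part of the original file): combining a backing
 * file name that is relative to its overlay:
 *
 *     char buf[PATH_MAX];
 *     path_combine(buf, sizeof(buf), "/images/overlay.qcow2", "base.qcow2");
 *     // buf now contains "/images/base.qcow2"
 */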
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}
BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}
typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
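
/*
 * Usage sketch (illustrative only): callers pass a buffer of at least
 * PATH_MAX (or MAX_PATH on Windows) bytes and are responsible for
 * unlinking the file when they are done with it.
 */
#if 0
char tmp[PATH_MAX + 1];
int ret = get_tmp_filename(tmp, sizeof(tmp));
if (ret < 0) {
    /* report -ret as an errno value */
}
/* ... use tmp, then unlink(tmp) ... */
#endif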
/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}
BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}
/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
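
/*
 * Example (illustrative): mapping the -drive cache= option string onto open
 * flags.  Per the code above, "none" sets both BDRV_O_NOCACHE (bypass the
 * host page cache) and BDRV_O_CACHE_WB (guest-visible writeback semantics).
 */
#if 0
int flags = 0;
if (bdrv_parse_cache_flags("none", &flags) < 0) {
    /* invalid cache mode string */
}
/* here flags == (BDRV_O_NOCACHE | BDRV_O_CACHE_WB) */
#endif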
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}
static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() with directly using a protocol as drv. This layer is already
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
     * and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}
/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}
void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}
/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}
/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
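
/*
 * Usage sketch (illustrative, not from the original source): opening a qcow2
 * image read-write with an options QDict.  As documented above, the reference
 * to 'opts' is consumed by bdrv_open() even on failure.
 */
#if 0
BlockDriverState *bs = NULL;
QDict *opts = qdict_new();
Error *err = NULL;

qdict_put(opts, "driver", qstring_from_str("qcow2"));
if (bdrv_open(&bs, "disk.qcow2", NULL, opts, BDRV_O_RDWR, NULL, &err) < 0) {
    fprintf(stderr, "%s\n", error_get_pretty(err));
    error_free(err);
}
#endif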
typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or alternatively may be NULL, in which case
 * a new BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
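
/*
 * Usage sketch (illustrative): atomically switching two devices to read-only.
 * bdrv_reopen() above is the single-device convenience wrapper around the
 * same queue machinery; bs_a and bs_b are hypothetical devices.
 */
#if 0
BlockReopenQueue *queue = NULL;
Error *err = NULL;

queue = bdrv_reopen_queue(queue, bs_a, bs_a->open_flags & ~BDRV_O_RDWR);
queue = bdrv_reopen_queue(queue, bs_b, bs_b->open_flags & ~BDRV_O_RDWR);
if (bdrv_reopen_multiple(queue, &err) < 0) {
    /* no device was changed; err describes the first failure */
}
#endif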
/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  bdrv_get_device_name(reopen_state->bs));
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, bdrv_get_device_name(reopen_state->bs),
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}
/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs, NULL);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}
void bdrv_close(BlockDriverState *bs)
{
    BdrvAioNotifier *ban, *ban_next;

    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            BlockDriverState *backing_hd = bs->backing_hd;
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;
        QDECREF(bs->full_open_options);
        bs->full_open_options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    if (bs->blk) {
        blk_dev_change_media_cb(bs->blk, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
    QLIST_INIT(&bs->aio_notifiers);
}
void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_close(bs);
        aio_context_release(aio_context);
    }
}
/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        busy = false;

        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            AioContext *aio_context = bdrv_get_aio_context(bs);
            bool bs_busy;

            aio_context_acquire(aio_context);
            bdrv_flush_io_queue(bs);
            bdrv_start_throttled_reqs(bs);
            bs_busy = bdrv_requests_pending(bs);
            bs_busy |= aio_poll(aio_context, bs_busy);
            aio_context_release(aio_context);

            busy |= bs_busy;
        }
    }
}
/* make a BlockDriverState anonymous by removing it from the bdrv_states and
   graph_bdrv_states lists.
   Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    /*
     * Take care to remove bs from bdrv_states only when it's actually
     * in it.  Note that bs->device_list.tqe_prev is initially null,
     * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
     * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
     * resetting it to null on remove.
     */
    if (bs->device_list.tqe_prev) {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
        bs->device_list.tqe_prev = NULL;
    }
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}
static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */

    /* dev info */
    bs_dest->guest_block_size = bs_src->guest_block_size;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt = bs_src->refcnt;

    /* job */
    bs_dest->job = bs_src->job;

    /* keep the same entry in bdrv_states */
    bs_dest->device_list = bs_src->device_list;
    bs_dest->blk = bs_src->blk;

    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
           sizeof(bs_dest->op_blockers));
}
/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* The code needs to swap the node_name but simply swapping node_list won't
     * work so first remove the nodes from the graph list, do the swap then
     * insert them back if needed.
     */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
    }

    /* bs_new must be unattached and shouldn't have anything fancy enabled */
    assert(!bs_new->blk);
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new must remain unattached */
    assert(!bs_new->blk);

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* insert the nodes back into the graph node list if needed */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
    }

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bdrv_set_backing_hd(bs_top, bs_new);
}
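
/*
 * Sketch (illustrative): this is the mechanism behind live snapshots.  A
 * freshly opened overlay is spliced on top of the active image, and the old
 * top becomes its backing file, while the device keeps pointing at the same
 * BlockDriverState:
 *
 *     before: device -> top
 *     after:  device -> overlay -> top
 *
 *     bdrv_append(overlay_bs, top_bs);
 */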
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->job);
    assert(bdrv_op_blocker_is_empty(bs));
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}
/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv == NULL) {
        return -ENOMEDIUM;
    }
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}
2135 #define COMMIT_BUF_SECTORS 2048
2137 /* commit COW file into the backing image */
2138 int bdrv_commit(BlockDriverState *bs)
2140 BlockDriver *drv = bs->drv;
2141 int64_t sector, total_sectors, length, backing_length;
2142 int n, ro, open_flags;
2143 int ret = 0;
2144 uint8_t *buf = NULL;
2145 char filename[PATH_MAX];
2147 if (!drv)
2148 return -ENOMEDIUM;
2150 if (!bs->backing_hd) {
2151 return -ENOTSUP;
2154 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2155 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2156 return -EBUSY;
2159 ro = bs->backing_hd->read_only;
2160 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2161 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2162 open_flags = bs->backing_hd->open_flags;
2164 if (ro) {
2165 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2166 return -EACCES;
2170 length = bdrv_getlength(bs);
2171 if (length < 0) {
2172 ret = length;
2173 goto ro_cleanup;
2176 backing_length = bdrv_getlength(bs->backing_hd);
2177 if (backing_length < 0) {
2178 ret = backing_length;
2179 goto ro_cleanup;
2182 /* If our top snapshot is larger than the backing file image,
2183 * grow the backing file image if possible. If not possible,
2184 * we must return an error */
2185 if (length > backing_length) {
2186 ret = bdrv_truncate(bs->backing_hd, length);
2187 if (ret < 0) {
2188 goto ro_cleanup;
2192 total_sectors = length >> BDRV_SECTOR_BITS;
2194 /* qemu_try_blockalign() for bs will choose an alignment that works for
2195 * bs->backing_hd as well, so no need to compare the alignment manually. */
2196 buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2197 if (buf == NULL) {
2198 ret = -ENOMEM;
2199 goto ro_cleanup;
2202 for (sector = 0; sector < total_sectors; sector += n) {
2203 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2204 if (ret < 0) {
2205 goto ro_cleanup;
2207 if (ret) {
2208 ret = bdrv_read(bs, sector, buf, n);
2209 if (ret < 0) {
2210 goto ro_cleanup;
2213 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2214 if (ret < 0) {
2215 goto ro_cleanup;
2220 if (drv->bdrv_make_empty) {
2221 ret = drv->bdrv_make_empty(bs);
2222 if (ret < 0) {
2223 goto ro_cleanup;
2225 bdrv_flush(bs);
2229 * Make sure all data we wrote to the backing device is actually
2230 * stable on disk.
2232 if (bs->backing_hd) {
2233 bdrv_flush(bs->backing_hd);
2236 ret = 0;
2237 ro_cleanup:
2238 qemu_vfree(buf);
2240 if (ro) {
2241 /* ignoring error return here */
2242 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2245 return ret;
2248 int bdrv_commit_all(void)
2250 BlockDriverState *bs;
2252 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2253 AioContext *aio_context = bdrv_get_aio_context(bs);
2255 aio_context_acquire(aio_context);
2256 if (bs->drv && bs->backing_hd) {
2257 int ret = bdrv_commit(bs);
2258 if (ret < 0) {
2259 aio_context_release(aio_context);
2260 return ret;
2263 aio_context_release(aio_context);
2265 return 0;
2269 * Remove an active request from the tracked requests list
2271 * This function should be called when a tracked request is completing.
2273 static void tracked_request_end(BdrvTrackedRequest *req)
2275 if (req->serialising) {
2276 req->bs->serialising_in_flight--;
2279 QLIST_REMOVE(req, list);
2280 qemu_co_queue_restart_all(&req->wait_queue);
2284 * Add an active request to the tracked requests list
2286 static void tracked_request_begin(BdrvTrackedRequest *req,
2287 BlockDriverState *bs,
2288 int64_t offset,
2289 unsigned int bytes, bool is_write)
2291 *req = (BdrvTrackedRequest){
2292 .bs = bs,
2293 .offset = offset,
2294 .bytes = bytes,
2295 .is_write = is_write,
2296 .co = qemu_coroutine_self(),
2297 .serialising = false,
2298 .overlap_offset = offset,
2299 .overlap_bytes = bytes,
2302 qemu_co_queue_init(&req->wait_queue);
2304 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2307 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2309 int64_t overlap_offset = req->offset & ~(align - 1);
2310 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2311 - overlap_offset;
2313 if (!req->serialising) {
2314 req->bs->serialising_in_flight++;
2315 req->serialising = true;
2318 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2319 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2323 * Round a region to cluster boundaries
2325 void bdrv_round_to_clusters(BlockDriverState *bs,
2326 int64_t sector_num, int nb_sectors,
2327 int64_t *cluster_sector_num,
2328 int *cluster_nb_sectors)
2330 BlockDriverInfo bdi;
2332 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2333 *cluster_sector_num = sector_num;
2334 *cluster_nb_sectors = nb_sectors;
2335 } else {
2336 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2337 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2338 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2339 nb_sectors, c);
2343 static int bdrv_get_cluster_size(BlockDriverState *bs)
2345 BlockDriverInfo bdi;
2346 int ret;
2348 ret = bdrv_get_info(bs, &bdi);
2349 if (ret < 0 || bdi.cluster_size == 0) {
2350 return bs->request_alignment;
2351 } else {
2352 return bdi.cluster_size;
2356 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2357 int64_t offset, unsigned int bytes)
2359 /* aaaa bbbb */
2360 if (offset >= req->overlap_offset + req->overlap_bytes) {
2361 return false;
2363 /* bbbb aaaa */
2364 if (req->overlap_offset >= offset + bytes) {
2365 return false;
2367 return true;
2370 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2372 BlockDriverState *bs = self->bs;
2373 BdrvTrackedRequest *req;
2374 bool retry;
2375 bool waited = false;
2377 if (!bs->serialising_in_flight) {
2378 return false;
2381 do {
2382 retry = false;
2383 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2384 if (req == self || (!req->serialising && !self->serialising)) {
2385 continue;
2387 if (tracked_request_overlaps(req, self->overlap_offset,
2388 self->overlap_bytes))
2390 /* Hitting this means there was a reentrant request, for
2391 * example, a block driver issuing nested requests. This must
2392 * never happen since it means deadlock.
2394 assert(qemu_coroutine_self() != req->co);
2396 /* If the request is already (indirectly) waiting for us, or
2397 * will wait for us as soon as it wakes up, then just go on
2398 * (instead of producing a deadlock in the former case). */
2399 if (!req->waiting_for) {
2400 self->waiting_for = req;
2401 qemu_co_queue_wait(&req->wait_queue);
2402 self->waiting_for = NULL;
2403 retry = true;
2404 waited = true;
2405 break;
2409 } while (retry);
2411 return waited;
2415 * Return values:
2416 * 0 - success
2417 * -EINVAL - backing format specified, but no file
2418 * -ENOSPC - can't update the backing file because no space is left in the
2419 * image file header
2420 * -ENOTSUP - format driver doesn't support changing the backing file
2422 int bdrv_change_backing_file(BlockDriverState *bs,
2423 const char *backing_file, const char *backing_fmt)
2425 BlockDriver *drv = bs->drv;
2426 int ret;
2428 /* Backing file format doesn't make sense without a backing file */
2429 if (backing_fmt && !backing_file) {
2430 return -EINVAL;
2433 if (drv->bdrv_change_backing_file != NULL) {
2434 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2435 } else {
2436 ret = -ENOTSUP;
2439 if (ret == 0) {
2440 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2441 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2443 return ret;
2447 * Finds the image layer in the chain that has 'bs' as its backing file.
2449 * active is the current topmost image.
2451 * Returns NULL if bs is not found in active's image chain,
2452 * or if active == bs.
2454 * Returns the bottommost base image if bs == NULL.
2456 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2457 BlockDriverState *bs)
2459 while (active && bs != active->backing_hd) {
2460 active = active->backing_hd;
2463 return active;
2466 /* Given a BDS, searches for the base layer. */
2467 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2469 return bdrv_find_overlay(bs, NULL);
2472 typedef struct BlkIntermediateStates {
2473 BlockDriverState *bs;
2474 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2475 } BlkIntermediateStates;
2479 * Drops images above 'base' up to and including 'top', and sets the image
2480 * above 'top' to have base as its backing file.
2482 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2483 * information in 'bs' can be properly updated.
2485 * E.g., this will convert the following chain:
2486 * bottom <- base <- intermediate <- top <- active
2488 * to
2490 * bottom <- base <- active
2492 * It is allowed for bottom==base, in which case it converts:
2494 * base <- intermediate <- top <- active
2496 * to
2498 * base <- active
2500 * If backing_file_str is non-NULL, it will be used when modifying top's
2501 * overlay image metadata.
2503 * Error conditions:
2504 * if active == top, that is considered an error
2507 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2508 BlockDriverState *base, const char *backing_file_str)
2510 BlockDriverState *intermediate;
2511 BlockDriverState *base_bs = NULL;
2512 BlockDriverState *new_top_bs = NULL;
2513 BlkIntermediateStates *intermediate_state, *next;
2514 int ret = -EIO;
2516 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2517 QSIMPLEQ_INIT(&states_to_delete);
2519 if (!top->drv || !base->drv) {
2520 goto exit;
2523 new_top_bs = bdrv_find_overlay(active, top);
2525 if (new_top_bs == NULL) {
2526 /* we could not find the image above 'top', this is an error */
2527 goto exit;
2530 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2531 * to do, no intermediate images */
2532 if (new_top_bs->backing_hd == base) {
2533 ret = 0;
2534 goto exit;
2537 intermediate = top;
2539 /* now we will go down through the list, and add each BDS we find
2540 * into our deletion queue, until we hit the 'base'
2542 while (intermediate) {
2543 intermediate_state = g_new0(BlkIntermediateStates, 1);
2544 intermediate_state->bs = intermediate;
2545 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2547 if (intermediate->backing_hd == base) {
2548 base_bs = intermediate->backing_hd;
2549 break;
2551 intermediate = intermediate->backing_hd;
2553 if (base_bs == NULL) {
2554         /* something went wrong, we did not end at the base.  Safely
2555          * unravel everything and exit with an error */
2556 goto exit;
2559 /* success - we can delete the intermediate states, and link top->base */
2560 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2561 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2562 base_bs->drv ? base_bs->drv->format_name : "");
2563 if (ret) {
2564 goto exit;
2566 bdrv_set_backing_hd(new_top_bs, base_bs);
2568 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2569 /* so that bdrv_close() does not recursively close the chain */
2570 bdrv_set_backing_hd(intermediate_state->bs, NULL);
2571 bdrv_unref(intermediate_state->bs);
2573 ret = 0;
2575 exit:
2576 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2577 g_free(intermediate_state);
2579 return ret;
2583 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2584 size_t size)
2586 int64_t len;
2588 if (size > INT_MAX) {
2589 return -EIO;
2592 if (!bdrv_is_inserted(bs))
2593 return -ENOMEDIUM;
2595 if (bs->growable)
2596 return 0;
2598 len = bdrv_getlength(bs);
2600 if (offset < 0)
2601 return -EIO;
2603 if ((offset > len) || (len - offset < size))
2604 return -EIO;
2606 return 0;
2609 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2610 int nb_sectors)
2612 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2613 return -EIO;
2616 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2617 nb_sectors * BDRV_SECTOR_SIZE);
2620 typedef struct RwCo {
2621 BlockDriverState *bs;
2622 int64_t offset;
2623 QEMUIOVector *qiov;
2624 bool is_write;
2625 int ret;
2626 BdrvRequestFlags flags;
2627 } RwCo;
2629 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2631 RwCo *rwco = opaque;
2633 if (!rwco->is_write) {
2634 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2635 rwco->qiov->size, rwco->qiov,
2636 rwco->flags);
2637 } else {
2638 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2639 rwco->qiov->size, rwco->qiov,
2640 rwco->flags);
2645 * Process a vectored synchronous request using coroutines
2647 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2648 QEMUIOVector *qiov, bool is_write,
2649 BdrvRequestFlags flags)
2651 Coroutine *co;
2652 RwCo rwco = {
2653 .bs = bs,
2654 .offset = offset,
2655 .qiov = qiov,
2656 .is_write = is_write,
2657 .ret = NOT_DONE,
2658 .flags = flags,
2662      * In a synchronous call context the vCPU is blocked, so the throttling
2663      * timer will not fire; I/O throttling therefore has to be disabled here
2664      * if it has been enabled.
2666 if (bs->io_limits_enabled) {
2667 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2668 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2669 bdrv_io_limits_disable(bs);
2672 if (qemu_in_coroutine()) {
2673 /* Fast-path if already in coroutine context */
2674 bdrv_rw_co_entry(&rwco);
2675 } else {
2676 AioContext *aio_context = bdrv_get_aio_context(bs);
2678 co = qemu_coroutine_create(bdrv_rw_co_entry);
2679 qemu_coroutine_enter(co, &rwco);
2680 while (rwco.ret == NOT_DONE) {
2681 aio_poll(aio_context, true);
2684 return rwco.ret;
2688 * Process a synchronous request using coroutines
2690 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2691 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2693 QEMUIOVector qiov;
2694 struct iovec iov = {
2695 .iov_base = (void *)buf,
2696 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2699 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2700 return -EINVAL;
2703 qemu_iovec_init_external(&qiov, &iov, 1);
2704 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2705 &qiov, is_write, flags);
2708 /* return < 0 if error. See bdrv_write() for the return codes */
2709 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2710 uint8_t *buf, int nb_sectors)
2712 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2715 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2716 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2717 uint8_t *buf, int nb_sectors)
2719 bool enabled;
2720 int ret;
2722 enabled = bs->io_limits_enabled;
2723 bs->io_limits_enabled = false;
2724 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2725 bs->io_limits_enabled = enabled;
2726 return ret;
2729 /* Return < 0 if error. Important errors are:
2730 -EIO generic I/O error (may happen for all errors)
2731 -ENOMEDIUM No media inserted.
2732 -EINVAL Invalid sector number or nb_sectors
2733 -EACCES Trying to write a read-only device
2735 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2736 const uint8_t *buf, int nb_sectors)
2738 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2741 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2742 int nb_sectors, BdrvRequestFlags flags)
2744 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2745 BDRV_REQ_ZERO_WRITE | flags);
2749 * Completely zero out a block device with the help of bdrv_write_zeroes.
2750  * The operation is sped up by checking the block status and only writing
2751  * zeroes to regions that do not already read back as zeroes. Optional
2752 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2754 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2756 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2758 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2759 int n;
2761 target_sectors = bdrv_nb_sectors(bs);
2762 if (target_sectors < 0) {
2763 return target_sectors;
2766 for (;;) {
2767 nb_sectors = target_sectors - sector_num;
2768 if (nb_sectors <= 0) {
2769 return 0;
2771 if (nb_sectors > INT_MAX) {
2772 nb_sectors = INT_MAX;
2774 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2775 if (ret < 0) {
2776 error_report("error getting block status at sector %" PRId64 ": %s",
2777 sector_num, strerror(-ret));
2778 return ret;
2780 if (ret & BDRV_BLOCK_ZERO) {
2781 sector_num += n;
2782 continue;
2784 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2785 if (ret < 0) {
2786 error_report("error writing zeroes at sector %" PRId64 ": %s",
2787 sector_num, strerror(-ret));
2788 return ret;
2790 sector_num += n;
2794 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2796 QEMUIOVector qiov;
2797 struct iovec iov = {
2798 .iov_base = (void *)buf,
2799 .iov_len = bytes,
2801 int ret;
2803 if (bytes < 0) {
2804 return -EINVAL;
2807 qemu_iovec_init_external(&qiov, &iov, 1);
2808 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2809 if (ret < 0) {
2810 return ret;
2813 return bytes;
2816 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2818 int ret;
2820 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2821 if (ret < 0) {
2822 return ret;
2825 return qiov->size;
2828 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2829 const void *buf, int bytes)
2831 QEMUIOVector qiov;
2832 struct iovec iov = {
2833 .iov_base = (void *) buf,
2834 .iov_len = bytes,
2837 if (bytes < 0) {
2838 return -EINVAL;
2841 qemu_iovec_init_external(&qiov, &iov, 1);
2842 return bdrv_pwritev(bs, offset, &qiov);
2846 * Writes to the file and ensures that no writes are reordered across this
2847 * request (acts as a barrier)
2849 * Returns 0 on success, -errno in error cases.
2851 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2852 const void *buf, int count)
2854 int ret;
2856 ret = bdrv_pwrite(bs, offset, buf, count);
2857 if (ret < 0) {
2858 return ret;
2861 /* No flush needed for cache modes that already do it */
2862 if (bs->enable_write_cache) {
2863 bdrv_flush(bs);
2866 return 0;
2869 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2870 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2872 /* Perform I/O through a temporary buffer so that users who scribble over
2873 * their read buffer while the operation is in progress do not end up
2874 * modifying the image file. This is critical for zero-copy guest I/O
2875 * where anything might happen inside guest memory.
2877 void *bounce_buffer;
2879 BlockDriver *drv = bs->drv;
2880 struct iovec iov;
2881 QEMUIOVector bounce_qiov;
2882 int64_t cluster_sector_num;
2883 int cluster_nb_sectors;
2884 size_t skip_bytes;
2885 int ret;
2887 /* Cover entire cluster so no additional backing file I/O is required when
2888      * allocating the cluster in the image file.
2890 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2891 &cluster_sector_num, &cluster_nb_sectors);
2893 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2894 cluster_sector_num, cluster_nb_sectors);
2896 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2897 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2898 if (bounce_buffer == NULL) {
2899 ret = -ENOMEM;
2900 goto err;
2903 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2905 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2906 &bounce_qiov);
2907 if (ret < 0) {
2908 goto err;
2911 if (drv->bdrv_co_write_zeroes &&
2912 buffer_is_zero(bounce_buffer, iov.iov_len)) {
2913 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2914 cluster_nb_sectors, 0);
2915 } else {
2916         /* This does not change the data on the disk, so it is not necessary
2917          * to flush even in cache=writethrough mode.
2919 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2920 &bounce_qiov);
2923 if (ret < 0) {
2924 /* It might be okay to ignore write errors for guest requests. If this
2925 * is a deliberate copy-on-read then we don't want to ignore the error.
2926 * Simply report it in all cases.
2928 goto err;
2931 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2932 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2933 nb_sectors * BDRV_SECTOR_SIZE);
2935 err:
2936 qemu_vfree(bounce_buffer);
2937 return ret;
2941 * Forwards an already correctly aligned request to the BlockDriver. This
2942 * handles copy on read and zeroing after EOF; any other features must be
2943 * implemented by the caller.
2945 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2946 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2947 int64_t align, QEMUIOVector *qiov, int flags)
2949 BlockDriver *drv = bs->drv;
2950 int ret;
2952 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2953 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2955 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2956 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2957 assert(!qiov || bytes == qiov->size);
2959 /* Handle Copy on Read and associated serialisation */
2960 if (flags & BDRV_REQ_COPY_ON_READ) {
2961 /* If we touch the same cluster it counts as an overlap. This
2962 * guarantees that allocating writes will be serialized and not race
2963 * with each other for the same cluster. For example, in copy-on-read
2964 * it ensures that the CoR read and write operations are atomic and
2965 * guest writes cannot interleave between them. */
2966 mark_request_serialising(req, bdrv_get_cluster_size(bs));
2969 wait_serialising_requests(req);
2971 if (flags & BDRV_REQ_COPY_ON_READ) {
2972 int pnum;
2974 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2975 if (ret < 0) {
2976 goto out;
2979 if (!ret || pnum != nb_sectors) {
2980 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2981 goto out;
2985 /* Forward the request to the BlockDriver */
2986 if (!(bs->zero_beyond_eof && bs->growable)) {
2987 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2988 } else {
2989         /* Read zeroes after EOF of growable BDSes */
2990 int64_t total_sectors, max_nb_sectors;
2992 total_sectors = bdrv_nb_sectors(bs);
2993 if (total_sectors < 0) {
2994 ret = total_sectors;
2995 goto out;
2998 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
2999 align >> BDRV_SECTOR_BITS);
3000 if (max_nb_sectors > 0) {
3001 QEMUIOVector local_qiov;
3002 size_t local_sectors;
3004             max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_SIZE);
3005 local_sectors = MIN(max_nb_sectors, nb_sectors);
3007 qemu_iovec_init(&local_qiov, qiov->niov);
3008 qemu_iovec_concat(&local_qiov, qiov, 0,
3009 local_sectors * BDRV_SECTOR_SIZE);
3011 ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3012 &local_qiov);
3014 qemu_iovec_destroy(&local_qiov);
3015 } else {
3016 ret = 0;
3019 /* Reading beyond end of file is supposed to produce zeroes */
3020 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3021 uint64_t offset = MAX(0, total_sectors - sector_num);
3022             uint64_t bytes = (nb_sectors - offset) *
3023                               BDRV_SECTOR_SIZE;
3024 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3028 out:
3029 return ret;
3033 * Handle a read request in coroutine context
3035 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3036 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3037 BdrvRequestFlags flags)
3039 BlockDriver *drv = bs->drv;
3040 BdrvTrackedRequest req;
3042 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3043 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3044 uint8_t *head_buf = NULL;
3045 uint8_t *tail_buf = NULL;
3046 QEMUIOVector local_qiov;
3047 bool use_local_qiov = false;
3048 int ret;
3050 if (!drv) {
3051 return -ENOMEDIUM;
3053 if (bdrv_check_byte_request(bs, offset, bytes)) {
3054 return -EIO;
3057 if (bs->copy_on_read) {
3058 flags |= BDRV_REQ_COPY_ON_READ;
3061 /* throttling disk I/O */
3062 if (bs->io_limits_enabled) {
3063 bdrv_io_limits_intercept(bs, bytes, false);
3066 /* Align read if necessary by padding qiov */
3067 if (offset & (align - 1)) {
3068 head_buf = qemu_blockalign(bs, align);
3069 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3070 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3071 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3072 use_local_qiov = true;
3074 bytes += offset & (align - 1);
3075 offset = offset & ~(align - 1);
3078 if ((offset + bytes) & (align - 1)) {
3079 if (!use_local_qiov) {
3080 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3081 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3082 use_local_qiov = true;
3084 tail_buf = qemu_blockalign(bs, align);
3085 qemu_iovec_add(&local_qiov, tail_buf,
3086 align - ((offset + bytes) & (align - 1)));
3088 bytes = ROUND_UP(bytes, align);
3091 tracked_request_begin(&req, bs, offset, bytes, false);
3092 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3093 use_local_qiov ? &local_qiov : qiov,
3094 flags);
3095 tracked_request_end(&req);
3097 if (use_local_qiov) {
3098 qemu_iovec_destroy(&local_qiov);
3099 qemu_vfree(head_buf);
3100 qemu_vfree(tail_buf);
3103 return ret;
3106 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3107 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3108 BdrvRequestFlags flags)
3110 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3111 return -EINVAL;
3114 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3115 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3118 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3119 int nb_sectors, QEMUIOVector *qiov)
3121 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3123 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3126 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3127 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3129 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3131 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3132 BDRV_REQ_COPY_ON_READ);
3135 /* If no limit is specified in the BlockLimits, use a default
3136  * of 32768 512-byte sectors (16 MiB) per request.
3138 #define MAX_WRITE_ZEROES_DEFAULT 32768
3140 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3141 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3143 BlockDriver *drv = bs->drv;
3144 QEMUIOVector qiov;
3145 struct iovec iov = {0};
3146 int ret = 0;
3148 int max_write_zeroes = bs->bl.max_write_zeroes ?
3149 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3151 while (nb_sectors > 0 && !ret) {
3152 int num = nb_sectors;
3154 /* Align request. Block drivers can expect the "bulk" of the request
3155 * to be aligned.
3157 if (bs->bl.write_zeroes_alignment
3158 && num > bs->bl.write_zeroes_alignment) {
3159 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3160 /* Make a small request up to the first aligned sector. */
3161 num = bs->bl.write_zeroes_alignment;
3162 num -= sector_num % bs->bl.write_zeroes_alignment;
3163 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3164 /* Shorten the request to the last aligned sector. num cannot
3165 * underflow because num > bs->bl.write_zeroes_alignment.
3167 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3171 /* limit request size */
3172 if (num > max_write_zeroes) {
3173 num = max_write_zeroes;
3176 ret = -ENOTSUP;
3177 /* First try the efficient write zeroes operation */
3178 if (drv->bdrv_co_write_zeroes) {
3179 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3182 if (ret == -ENOTSUP) {
3183 /* Fall back to bounce buffer if write zeroes is unsupported */
3184 iov.iov_len = num * BDRV_SECTOR_SIZE;
3185 if (iov.iov_base == NULL) {
3186 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3187 if (iov.iov_base == NULL) {
3188 ret = -ENOMEM;
3189 goto fail;
3191 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3193 qemu_iovec_init_external(&qiov, &iov, 1);
3195 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3197             /* Keep the bounce buffer around if it is big enough for
3198              * all future requests.
3200 if (num < max_write_zeroes) {
3201 qemu_vfree(iov.iov_base);
3202 iov.iov_base = NULL;
3206 sector_num += num;
3207 nb_sectors -= num;
3210 fail:
3211 qemu_vfree(iov.iov_base);
3212 return ret;
3216 * Forwards an already correctly aligned write request to the BlockDriver.
3218 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3219 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3220 QEMUIOVector *qiov, int flags)
3222 BlockDriver *drv = bs->drv;
3223 bool waited;
3224 int ret;
3226 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3227 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3229 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3230 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3231 assert(!qiov || bytes == qiov->size);
3233 waited = wait_serialising_requests(req);
3234 assert(!waited || !req->serialising);
3235 assert(req->overlap_offset <= offset);
3236 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3238 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3240 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3241 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3242 qemu_iovec_is_zero(qiov)) {
3243 flags |= BDRV_REQ_ZERO_WRITE;
3244 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3245 flags |= BDRV_REQ_MAY_UNMAP;
3249 if (ret < 0) {
3250 /* Do nothing, write notifier decided to fail this request */
3251 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3252 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3253 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3254 } else {
3255 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3256 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3258 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3260 if (ret == 0 && !bs->enable_write_cache) {
3261 ret = bdrv_co_flush(bs);
3264 bdrv_set_dirty(bs, sector_num, nb_sectors);
3266 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3268 if (bs->growable && ret >= 0) {
3269 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3272 return ret;
3276 * Handle a write request in coroutine context
3278 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3279 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3280 BdrvRequestFlags flags)
3282 BdrvTrackedRequest req;
3283 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3284 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3285 uint8_t *head_buf = NULL;
3286 uint8_t *tail_buf = NULL;
3287 QEMUIOVector local_qiov;
3288 bool use_local_qiov = false;
3289 int ret;
3291 if (!bs->drv) {
3292 return -ENOMEDIUM;
3294 if (bs->read_only) {
3295 return -EACCES;
3297 if (bdrv_check_byte_request(bs, offset, bytes)) {
3298 return -EIO;
3301 /* throttling disk I/O */
3302 if (bs->io_limits_enabled) {
3303 bdrv_io_limits_intercept(bs, bytes, true);
3307 * Align write if necessary by performing a read-modify-write cycle.
3308 * Pad qiov with the read parts and be sure to have a tracked request not
3309 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3311 tracked_request_begin(&req, bs, offset, bytes, true);
3313 if (offset & (align - 1)) {
3314 QEMUIOVector head_qiov;
3315 struct iovec head_iov;
3317 mark_request_serialising(&req, align);
3318 wait_serialising_requests(&req);
3320 head_buf = qemu_blockalign(bs, align);
3321 head_iov = (struct iovec) {
3322 .iov_base = head_buf,
3323 .iov_len = align,
3325 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3327 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3328 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3329 align, &head_qiov, 0);
3330 if (ret < 0) {
3331 goto fail;
3333 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3335 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3336 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3337 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3338 use_local_qiov = true;
3340 bytes += offset & (align - 1);
3341 offset = offset & ~(align - 1);
3344 if ((offset + bytes) & (align - 1)) {
3345 QEMUIOVector tail_qiov;
3346 struct iovec tail_iov;
3347 size_t tail_bytes;
3348 bool waited;
3350 mark_request_serialising(&req, align);
3351 waited = wait_serialising_requests(&req);
3352 assert(!waited || !use_local_qiov);
3354 tail_buf = qemu_blockalign(bs, align);
3355 tail_iov = (struct iovec) {
3356 .iov_base = tail_buf,
3357 .iov_len = align,
3359 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3361 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3362 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3363 align, &tail_qiov, 0);
3364 if (ret < 0) {
3365 goto fail;
3367 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3369 if (!use_local_qiov) {
3370 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3371 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3372 use_local_qiov = true;
3375 tail_bytes = (offset + bytes) & (align - 1);
3376 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3378 bytes = ROUND_UP(bytes, align);
3381 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3382 use_local_qiov ? &local_qiov : qiov,
3383 flags);
3385 fail:
3386 tracked_request_end(&req);
3388 if (use_local_qiov) {
3389 qemu_iovec_destroy(&local_qiov);
3391 qemu_vfree(head_buf);
3392 qemu_vfree(tail_buf);
3394 return ret;
3397 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3398 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3399 BdrvRequestFlags flags)
3401 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3402 return -EINVAL;
3405 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3406 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3409 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3410 int nb_sectors, QEMUIOVector *qiov)
3412 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3414 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3417 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3418 int64_t sector_num, int nb_sectors,
3419 BdrvRequestFlags flags)
3421 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3423 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3424 flags &= ~BDRV_REQ_MAY_UNMAP;
3427 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3428 BDRV_REQ_ZERO_WRITE | flags);
3432 * Truncate file to 'offset' bytes (needed only for file protocols)
3434 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3436 BlockDriver *drv = bs->drv;
3437 int ret;
3438 if (!drv)
3439 return -ENOMEDIUM;
3440 if (!drv->bdrv_truncate)
3441 return -ENOTSUP;
3442 if (bs->read_only)
3443 return -EACCES;
3445 ret = drv->bdrv_truncate(bs, offset);
3446 if (ret == 0) {
3447 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3448 if (bs->blk) {
3449 blk_dev_resize_cb(bs->blk);
3452 return ret;
3456  * Length of an allocated file in bytes. Sparse files are counted by actual
3457 * allocated space. Return < 0 if error or unknown.
3459 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3461 BlockDriver *drv = bs->drv;
3462 if (!drv) {
3463 return -ENOMEDIUM;
3465 if (drv->bdrv_get_allocated_file_size) {
3466 return drv->bdrv_get_allocated_file_size(bs);
3468 if (bs->file) {
3469 return bdrv_get_allocated_file_size(bs->file);
3471 return -ENOTSUP;
3475 * Return number of sectors on success, -errno on error.
3477 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3479 BlockDriver *drv = bs->drv;
3481 if (!drv)
3482 return -ENOMEDIUM;
3484 if (drv->has_variable_length) {
3485 int ret = refresh_total_sectors(bs, bs->total_sectors);
3486 if (ret < 0) {
3487 return ret;
3490 return bs->total_sectors;
3494 * Return length in bytes on success, -errno on error.
3495 * The length is always a multiple of BDRV_SECTOR_SIZE.
3497 int64_t bdrv_getlength(BlockDriverState *bs)
3499 int64_t ret = bdrv_nb_sectors(bs);
3501 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3504 /* Return 0 as the number of sectors if no device is present or on error */
3505 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3507 int64_t nb_sectors = bdrv_nb_sectors(bs);
3509 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3512 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3513 BlockdevOnError on_write_error)
3515 bs->on_read_error = on_read_error;
3516 bs->on_write_error = on_write_error;
3519 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3521 return is_read ? bs->on_read_error : bs->on_write_error;
3524 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3526 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3528 switch (on_err) {
3529 case BLOCKDEV_ON_ERROR_ENOSPC:
3530 return (error == ENOSPC) ?
3531 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3532 case BLOCKDEV_ON_ERROR_STOP:
3533 return BLOCK_ERROR_ACTION_STOP;
3534 case BLOCKDEV_ON_ERROR_REPORT:
3535 return BLOCK_ERROR_ACTION_REPORT;
3536 case BLOCKDEV_ON_ERROR_IGNORE:
3537 return BLOCK_ERROR_ACTION_IGNORE;
3538 default:
3539 abort();
3543 static void send_qmp_error_event(BlockDriverState *bs,
3544 BlockErrorAction action,
3545 bool is_read, int error)
3547     IoOperationType optype;
3549     optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3550     qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3551 bdrv_iostatus_is_enabled(bs),
3552 error == ENOSPC, strerror(error),
3553 &error_abort);
3556 /* This is done by device models because, while the block layer knows
3557 * about the error, it does not know whether an operation comes from
3558 * the device or the block layer (from a job, for example).
3560 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3561 bool is_read, int error)
3563 assert(error >= 0);
3565 if (action == BLOCK_ERROR_ACTION_STOP) {
3566 /* First set the iostatus, so that "info block" returns an iostatus
3567 * that matches the events raised so far (an additional error iostatus
3568 * is fine, but not a lost one).
3570 bdrv_iostatus_set_err(bs, error);
3572 /* Then raise the request to stop the VM and the event.
3573 * qemu_system_vmstop_request_prepare has two effects. First,
3574 * it ensures that the STOP event always comes after the
3575 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3576 * can observe the STOP event and do a "cont" before the STOP
3577 * event is issued, the VM will not stop. In this case, vm_start()
3578 * also ensures that the STOP/RESUME pair of events is emitted.
3580 qemu_system_vmstop_request_prepare();
3581 send_qmp_error_event(bs, action, is_read, error);
3582 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3583 } else {
3584 send_qmp_error_event(bs, action, is_read, error);
3588 int bdrv_is_read_only(BlockDriverState *bs)
3590 return bs->read_only;
3593 int bdrv_is_sg(BlockDriverState *bs)
3595 return bs->sg;
3598 int bdrv_enable_write_cache(BlockDriverState *bs)
3600 return bs->enable_write_cache;
3603 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3605 bs->enable_write_cache = wce;
3607 /* so a reopen() will preserve wce */
3608 if (wce) {
3609 bs->open_flags |= BDRV_O_CACHE_WB;
3610 } else {
3611 bs->open_flags &= ~BDRV_O_CACHE_WB;
3615 int bdrv_is_encrypted(BlockDriverState *bs)
3617 if (bs->backing_hd && bs->backing_hd->encrypted)
3618 return 1;
3619 return bs->encrypted;
3622 int bdrv_key_required(BlockDriverState *bs)
3624 BlockDriverState *backing_hd = bs->backing_hd;
3626 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3627 return 1;
3628 return (bs->encrypted && !bs->valid_key);
3631 int bdrv_set_key(BlockDriverState *bs, const char *key)
3633 int ret;
3634 if (bs->backing_hd && bs->backing_hd->encrypted) {
3635 ret = bdrv_set_key(bs->backing_hd, key);
3636 if (ret < 0)
3637 return ret;
3638 if (!bs->encrypted)
3639 return 0;
3641 if (!bs->encrypted) {
3642 return -EINVAL;
3643 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3644 return -ENOMEDIUM;
3646 ret = bs->drv->bdrv_set_key(bs, key);
3647 if (ret < 0) {
3648 bs->valid_key = 0;
3649 } else if (!bs->valid_key) {
3650 bs->valid_key = 1;
3651 if (bs->blk) {
3652             /* call the change callback now; we skipped it on open */
3653 blk_dev_change_media_cb(bs->blk, true);
3656 return ret;
3659 const char *bdrv_get_format_name(BlockDriverState *bs)
3661 return bs->drv ? bs->drv->format_name : NULL;
3664 static int qsort_strcmp(const void *a, const void *b)
3666 return strcmp(a, b);
3669 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3670 void *opaque)
3672 BlockDriver *drv;
3673 int count = 0;
3674 int i;
3675 const char **formats = NULL;
3677 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3678 if (drv->format_name) {
3679 bool found = false;
3680 int i = count;
3681 while (formats && i && !found) {
3682 found = !strcmp(formats[--i], drv->format_name);
3685 if (!found) {
3686 formats = g_renew(const char *, formats, count + 1);
3687 formats[count++] = drv->format_name;
3692 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3694 for (i = 0; i < count; i++) {
3695 it(opaque, formats[i]);
3698 g_free(formats);
3701 /* Find the BlockDriverState attached to the BlockBackend with this name */
3702 /* TODO convert callers to blk_by_name(), then remove */
3703 BlockDriverState *bdrv_find(const char *name)
3705 BlockBackend *blk = blk_by_name(name);
3707 return blk ? blk_bs(blk) : NULL;
3710 /* Find a node by name in the BDS graph */
3711 BlockDriverState *bdrv_find_node(const char *node_name)
3713 BlockDriverState *bs;
3715 assert(node_name);
3717 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3718 if (!strcmp(node_name, bs->node_name)) {
3719 return bs;
3722 return NULL;
3725 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3726 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3728 BlockDeviceInfoList *list, *entry;
3729 BlockDriverState *bs;
3731 list = NULL;
3732 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3733 entry = g_malloc0(sizeof(*entry));
3734 entry->value = bdrv_block_device_info(bs);
3735 entry->next = list;
3736 list = entry;
3739 return list;
3742 BlockDriverState *bdrv_lookup_bs(const char *device,
3743 const char *node_name,
3744 Error **errp)
3746 BlockBackend *blk;
3747 BlockDriverState *bs;
3749 if (device) {
3750 blk = blk_by_name(device);
3752 if (blk) {
3753 return blk_bs(blk);
3757 if (node_name) {
3758 bs = bdrv_find_node(node_name);
3760 if (bs) {
3761 return bs;
3765 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3766 device ? device : "",
3767 node_name ? node_name : "");
3768 return NULL;
3771 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3772 * return false. If either argument is NULL, return false. */
3773 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3775 while (top && top != base) {
3776 top = top->backing_hd;
3779 return top != NULL;
3782 BlockDriverState *bdrv_next(BlockDriverState *bs)
3784 if (!bs) {
3785 return QTAILQ_FIRST(&bdrv_states);
3787 return QTAILQ_NEXT(bs, device_list);
3790 /* TODO check what callers really want: bs->node_name or blk_name() */
3791 const char *bdrv_get_device_name(const BlockDriverState *bs)
3793 return bs->blk ? blk_name(bs->blk) : "";
3796 int bdrv_get_flags(BlockDriverState *bs)
3798 return bs->open_flags;
3801 int bdrv_flush_all(void)
3803 BlockDriverState *bs;
3804 int result = 0;
3806 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3807 AioContext *aio_context = bdrv_get_aio_context(bs);
3808 int ret;
3810 aio_context_acquire(aio_context);
3811 ret = bdrv_flush(bs);
3812 if (ret < 0 && !result) {
3813 result = ret;
3815 aio_context_release(aio_context);
3818 return result;
3821 int bdrv_has_zero_init_1(BlockDriverState *bs)
3823 return 1;
3826 int bdrv_has_zero_init(BlockDriverState *bs)
3828 assert(bs->drv);
3830 /* If BS is a copy on write image, it is initialized to
3831 the contents of the base image, which may not be zeroes. */
3832 if (bs->backing_hd) {
3833 return 0;
3835 if (bs->drv->bdrv_has_zero_init) {
3836 return bs->drv->bdrv_has_zero_init(bs);
3839 /* safe default */
3840 return 0;
3843 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3845 BlockDriverInfo bdi;
3847 if (bs->backing_hd) {
3848 return false;
3851 if (bdrv_get_info(bs, &bdi) == 0) {
3852 return bdi.unallocated_blocks_are_zero;
3855 return false;
3858 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3860 BlockDriverInfo bdi;
3862 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3863 return false;
3866 if (bdrv_get_info(bs, &bdi) == 0) {
3867 return bdi.can_write_zeroes_with_unmap;
3870 return false;
3873 typedef struct BdrvCoGetBlockStatusData {
3874 BlockDriverState *bs;
3875 BlockDriverState *base;
3876 int64_t sector_num;
3877 int nb_sectors;
3878 int *pnum;
3879 int64_t ret;
3880 bool done;
3881 } BdrvCoGetBlockStatusData;
3884 * Returns true iff the specified sector is present in the disk image. Drivers
3885 * not implementing the functionality are assumed to not support backing files,
3886 * hence all their sectors are reported as allocated.
3888 * If 'sector_num' is beyond the end of the disk image the return value is 0
3889 * and 'pnum' is set to 0.
3891 * 'pnum' is set to the number of sectors (including and immediately following
3892 * the specified sector) that are known to be in the same
3893 * allocated/unallocated state.
3895 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3896 * beyond the end of the disk image it will be clamped.
3898 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3899 int64_t sector_num,
3900 int nb_sectors, int *pnum)
3902 int64_t total_sectors;
3903 int64_t n;
3904 int64_t ret, ret2;
3906 total_sectors = bdrv_nb_sectors(bs);
3907 if (total_sectors < 0) {
3908 return total_sectors;
3911 if (sector_num >= total_sectors) {
3912 *pnum = 0;
3913 return 0;
3916 n = total_sectors - sector_num;
3917 if (n < nb_sectors) {
3918 nb_sectors = n;
3921 if (!bs->drv->bdrv_co_get_block_status) {
3922 *pnum = nb_sectors;
3923 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3924 if (bs->drv->protocol_name) {
3925 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3927 return ret;
3930 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3931 if (ret < 0) {
3932 *pnum = 0;
3933 return ret;
3936 if (ret & BDRV_BLOCK_RAW) {
3937 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3938 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3939 *pnum, pnum);
3942 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
3943 ret |= BDRV_BLOCK_ALLOCATED;
3946 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3947 if (bdrv_unallocated_blocks_are_zero(bs)) {
3948 ret |= BDRV_BLOCK_ZERO;
3949 } else if (bs->backing_hd) {
3950 BlockDriverState *bs2 = bs->backing_hd;
3951 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
3952 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
3953 ret |= BDRV_BLOCK_ZERO;
3958 if (bs->file &&
3959 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3960 (ret & BDRV_BLOCK_OFFSET_VALID)) {
3961 int file_pnum;
3963 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3964 *pnum, &file_pnum);
3965 if (ret2 >= 0) {
3966             /* Ignore errors.  This is just providing extra information; it
3967              * is useful but not necessary.
3969 if (!file_pnum) {
3970 /* !file_pnum indicates an offset at or beyond the EOF; it is
3971 * perfectly valid for the format block driver to point to such
3972 * offsets, so catch it and mark everything as zero */
3973 ret |= BDRV_BLOCK_ZERO;
3974 } else {
3975 /* Limit request to the range reported by the protocol driver */
3976 *pnum = file_pnum;
3977 ret |= (ret2 & BDRV_BLOCK_ZERO);
3982 return ret;
3985 /* Coroutine wrapper for bdrv_get_block_status() */
3986 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3988 BdrvCoGetBlockStatusData *data = opaque;
3989 BlockDriverState *bs = data->bs;
3991 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3992 data->pnum);
3993 data->done = true;
3997 * Synchronous wrapper around bdrv_co_get_block_status().
3999 * See bdrv_co_get_block_status() for details.
4001 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4002 int nb_sectors, int *pnum)
4004 Coroutine *co;
4005 BdrvCoGetBlockStatusData data = {
4006 .bs = bs,
4007 .sector_num = sector_num,
4008 .nb_sectors = nb_sectors,
4009 .pnum = pnum,
4010 .done = false,
4013 if (qemu_in_coroutine()) {
4014 /* Fast-path if already in coroutine context */
4015 bdrv_get_block_status_co_entry(&data);
4016 } else {
4017 AioContext *aio_context = bdrv_get_aio_context(bs);
4019 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4020 qemu_coroutine_enter(co, &data);
4021 while (!data.done) {
4022 aio_poll(aio_context, true);
4025 return data.ret;
4028 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4029 int nb_sectors, int *pnum)
4031 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4032 if (ret < 0) {
4033 return ret;
4035 return !!(ret & BDRV_BLOCK_ALLOCATED);
4039 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4041 * Return true if the given sector is allocated in any image between
4042 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4043 * sector is allocated in any image of the chain. Return false otherwise.
4045 * 'pnum' is set to the number of sectors (including and immediately following
4046 * the specified sector) that are known to be in the same
4047 * allocated/unallocated state.
4050 int bdrv_is_allocated_above(BlockDriverState *top,
4051 BlockDriverState *base,
4052 int64_t sector_num,
4053 int nb_sectors, int *pnum)
4055 BlockDriverState *intermediate;
4056 int ret, n = nb_sectors;
4058 intermediate = top;
4059 while (intermediate && intermediate != base) {
4060 int pnum_inter;
4061 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4062 &pnum_inter);
4063 if (ret < 0) {
4064 return ret;
4065 } else if (ret) {
4066 *pnum = pnum_inter;
4067 return 1;
4071 * [sector_num, nb_sectors] is unallocated on top but intermediate
4072 * might have
4074  * [sector_num+x, nb_sectors] allocated.
4076 if (n > pnum_inter &&
4077 (intermediate == top ||
4078 sector_num + pnum_inter < intermediate->total_sectors)) {
4079 n = pnum_inter;
4082 intermediate = intermediate->backing_hd;
4085 *pnum = n;
4086 return 0;
4089 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4091 if (bs->backing_hd && bs->backing_hd->encrypted)
4092 return bs->backing_file;
4093 else if (bs->encrypted)
4094 return bs->filename;
4095 else
4096 return NULL;
4099 void bdrv_get_backing_filename(BlockDriverState *bs,
4100 char *filename, int filename_size)
4102 pstrcpy(filename, filename_size, bs->backing_file);
4105 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4106 const uint8_t *buf, int nb_sectors)
4108 BlockDriver *drv = bs->drv;
4109 if (!drv)
4110 return -ENOMEDIUM;
4111 if (!drv->bdrv_write_compressed)
4112 return -ENOTSUP;
4113 if (bdrv_check_request(bs, sector_num, nb_sectors))
4114 return -EIO;
4116 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4118 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4121 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4123 BlockDriver *drv = bs->drv;
4124 if (!drv)
4125 return -ENOMEDIUM;
4126 if (!drv->bdrv_get_info)
4127 return -ENOTSUP;
4128 memset(bdi, 0, sizeof(*bdi));
4129 return drv->bdrv_get_info(bs, bdi);
4132 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4134 BlockDriver *drv = bs->drv;
4135 if (drv && drv->bdrv_get_specific_info) {
4136 return drv->bdrv_get_specific_info(bs);
4138 return NULL;
4141 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4142 int64_t pos, int size)
4144 QEMUIOVector qiov;
4145 struct iovec iov = {
4146 .iov_base = (void *) buf,
4147 .iov_len = size,
4150 qemu_iovec_init_external(&qiov, &iov, 1);
4151 return bdrv_writev_vmstate(bs, &qiov, pos);
4154 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4156 BlockDriver *drv = bs->drv;
4158 if (!drv) {
4159 return -ENOMEDIUM;
4160 } else if (drv->bdrv_save_vmstate) {
4161 return drv->bdrv_save_vmstate(bs, qiov, pos);
4162 } else if (bs->file) {
4163 return bdrv_writev_vmstate(bs->file, qiov, pos);
4166 return -ENOTSUP;
4169 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4170 int64_t pos, int size)
4172 BlockDriver *drv = bs->drv;
4173 if (!drv)
4174 return -ENOMEDIUM;
4175 if (drv->bdrv_load_vmstate)
4176 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4177 if (bs->file)
4178 return bdrv_load_vmstate(bs->file, buf, pos, size);
4179 return -ENOTSUP;
4182 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4184 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4185 return;
4188 bs->drv->bdrv_debug_event(bs, event);
4191 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4192 const char *tag)
4194 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4195 bs = bs->file;
4198 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4199 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4202 return -ENOTSUP;
4205 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4207 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4208 bs = bs->file;
4211 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4212 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4215 return -ENOTSUP;
4218 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4220 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4221 bs = bs->file;
4224 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4225 return bs->drv->bdrv_debug_resume(bs, tag);
4228 return -ENOTSUP;
4231 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4233 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4234 bs = bs->file;
4237 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4238 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4241 return false;
4244 int bdrv_is_snapshot(BlockDriverState *bs)
4246 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4249 /* backing_file can be relative, absolute, or a protocol. If it is
4250 * relative, it must be relative to the chain. So, passing in bs->filename
4251 * from a BDS as backing_file should not be done, as that may be relative to
4252 * the CWD rather than the chain. */
4253 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4254 const char *backing_file)
4256 char *filename_full = NULL;
4257 char *backing_file_full = NULL;
4258 char *filename_tmp = NULL;
4259 int is_protocol = 0;
4260 BlockDriverState *curr_bs = NULL;
4261 BlockDriverState *retval = NULL;
4263 if (!bs || !bs->drv || !backing_file) {
4264 return NULL;
4267 filename_full = g_malloc(PATH_MAX);
4268 backing_file_full = g_malloc(PATH_MAX);
4269 filename_tmp = g_malloc(PATH_MAX);
4271 is_protocol = path_has_protocol(backing_file);
4273 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4275 /* If either of the filename paths is actually a protocol, then
4276 * compare unmodified paths; otherwise make paths relative */
4277 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4278 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4279 retval = curr_bs->backing_hd;
4280 break;
4282 } else {
4283 /* If not an absolute filename path, make it relative to the current
4284 * image's filename path */
4285 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4286 backing_file);
4288 /* We are going to compare absolute pathnames */
4289 if (!realpath(filename_tmp, filename_full)) {
4290 continue;
4293 /* We need to make sure the backing filename we are comparing against
4294 * is relative to the current image filename (or absolute) */
4295 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4296 curr_bs->backing_file);
4298 if (!realpath(filename_tmp, backing_file_full)) {
4299 continue;
4302 if (strcmp(backing_file_full, filename_full) == 0) {
4303 retval = curr_bs->backing_hd;
4304 break;
4309 g_free(filename_full);
4310 g_free(backing_file_full);
4311 g_free(filename_tmp);
4312 return retval;
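/*
 * Example (an illustrative sketch, assuming a chain built as
 * base.qcow2 <- mid.qcow2 <- top.qcow2, where each image records its
 * backing file by the relative name used at creation time). The lookup is
 * resolved against the chain, not the CWD:
 *
 *     BlockDriverState *base;
 *
 *     base = bdrv_find_backing_image(top_bs, "base.qcow2");
 *     // base now points to the BDS of base.qcow2, because the relative
 *     // name is combined with each image's own location in turn.
 *
 * "top_bs" stands for the BDS of top.qcow2.
 */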
4315 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4317 if (!bs->drv) {
4318 return 0;
4321 if (!bs->backing_hd) {
4322 return 0;
4325 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4328 /**************************************************************/
4329 /* async I/Os */
4331 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4332 QEMUIOVector *qiov, int nb_sectors,
4333 BlockCompletionFunc *cb, void *opaque)
4335 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4337 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4338 cb, opaque, false);
4341 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4342 QEMUIOVector *qiov, int nb_sectors,
4343 BlockCompletionFunc *cb, void *opaque)
4345 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4347 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4348 cb, opaque, true);
4351 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4352 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4353 BlockCompletionFunc *cb, void *opaque)
4355 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4357 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4358 BDRV_REQ_ZERO_WRITE | flags,
4359 cb, opaque, true);
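/*
 * Example (an illustrative sketch): issuing a vectored asynchronous read
 * through the wrapper above. "bs" is assumed to be an open
 * BlockDriverState; these wrappers take offsets and lengths in 512-byte
 * sector units.
 *
 *     static void read_done(void *opaque, int ret)
 *     {
 *         *(int *)opaque = ret;   // ret < 0 on error, 0 on success
 *     }
 *
 *     void submit_read(BlockDriverState *bs, int *status)
 *     {
 *         static QEMUIOVector qiov;    // must stay valid until completion
 *         void *buf = qemu_blockalign(bs, 4096);
 *
 *         qemu_iovec_init(&qiov, 1);
 *         qemu_iovec_add(&qiov, buf, 4096);
 *         bdrv_aio_readv(bs, 0, &qiov, 8, read_done, status);
 *     }
 *
 * The completion callback runs from the BDS's AioContext once the request
 * finishes; the qiov and buffer must not be freed before that.
 */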
4363 typedef struct MultiwriteCB {
4364 int error;
4365 int num_requests;
4366 int num_callbacks;
4367 struct {
4368 BlockCompletionFunc *cb;
4369 void *opaque;
4370 QEMUIOVector *free_qiov;
4371 } callbacks[];
4372 } MultiwriteCB;
4374 static void multiwrite_user_cb(MultiwriteCB *mcb)
4376 int i;
4378 for (i = 0; i < mcb->num_callbacks; i++) {
4379 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4380 if (mcb->callbacks[i].free_qiov) {
4381 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4383 g_free(mcb->callbacks[i].free_qiov);
4387 static void multiwrite_cb(void *opaque, int ret)
4389 MultiwriteCB *mcb = opaque;
4391 trace_multiwrite_cb(mcb, ret);
4393 if (ret < 0 && !mcb->error) {
4394 mcb->error = ret;
4397 mcb->num_requests--;
4398 if (mcb->num_requests == 0) {
4399 multiwrite_user_cb(mcb);
4400 g_free(mcb);
4404 static int multiwrite_req_compare(const void *a, const void *b)
4406 const BlockRequest *req1 = a, *req2 = b;
4409 * Note that we can't simply subtract req2->sector from req1->sector
4410 * here as that could overflow the return value.
4412 if (req1->sector > req2->sector) {
4413 return 1;
4414 } else if (req1->sector < req2->sector) {
4415 return -1;
4416 } else {
4417 return 0;
4422 * Takes a bunch of requests and tries to merge them. Returns the number of
4423 * requests that remain after merging.
4425 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4426 int num_reqs, MultiwriteCB *mcb)
4428 int i, outidx;
4430 // Sort requests by start sector
4431 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4433 // Check if adjacent requests touch the same clusters. If so, combine them;
4434 // only exactly sequential or overlapping requests are merged (no gaps).
4435 outidx = 0;
4436 for (i = 1; i < num_reqs; i++) {
4437 int merge = 0;
4438 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4440 // Handle exactly sequential writes and overlapping writes.
4441 if (reqs[i].sector <= oldreq_last) {
4442 merge = 1;
4445 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4446 merge = 0;
4449 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4450 reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4451 merge = 0;
4454 if (merge) {
4455 size_t size;
4456 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4457 qemu_iovec_init(qiov,
4458 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4460 // Add the first request to the merged one. If the requests are
4461 // overlapping, drop the last sectors of the first request.
4462 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4463 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4465 // We should not need to add any zeros between the two requests
4466 assert(reqs[i].sector <= oldreq_last);
4468 // Add the second request
4469 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4471 // Add tail of first request, if necessary
4472 if (qiov->size < reqs[outidx].qiov->size) {
4473 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4474 reqs[outidx].qiov->size - qiov->size);
4477 reqs[outidx].nb_sectors = qiov->size >> 9;
4478 reqs[outidx].qiov = qiov;
4480 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4481 } else {
4482 outidx++;
4483 reqs[outidx].sector = reqs[i].sector;
4484 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4485 reqs[outidx].qiov = reqs[i].qiov;
4489 return outidx + 1;
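/*
 * Worked example of the merge arithmetic above: request A covers sectors
 * [0, 8) and request B covers sectors [6, 14). oldreq_last is 8 and
 * B.sector (6) <= 8, so the two requests merge:
 *
 *     size = (6 - 0) << 9 = 3072 bytes   -> first 3072 bytes of A's qiov
 *     + all 4096 bytes of B's qiov       -> qiov->size = 7168 bytes
 *     no tail: 7168 >= A's 4096 bytes
 *
 * The merged request therefore spans 7168 >> 9 = 14 sectors: sectors 0-5
 * from A and sectors 6-13 from B, with A's overlapping tail dropped.
 */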
4493 * Submit multiple AIO write requests at once.
4495 * On success, the function returns 0 and all requests in the reqs array have
4496 * been submitted. In the error case, this function returns -1 and any of the
4497 * requests may or may not have been submitted yet. In particular, this means
4498 * that the callback will be called for some of the requests but not for others. The
4499 * caller must check the error field of the BlockRequest to wait for the right
4500 * callbacks (if error != 0, no callback will be called).
4502 * The implementation may modify the contents of the reqs array, e.g. to merge
4503 * requests. However, the fields opaque and error are left unmodified as they
4504 * are used to signal failure for a single request to the caller.
4506 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4508 MultiwriteCB *mcb;
4509 int i;
4511 /* don't submit writes if we don't have a medium */
4512 if (bs->drv == NULL) {
4513 for (i = 0; i < num_reqs; i++) {
4514 reqs[i].error = -ENOMEDIUM;
4516 return -1;
4519 if (num_reqs == 0) {
4520 return 0;
4523 // Create MultiwriteCB structure
4524 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4525 mcb->num_requests = 0;
4526 mcb->num_callbacks = num_reqs;
4528 for (i = 0; i < num_reqs; i++) {
4529 mcb->callbacks[i].cb = reqs[i].cb;
4530 mcb->callbacks[i].opaque = reqs[i].opaque;
4533 // Check for mergeable requests
4534 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4536 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4538 /* Run the aio requests. */
4539 mcb->num_requests = num_reqs;
4540 for (i = 0; i < num_reqs; i++) {
4541 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4542 reqs[i].nb_sectors, reqs[i].flags,
4543 multiwrite_cb, mcb,
4544 true);
4547 return 0;
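/*
 * Example (an illustrative sketch): batching two writes through
 * bdrv_aio_multiwrite(). Each request carries its own callback; a request
 * that failed to be submitted is reported through reqs[i].error instead.
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = write_done, .opaque = &st0 },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = write_done, .opaque = &st1 },
 *     };
 *
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // check reqs[i].error: requests with error != 0 never
 *         // get their callback invoked
 *     }
 *
 * "qiov0"/"qiov1" and the status variables are assumed to be set up by the
 * caller as in the read sketch earlier in this file.
 */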
4550 void bdrv_aio_cancel(BlockAIOCB *acb)
4552 qemu_aio_ref(acb);
4553 bdrv_aio_cancel_async(acb);
4554 while (acb->refcnt > 1) {
4555 if (acb->aiocb_info->get_aio_context) {
4556 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4557 } else if (acb->bs) {
4558 aio_poll(bdrv_get_aio_context(acb->bs), true);
4559 } else {
4560 abort();
4563 qemu_aio_unref(acb);
4566 /* Async version of aio cancel. The caller is not blocked if the acb implements
4567 * cancel_async; otherwise we do nothing and let the request complete normally.
4568 * In either case the completion callback must be called. */
4569 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4571 if (acb->aiocb_info->cancel_async) {
4572 acb->aiocb_info->cancel_async(acb);
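/*
 * Sketch of the two cancellation flavours. The synchronous variant blocks
 * until the request's completion callback has run; the async variant only
 * requests cancellation.
 *
 *     bdrv_aio_cancel(acb);          // returns only after cb(opaque, ret)
 *                                    // has been invoked
 *
 *     bdrv_aio_cancel_async(acb);    // returns immediately; cb will still
 *                                    // be invoked later, possibly with a
 *                                    // negative ret such as -ECANCELED
 *
 * "acb" is the BlockAIOCB returned by a bdrv_aio_*() submission.
 */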
4576 /**************************************************************/
4577 /* async block device emulation */
4579 typedef struct BlockAIOCBSync {
4580 BlockAIOCB common;
4581 QEMUBH *bh;
4582 int ret;
4583 /* vector translation state */
4584 QEMUIOVector *qiov;
4585 uint8_t *bounce;
4586 int is_write;
4587 } BlockAIOCBSync;
4589 static const AIOCBInfo bdrv_em_aiocb_info = {
4590 .aiocb_size = sizeof(BlockAIOCBSync),
4593 static void bdrv_aio_bh_cb(void *opaque)
4595 BlockAIOCBSync *acb = opaque;
4597 if (!acb->is_write && acb->ret >= 0) {
4598 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4600 qemu_vfree(acb->bounce);
4601 acb->common.cb(acb->common.opaque, acb->ret);
4602 qemu_bh_delete(acb->bh);
4603 acb->bh = NULL;
4604 qemu_aio_unref(acb);
4607 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4608 int64_t sector_num,
4609 QEMUIOVector *qiov,
4610 int nb_sectors,
4611 BlockCompletionFunc *cb,
4612 void *opaque,
4613 int is_write)
4616 BlockAIOCBSync *acb;
4618 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4619 acb->is_write = is_write;
4620 acb->qiov = qiov;
4621 acb->bounce = qemu_try_blockalign(bs, qiov->size);
4622 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4624 if (acb->bounce == NULL) {
4625 acb->ret = -ENOMEM;
4626 } else if (is_write) {
4627 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4628 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4629 } else {
4630 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4633 qemu_bh_schedule(acb->bh);
4635 return &acb->common;
4638 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4639 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4640 BlockCompletionFunc *cb, void *opaque)
4642 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4645 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4646 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4647 BlockCompletionFunc *cb, void *opaque)
4649 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4653 typedef struct BlockAIOCBCoroutine {
4654 BlockAIOCB common;
4655 BlockRequest req;
4656 bool is_write;
4657 bool *done;
4658 QEMUBH* bh;
4659 } BlockAIOCBCoroutine;
4661 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4662 .aiocb_size = sizeof(BlockAIOCBCoroutine),
4665 static void bdrv_co_em_bh(void *opaque)
4667 BlockAIOCBCoroutine *acb = opaque;
4669 acb->common.cb(acb->common.opaque, acb->req.error);
4671 qemu_bh_delete(acb->bh);
4672 qemu_aio_unref(acb);
4675 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4676 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4678 BlockAIOCBCoroutine *acb = opaque;
4679 BlockDriverState *bs = acb->common.bs;
4681 if (!acb->is_write) {
4682 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4683 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4684 } else {
4685 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4686 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4689 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4690 qemu_bh_schedule(acb->bh);
4693 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4694 int64_t sector_num,
4695 QEMUIOVector *qiov,
4696 int nb_sectors,
4697 BdrvRequestFlags flags,
4698 BlockCompletionFunc *cb,
4699 void *opaque,
4700 bool is_write)
4702 Coroutine *co;
4703 BlockAIOCBCoroutine *acb;
4705 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4706 acb->req.sector = sector_num;
4707 acb->req.nb_sectors = nb_sectors;
4708 acb->req.qiov = qiov;
4709 acb->req.flags = flags;
4710 acb->is_write = is_write;
4712 co = qemu_coroutine_create(bdrv_co_do_rw);
4713 qemu_coroutine_enter(co, acb);
4715 return &acb->common;
4718 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4720 BlockAIOCBCoroutine *acb = opaque;
4721 BlockDriverState *bs = acb->common.bs;
4723 acb->req.error = bdrv_co_flush(bs);
4724 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4725 qemu_bh_schedule(acb->bh);
4728 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4729 BlockCompletionFunc *cb, void *opaque)
4731 trace_bdrv_aio_flush(bs, opaque);
4733 Coroutine *co;
4734 BlockAIOCBCoroutine *acb;
4736 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4738 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4739 qemu_coroutine_enter(co, acb);
4741 return &acb->common;
4744 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4746 BlockAIOCBCoroutine *acb = opaque;
4747 BlockDriverState *bs = acb->common.bs;
4749 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4750 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4751 qemu_bh_schedule(acb->bh);
4754 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4755 int64_t sector_num, int nb_sectors,
4756 BlockCompletionFunc *cb, void *opaque)
4758 Coroutine *co;
4759 BlockAIOCBCoroutine *acb;
4761 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4763 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4764 acb->req.sector = sector_num;
4765 acb->req.nb_sectors = nb_sectors;
4766 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4767 qemu_coroutine_enter(co, acb);
4769 return &acb->common;
4772 void bdrv_init(void)
4774 module_call_init(MODULE_INIT_BLOCK);
4777 void bdrv_init_with_whitelist(void)
4779 use_bdrv_whitelist = 1;
4780 bdrv_init();
4783 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4784 BlockCompletionFunc *cb, void *opaque)
4786 BlockAIOCB *acb;
4788 acb = g_slice_alloc(aiocb_info->aiocb_size);
4789 acb->aiocb_info = aiocb_info;
4790 acb->bs = bs;
4791 acb->cb = cb;
4792 acb->opaque = opaque;
4793 acb->refcnt = 1;
4794 return acb;
4797 void qemu_aio_ref(void *p)
4799 BlockAIOCB *acb = p;
4800 acb->refcnt++;
4803 void qemu_aio_unref(void *p)
4805 BlockAIOCB *acb = p;
4806 assert(acb->refcnt > 0);
4807 if (--acb->refcnt == 0) {
4808 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
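/*
 * Sketch of the AIOCB lifecycle implied by the three helpers above. An AIO
 * implementation allocates its acb with a refcount of 1, delivers the
 * completion and drops its reference; anyone who needs the acb to outlive
 * completion (as bdrv_aio_cancel() does) takes an extra reference first.
 *
 *     acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);  // refcnt == 1
 *     ...
 *     acb->cb(acb->opaque, ret);   // deliver completion
 *     qemu_aio_unref(acb);         // refcnt drops to 0 -> acb is freed
 *
 * "my_aiocb_info" stands for a driver's AIOCBInfo with aiocb_size set to
 * the size of its acb structure.
 */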
4812 /**************************************************************/
4813 /* Coroutine block device emulation */
4815 typedef struct CoroutineIOCompletion {
4816 Coroutine *coroutine;
4817 int ret;
4818 } CoroutineIOCompletion;
4820 static void bdrv_co_io_em_complete(void *opaque, int ret)
4822 CoroutineIOCompletion *co = opaque;
4824 co->ret = ret;
4825 qemu_coroutine_enter(co->coroutine, NULL);
4828 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4829 int nb_sectors, QEMUIOVector *iov,
4830 bool is_write)
4832 CoroutineIOCompletion co = {
4833 .coroutine = qemu_coroutine_self(),
4835 BlockAIOCB *acb;
4837 if (is_write) {
4838 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4839 bdrv_co_io_em_complete, &co);
4840 } else {
4841 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4842 bdrv_co_io_em_complete, &co);
4845 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4846 if (!acb) {
4847 return -EIO;
4849 qemu_coroutine_yield();
4851 return co.ret;
4854 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4855 int64_t sector_num, int nb_sectors,
4856 QEMUIOVector *iov)
4858 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4861 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4862 int64_t sector_num, int nb_sectors,
4863 QEMUIOVector *iov)
4865 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4868 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4870 RwCo *rwco = opaque;
4872 rwco->ret = bdrv_co_flush(rwco->bs);
4875 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4877 int ret;
4879 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4880 return 0;
4883 /* Write back cached data to the OS even with cache=unsafe */
4884 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4885 if (bs->drv->bdrv_co_flush_to_os) {
4886 ret = bs->drv->bdrv_co_flush_to_os(bs);
4887 if (ret < 0) {
4888 return ret;
4892 /* But don't actually force it to the disk with cache=unsafe */
4893 if (bs->open_flags & BDRV_O_NO_FLUSH) {
4894 goto flush_parent;
4897 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4898 if (bs->drv->bdrv_co_flush_to_disk) {
4899 ret = bs->drv->bdrv_co_flush_to_disk(bs);
4900 } else if (bs->drv->bdrv_aio_flush) {
4901 BlockAIOCB *acb;
4902 CoroutineIOCompletion co = {
4903 .coroutine = qemu_coroutine_self(),
4906 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4907 if (acb == NULL) {
4908 ret = -EIO;
4909 } else {
4910 qemu_coroutine_yield();
4911 ret = co.ret;
4913 } else {
4915 * Some block drivers always operate in either writethrough or unsafe
4916 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4917 * know how the server works (because the behaviour is hardcoded or
4918 * depends on server-side configuration), so we can't ensure that
4919 * everything is safe on disk. Returning an error doesn't work because
4920 * that would break guests even if the server operates in writethrough
4921 * mode.
4923 * Let's hope the user knows what he's doing.
4925 ret = 0;
4927 if (ret < 0) {
4928 return ret;
4931 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4932 * set in the case of cache=unsafe, so there are no useless flushes.
4934 flush_parent:
4935 return bdrv_co_flush(bs->file);
4938 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4940 Error *local_err = NULL;
4941 int ret;
4943 if (!bs->drv) {
4944 return;
4947 if (!(bs->open_flags & BDRV_O_INCOMING)) {
4948 return;
4950 bs->open_flags &= ~BDRV_O_INCOMING;
4952 if (bs->drv->bdrv_invalidate_cache) {
4953 bs->drv->bdrv_invalidate_cache(bs, &local_err);
4954 } else if (bs->file) {
4955 bdrv_invalidate_cache(bs->file, &local_err);
4957 if (local_err) {
4958 error_propagate(errp, local_err);
4959 return;
4962 ret = refresh_total_sectors(bs, bs->total_sectors);
4963 if (ret < 0) {
4964 error_setg_errno(errp, -ret, "Could not refresh total sector count");
4965 return;
4969 void bdrv_invalidate_cache_all(Error **errp)
4971 BlockDriverState *bs;
4972 Error *local_err = NULL;
4974 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4975 AioContext *aio_context = bdrv_get_aio_context(bs);
4977 aio_context_acquire(aio_context);
4978 bdrv_invalidate_cache(bs, &local_err);
4979 aio_context_release(aio_context);
4980 if (local_err) {
4981 error_propagate(errp, local_err);
4982 return;
4987 int bdrv_flush(BlockDriverState *bs)
4989 Coroutine *co;
4990 RwCo rwco = {
4991 .bs = bs,
4992 .ret = NOT_DONE,
4995 if (qemu_in_coroutine()) {
4996 /* Fast-path if already in coroutine context */
4997 bdrv_flush_co_entry(&rwco);
4998 } else {
4999 AioContext *aio_context = bdrv_get_aio_context(bs);
5001 co = qemu_coroutine_create(bdrv_flush_co_entry);
5002 qemu_coroutine_enter(co, &rwco);
5003 while (rwco.ret == NOT_DONE) {
5004 aio_poll(aio_context, true);
5008 return rwco.ret;
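/*
 * Example (an illustrative sketch): bdrv_flush() may be called from both
 * coroutine and non-coroutine context; the function above takes the fast
 * path or spins the AioContext accordingly.
 *
 *     int ret = bdrv_flush(bs);
 *     if (ret < 0) {
 *         error_report("flush failed: %s", strerror(-ret));
 *     }
 *
 * error_report() is the usual QEMU error-printing helper; any reporting
 * mechanism works here.
 */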
5011 typedef struct DiscardCo {
5012 BlockDriverState *bs;
5013 int64_t sector_num;
5014 int nb_sectors;
5015 int ret;
5016 } DiscardCo;
5017 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5019 DiscardCo *rwco = opaque;
5021 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5024 /* If no limit is specified in the BlockLimits, use a default
5025 * of 32768 512-byte sectors (16 MiB) per request.
5027 #define MAX_DISCARD_DEFAULT 32768
5029 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5030 int nb_sectors)
5032 int max_discard;
5034 if (!bs->drv) {
5035 return -ENOMEDIUM;
5036 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5037 return -EIO;
5038 } else if (bs->read_only) {
5039 return -EROFS;
5042 bdrv_reset_dirty(bs, sector_num, nb_sectors);
5044 /* Do nothing if disabled. */
5045 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5046 return 0;
5049 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5050 return 0;
5053 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5054 while (nb_sectors > 0) {
5055 int ret;
5056 int num = nb_sectors;
5058 /* align request */
5059 if (bs->bl.discard_alignment &&
5060 num >= bs->bl.discard_alignment &&
5061 sector_num % bs->bl.discard_alignment) {
5062 if (num > bs->bl.discard_alignment) {
5063 num = bs->bl.discard_alignment;
5065 num -= sector_num % bs->bl.discard_alignment;
5068 /* limit request size */
5069 if (num > max_discard) {
5070 num = max_discard;
5073 if (bs->drv->bdrv_co_discard) {
5074 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5075 } else {
5076 BlockAIOCB *acb;
5077 CoroutineIOCompletion co = {
5078 .coroutine = qemu_coroutine_self(),
5081 acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5082 bdrv_co_io_em_complete, &co);
5083 if (acb == NULL) {
5084 return -EIO;
5085 } else {
5086 qemu_coroutine_yield();
5087 ret = co.ret;
5090 if (ret && ret != -ENOTSUP) {
5091 return ret;
5094 sector_num += num;
5095 nb_sectors -= num;
5097 return 0;
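/*
 * Worked example of the alignment clipping above: with
 * bs->bl.discard_alignment == 2048 sectors (1 MiB), sector_num == 1000 and
 * nb_sectors == 5000:
 *
 *     num = 5000 -> clipped to 2048 -> minus (1000 % 2048) -> 1048
 *
 * so the first chunk discards sectors [1000, 2048) and every following
 * chunk starts on a 2048-sector boundary, each at most max_discard sectors.
 */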
5100 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5102 Coroutine *co;
5103 DiscardCo rwco = {
5104 .bs = bs,
5105 .sector_num = sector_num,
5106 .nb_sectors = nb_sectors,
5107 .ret = NOT_DONE,
5110 if (qemu_in_coroutine()) {
5111 /* Fast-path if already in coroutine context */
5112 bdrv_discard_co_entry(&rwco);
5113 } else {
5114 AioContext *aio_context = bdrv_get_aio_context(bs);
5116 co = qemu_coroutine_create(bdrv_discard_co_entry);
5117 qemu_coroutine_enter(co, &rwco);
5118 while (rwco.ret == NOT_DONE) {
5119 aio_poll(aio_context, true);
5123 return rwco.ret;
5126 /**************************************************************/
5127 /* removable device support */
5130 * Return TRUE if the media is present
5132 int bdrv_is_inserted(BlockDriverState *bs)
5134 BlockDriver *drv = bs->drv;
5136 if (!drv)
5137 return 0;
5138 if (!drv->bdrv_is_inserted)
5139 return 1;
5140 return drv->bdrv_is_inserted(bs);
5144 * Return whether the media changed since the last call to this
5145 * function, or -ENOTSUP if we don't know. Most drivers don't know.
5147 int bdrv_media_changed(BlockDriverState *bs)
5149 BlockDriver *drv = bs->drv;
5151 if (drv && drv->bdrv_media_changed) {
5152 return drv->bdrv_media_changed(bs);
5154 return -ENOTSUP;
5158 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5160 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5162 BlockDriver *drv = bs->drv;
5163 const char *device_name;
5165 if (drv && drv->bdrv_eject) {
5166 drv->bdrv_eject(bs, eject_flag);
5169 device_name = bdrv_get_device_name(bs);
5170 if (device_name[0] != '\0') {
5171 qapi_event_send_device_tray_moved(device_name,
5172 eject_flag, &error_abort);
5177 * Lock or unlock the media (if it is locked, the user won't be able
5178 * to eject it manually).
5180 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5182 BlockDriver *drv = bs->drv;
5184 trace_bdrv_lock_medium(bs, locked);
5186 if (drv && drv->bdrv_lock_medium) {
5187 drv->bdrv_lock_medium(bs, locked);
5191 /* needed for generic scsi interface */
5193 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5195 BlockDriver *drv = bs->drv;
5197 if (drv && drv->bdrv_ioctl)
5198 return drv->bdrv_ioctl(bs, req, buf);
5199 return -ENOTSUP;
5202 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5203 unsigned long int req, void *buf,
5204 BlockCompletionFunc *cb, void *opaque)
5206 BlockDriver *drv = bs->drv;
5208 if (drv && drv->bdrv_aio_ioctl)
5209 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5210 return NULL;
5213 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5215 bs->guest_block_size = align;
5218 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5220 return qemu_memalign(bdrv_opt_mem_align(bs), size);
5223 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5225 return memset(qemu_blockalign(bs, size), 0, size);
5228 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5230 size_t align = bdrv_opt_mem_align(bs);
5232 /* Ensure that NULL is never returned on success */
5233 assert(align > 0);
5234 if (size == 0) {
5235 size = align;
5238 return qemu_try_memalign(align, size);
5241 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5243 void *mem = qemu_try_blockalign(bs, size);
5245 if (mem) {
5246 memset(mem, 0, size);
5249 return mem;
5253 * Check if all memory in this vector is sector aligned.
5255 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5257 int i;
5258 size_t alignment = bdrv_opt_mem_align(bs);
5260 for (i = 0; i < qiov->niov; i++) {
5261 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5262 return false;
5264 if (qiov->iov[i].iov_len % alignment) {
5265 return false;
5269 return true;
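/*
 * Example (an illustrative sketch): building a vector that satisfies the
 * alignment check above by allocating through qemu_blockalign(), which
 * honours bdrv_opt_mem_align().
 *
 *     QEMUIOVector qiov;
 *     void *buf = qemu_blockalign(bs, 4096);   // aligned base and length
 *
 *     qemu_iovec_init(&qiov, 1);
 *     qemu_iovec_add(&qiov, buf, 4096);
 *     assert(bdrv_qiov_is_aligned(bs, &qiov));
 *
 * An unaligned iov_base, or an iov_len that is not a multiple of the
 * alignment, would make the check return false and force a bounce buffer.
 */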
5272 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5273 Error **errp)
5275 int64_t bitmap_size;
5276 BdrvDirtyBitmap *bitmap;
5278 assert((granularity & (granularity - 1)) == 0);
5280 granularity >>= BDRV_SECTOR_BITS;
5281 assert(granularity);
5282 bitmap_size = bdrv_nb_sectors(bs);
5283 if (bitmap_size < 0) {
5284 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5285 errno = -bitmap_size;
5286 return NULL;
5288 bitmap = g_new0(BdrvDirtyBitmap, 1);
5289 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5290 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5291 return bitmap;
5294 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5296 BdrvDirtyBitmap *bm, *next;
5297 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5298 if (bm == bitmap) {
5299 QLIST_REMOVE(bitmap, list);
5300 hbitmap_free(bitmap->bitmap);
5301 g_free(bitmap);
5302 return;
5307 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5309 BdrvDirtyBitmap *bm;
5310 BlockDirtyInfoList *list = NULL;
5311 BlockDirtyInfoList **plist = &list;
5313 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5314 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5315 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5316 info->count = bdrv_get_dirty_count(bs, bm);
5317 info->granularity =
5318 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5319 entry->value = info;
5320 *plist = entry;
5321 plist = &entry->next;
5324 return list;
5327 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5329 if (bitmap) {
5330 return hbitmap_get(bitmap->bitmap, sector);
5331 } else {
5332 return 0;
5336 void bdrv_dirty_iter_init(BlockDriverState *bs,
5337 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5339 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5342 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5343 int nr_sectors)
5345 BdrvDirtyBitmap *bitmap;
5346 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5347 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5351 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5353 BdrvDirtyBitmap *bitmap;
5354 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5355 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5359 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5361 return hbitmap_count(bitmap->bitmap);
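/*
 * Example (an illustrative sketch): tracking writes with a dirty bitmap at
 * 64 KiB granularity and walking the dirty sectors. hbitmap_iter_next() is
 * assumed to return the next dirty sector number, or a negative value once
 * the iterator is exhausted.
 *
 *     BdrvDirtyBitmap *bitmap = bdrv_create_dirty_bitmap(bs, 65536, &err);
 *     HBitmapIter hbi;
 *     int64_t sector;
 *
 *     // ... guest writes mark sectors dirty via bdrv_set_dirty() ...
 *
 *     bdrv_dirty_iter_init(bs, bitmap, &hbi);
 *     while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
 *         // copy out the granularity-sized chunk containing "sector"
 *     }
 *     bdrv_release_dirty_bitmap(bs, bitmap);
 */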
5364 /* Get a reference to bs */
5365 void bdrv_ref(BlockDriverState *bs)
5367 bs->refcnt++;
5370 /* Release a previously grabbed reference to bs.
5371 * If after releasing, reference count is zero, the BlockDriverState is
5372 * deleted. */
5373 void bdrv_unref(BlockDriverState *bs)
5375 if (!bs) {
5376 return;
5378 assert(bs->refcnt > 0);
5379 if (--bs->refcnt == 0) {
5380 bdrv_delete(bs);
5384 struct BdrvOpBlocker {
5385 Error *reason;
5386 QLIST_ENTRY(BdrvOpBlocker) list;
5389 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5391 BdrvOpBlocker *blocker;
5392 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5393 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5394 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5395 if (errp) {
5396 error_setg(errp, "Device '%s' is busy: %s",
5397 bdrv_get_device_name(bs),
5398 error_get_pretty(blocker->reason));
5400 return true;
5402 return false;
5405 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5407 BdrvOpBlocker *blocker;
5408 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5410 blocker = g_new0(BdrvOpBlocker, 1);
5411 blocker->reason = reason;
5412 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5415 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5417 BdrvOpBlocker *blocker, *next;
5418 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5419 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5420 if (blocker->reason == reason) {
5421 QLIST_REMOVE(blocker, list);
5422 g_free(blocker);
5427 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5429 int i;
5430 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5431 bdrv_op_block(bs, i, reason);
5435 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5437 int i;
5438 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5439 bdrv_op_unblock(bs, i, reason);
5443 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5445 int i;
5447 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5448 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5449 return false;
5452 return true;
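/*
 * Example (an illustrative sketch): how a block job would use the op
 * blockers above to fence off conflicting operations while it runs. The
 * same Error object serves as the key for unblocking.
 *
 *     Error *blocker = NULL;
 *
 *     error_setg(&blocker, "node is in use by a block job");
 *     bdrv_op_block_all(bs, blocker);
 *     // ... long-running operation; bdrv_op_is_blocked() now fails for
 *     // every op type with the message above ...
 *     bdrv_op_unblock_all(bs, blocker);
 *     error_free(blocker);
 */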
5455 void bdrv_iostatus_enable(BlockDriverState *bs)
5457 bs->iostatus_enabled = true;
5458 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5461 /* The I/O status is only enabled if the drive explicitly
5462 * enables it _and_ the VM is configured to stop on errors */
5463 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5465 return (bs->iostatus_enabled &&
5466 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5467 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5468 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5471 void bdrv_iostatus_disable(BlockDriverState *bs)
5473 bs->iostatus_enabled = false;
5476 void bdrv_iostatus_reset(BlockDriverState *bs)
5478 if (bdrv_iostatus_is_enabled(bs)) {
5479 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5480 if (bs->job) {
5481 block_job_iostatus_reset(bs->job);
5486 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5488 assert(bdrv_iostatus_is_enabled(bs));
5489 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5490 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5491 BLOCK_DEVICE_IO_STATUS_FAILED;
5495 void bdrv_img_create(const char *filename, const char *fmt,
5496 const char *base_filename, const char *base_fmt,
5497 char *options, uint64_t img_size, int flags,
5498 Error **errp, bool quiet)
5500 QemuOptsList *create_opts = NULL;
5501 QemuOpts *opts = NULL;
5502 const char *backing_fmt, *backing_file;
5503 int64_t size;
5504 BlockDriver *drv, *proto_drv;
5505 BlockDriver *backing_drv = NULL;
5506 Error *local_err = NULL;
5507 int ret = 0;
5509 /* Find driver and parse its options */
5510 drv = bdrv_find_format(fmt);
5511 if (!drv) {
5512 error_setg(errp, "Unknown file format '%s'", fmt);
5513 return;
5516 proto_drv = bdrv_find_protocol(filename, true);
5517 if (!proto_drv) {
5518 error_setg(errp, "Unknown protocol '%s'", filename);
5519 return;
5522 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5523 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5525 /* Create parameter list with default values */
5526 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5527 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5529 /* Parse -o options */
5530 if (options) {
5531 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5532 error_setg(errp, "Invalid options for file format '%s'", fmt);
5533 goto out;
5537 if (base_filename) {
5538 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5539 error_setg(errp, "Backing file not supported for file format '%s'",
5540 fmt);
5541 goto out;
5545 if (base_fmt) {
5546 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5547 error_setg(errp, "Backing file format not supported for file "
5548 "format '%s'", fmt);
5549 goto out;
5553 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5554 if (backing_file) {
5555 if (!strcmp(filename, backing_file)) {
5556 error_setg(errp, "Error: Trying to create an image with the "
5557 "same filename as the backing file");
5558 goto out;
5562 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5563 if (backing_fmt) {
5564 backing_drv = bdrv_find_format(backing_fmt);
5565 if (!backing_drv) {
5566 error_setg(errp, "Unknown backing file format '%s'",
5567 backing_fmt);
5568 goto out;
5572 // The size for the image must always be specified, with one exception:
5573 // If we are using a backing file, we can obtain the size from there
5574 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5575 if (size == -1) {
5576 if (backing_file) {
5577 BlockDriverState *bs;
5578 int64_t size;
5579 int back_flags;
5581 /* backing files always opened read-only */
5582 back_flags =
5583 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5585 bs = NULL;
5586 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5587 backing_drv, &local_err);
5588 if (ret < 0) {
5589 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5590 backing_file,
5591 error_get_pretty(local_err));
5592 error_free(local_err);
5593 local_err = NULL;
5594 goto out;
5596 size = bdrv_getlength(bs);
5597 if (size < 0) {
5598 error_setg_errno(errp, -size, "Could not get size of '%s'",
5599 backing_file);
5600 bdrv_unref(bs);
5601 goto out;
5604 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
5606 bdrv_unref(bs);
5607 } else {
5608 error_setg(errp, "Image creation needs a size parameter");
5609 goto out;
5613 if (!quiet) {
5614 printf("Formatting '%s', fmt=%s ", filename, fmt);
5615 qemu_opts_print(opts);
5616 puts("");
5619 ret = bdrv_create(drv, filename, opts, &local_err);
5621 if (ret == -EFBIG) {
5622 /* This is generally a better message than whatever the driver would
5623 * deliver (especially because of the cluster_size_hint), since that
5624 * is most probably not much different from "image too large". */
5625 const char *cluster_size_hint = "";
5626 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5627 cluster_size_hint = " (try using a larger cluster size)";
5629 error_setg(errp, "The image size is too large for file format '%s'"
5630 "%s", fmt, cluster_size_hint);
5631 error_free(local_err);
5632 local_err = NULL;
5635 out:
5636 qemu_opts_del(opts);
5637 qemu_opts_free(create_opts);
5638 if (local_err) {
5639 error_propagate(errp, local_err);
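/*
 * Example (an illustrative sketch): creating a 1 GiB qcow2 image backed by
 * an existing file, mirroring what qemu-img create does.
 *
 *     Error *err = NULL;
 *
 *     bdrv_img_create("overlay.qcow2", "qcow2",
 *                     "base.qcow2", "qcow2",     // backing file and format
 *                     NULL,                      // no extra -o options
 *                     1024 * 1024 * 1024,        // size in bytes
 *                     0, &err, true);            // flags, errp, quiet
 *     if (err) {
 *         error_report("%s", error_get_pretty(err));
 *         error_free(err);
 *     }
 *
 * With a backing file the size could instead be left unspecified (qemu-img
 * passes (uint64_t)-1) and taken from the backing image, as the code above
 * shows.
 */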
5643 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5645 return bs->aio_context;
5648 void bdrv_detach_aio_context(BlockDriverState *bs)
5650 BdrvAioNotifier *baf;
5652 if (!bs->drv) {
5653 return;
5656 QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5657 baf->detach_aio_context(baf->opaque);
5660 if (bs->io_limits_enabled) {
5661 throttle_detach_aio_context(&bs->throttle_state);
5663 if (bs->drv->bdrv_detach_aio_context) {
5664 bs->drv->bdrv_detach_aio_context(bs);
5666 if (bs->file) {
5667 bdrv_detach_aio_context(bs->file);
5669 if (bs->backing_hd) {
5670 bdrv_detach_aio_context(bs->backing_hd);
5673 bs->aio_context = NULL;
5676 void bdrv_attach_aio_context(BlockDriverState *bs,
5677 AioContext *new_context)
5679 BdrvAioNotifier *ban;
5681 if (!bs->drv) {
5682 return;
5685 bs->aio_context = new_context;
5687 if (bs->backing_hd) {
5688 bdrv_attach_aio_context(bs->backing_hd, new_context);
5690 if (bs->file) {
5691 bdrv_attach_aio_context(bs->file, new_context);
5693 if (bs->drv->bdrv_attach_aio_context) {
5694 bs->drv->bdrv_attach_aio_context(bs, new_context);
5696 if (bs->io_limits_enabled) {
5697 throttle_attach_aio_context(&bs->throttle_state, new_context);
5700 QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5701 ban->attached_aio_context(new_context, ban->opaque);
5705 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5707 bdrv_drain_all(); /* ensure there are no in-flight requests */
5709 bdrv_detach_aio_context(bs);
5711 /* This function executes in the old AioContext so acquire the new one in
5712 * case it runs in a different thread.
5714 aio_context_acquire(new_context);
5715 bdrv_attach_aio_context(bs, new_context);
5716 aio_context_release(new_context);
5719 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5720 void (*attached_aio_context)(AioContext *new_context, void *opaque),
5721 void (*detach_aio_context)(void *opaque), void *opaque)
5723 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5724 *ban = (BdrvAioNotifier){
5725 .attached_aio_context = attached_aio_context,
5726 .detach_aio_context = detach_aio_context,
5727 .opaque = opaque
5730 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5733 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5734 void (*attached_aio_context)(AioContext *,
5735 void *),
5736 void (*detach_aio_context)(void *),
5737 void *opaque)
5739 BdrvAioNotifier *ban, *ban_next;
5741 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5742 if (ban->attached_aio_context == attached_aio_context &&
5743 ban->detach_aio_context == detach_aio_context &&
5744 ban->opaque == opaque)
5746 QLIST_REMOVE(ban, list);
5747 g_free(ban);
5749 return;
5753 abort();
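/*
 * Example (an illustrative sketch): a device that keeps per-context state
 * registers a notifier pair so it can follow the BDS when
 * bdrv_set_aio_context() moves it to another thread's context.
 *
 *     static void my_attach(AioContext *ctx, void *opaque) {
 *         // re-arm timers / register fd handlers in ctx
 *     }
 *     static void my_detach(void *opaque) {
 *         // quiesce and unregister from the old context
 *     }
 *
 *     bdrv_add_aio_context_notifier(bs, my_attach, my_detach, dev);
 *     ...
 *     bdrv_remove_aio_context_notifier(bs, my_attach, my_detach, dev);
 *
 * The remove call must pass exactly the triple that was registered,
 * otherwise the lookup above aborts.
 */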
5756 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5757 NotifierWithReturn *notifier)
5759 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5762 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts)
5764 if (!bs->drv->bdrv_amend_options) {
5765 return -ENOTSUP;
5767 return bs->drv->bdrv_amend_options(bs, opts);
5770 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5771 * of block filters and by bdrv_is_first_non_filter.
5772 * It is used to test whether the given bs is the candidate or to recurse
5773 * further into the node graph.
5775 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5776 BlockDriverState *candidate)
5778 /* return false if basic checks fail */
5779 if (!bs || !bs->drv) {
5780 return false;
5783 /* the code reached a non-filter block driver -> check if the bs is
5784 * the same as the candidate. This is the recursion termination condition.
5786 if (!bs->drv->is_filter) {
5787 return bs == candidate;
5789 /* Down this path the driver is a block filter driver */
5791 /* If the block filter recursion method is defined use it to recurse down
5792 * the node graph.
5794 if (bs->drv->bdrv_recurse_is_first_non_filter) {
5795 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5798 /* the driver is a block filter but does not allow recursion -> return false
5800 return false;
5803 /* This function checks if the candidate is the first non-filter bs down its
5804 * bs chain. Since we don't have pointers to parents, it explores all bs chains
5805 * from the top. Some filters can choose not to pass down the recursion.
5807 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5809 BlockDriverState *bs;
5811 /* walk down the bs forest recursively */
5812 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5813 bool perm;
5815 /* try to recurse in this top level bs */
5816 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5818 /* candidate is the first non filter */
5819 if (perm) {
5820 return true;
5824 return false;
5827 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5829 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5830 if (!to_replace_bs) {
5831 error_setg(errp, "Node name '%s' not found", node_name);
5832 return NULL;
5835 if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5836 return NULL;
5839 /* We don't want an arbitrary node of the BDS chain to be replaced, only the
5840 * topmost non-filter, in order to prevent data corruption.
5841 * Another benefit is that this test excludes backing files, which are
5842 * blocked by the backing blockers.
5844 if (!bdrv_is_first_non_filter(to_replace_bs)) {
5845 error_setg(errp, "Only top most non filter can be replaced");
5846 return NULL;
5849 return to_replace_bs;
5852 void bdrv_io_plug(BlockDriverState *bs)
5854 BlockDriver *drv = bs->drv;
5855 if (drv && drv->bdrv_io_plug) {
5856 drv->bdrv_io_plug(bs);
5857 } else if (bs->file) {
5858 bdrv_io_plug(bs->file);
5862 void bdrv_io_unplug(BlockDriverState *bs)
5864 BlockDriver *drv = bs->drv;
5865 if (drv && drv->bdrv_io_unplug) {
5866 drv->bdrv_io_unplug(bs);
5867 } else if (bs->file) {
5868 bdrv_io_unplug(bs->file);
5872 void bdrv_flush_io_queue(BlockDriverState *bs)
5874 BlockDriver *drv = bs->drv;
5875 if (drv && drv->bdrv_flush_io_queue) {
5876 drv->bdrv_flush_io_queue(bs);
5877 } else if (bs->file) {
5878 bdrv_flush_io_queue(bs->file);
5882 static bool append_open_options(QDict *d, BlockDriverState *bs)
5884 const QDictEntry *entry;
5885 bool found_any = false;
5887 for (entry = qdict_first(bs->options); entry;
5888 entry = qdict_next(bs->options, entry))
5890 /* Only take options for this level and exclude all non-driver-specific
5891 * options */
5892 if (!strchr(qdict_entry_key(entry), '.') &&
5893 strcmp(qdict_entry_key(entry), "node-name"))
5895 qobject_incref(qdict_entry_value(entry));
5896 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5897 found_any = true;
5901 return found_any;
5904 /* Updates the following BDS fields:
5905 * - exact_filename: A filename which may be used for opening a block device
5906 * which (mostly) equals the given BDS (even without any
5907 * other options; so reading and writing must return the same
5908 * results, but caching etc. may be different)
5909 * - full_open_options: Options which, when given when opening a block device
5910 * (without a filename), result in a BDS (mostly)
5911 * equalling the given one
5912 * - filename: If exact_filename is set, it is copied here. Otherwise,
5913 * full_open_options is converted to a JSON object, prefixed with
5914 * "json:" (for use through the JSON pseudo protocol) and put here.
5916 void bdrv_refresh_filename(BlockDriverState *bs)
5918 BlockDriver *drv = bs->drv;
5919 QDict *opts;
5921 if (!drv) {
5922 return;
5925 /* This BDS's file name will most probably depend on its file's name, so
5926 * refresh that first */
5927 if (bs->file) {
5928 bdrv_refresh_filename(bs->file);
5931 if (drv->bdrv_refresh_filename) {
5932 /* Obsolete information is of no use here, so drop the old file name
5933 * information before refreshing it */
5934 bs->exact_filename[0] = '\0';
5935 if (bs->full_open_options) {
5936 QDECREF(bs->full_open_options);
5937 bs->full_open_options = NULL;
5940 drv->bdrv_refresh_filename(bs);
5941 } else if (bs->file) {
5942 /* Try to reconstruct valid information from the underlying file */
5943 bool has_open_options;
5945 bs->exact_filename[0] = '\0';
5946 if (bs->full_open_options) {
5947 QDECREF(bs->full_open_options);
5948 bs->full_open_options = NULL;
5951 opts = qdict_new();
5952 has_open_options = append_open_options(opts, bs);
5954 /* If no specific options have been given for this BDS, the filename of
5955 * the underlying file should suffice for this one as well */
5956 if (bs->file->exact_filename[0] && !has_open_options) {
5957 strcpy(bs->exact_filename, bs->file->exact_filename);
5959 /* Reconstructing the full options QDict is simple for most format block
5960 * drivers, as long as the full options are known for the underlying
5961 * file BDS. The full options QDict of that file BDS should somehow
5962 * contain a representation of the filename, therefore the following
5963 * suffices without querying the (exact_)filename of this BDS. */
5964 if (bs->file->full_open_options) {
5965 qdict_put_obj(opts, "driver",
5966 QOBJECT(qstring_from_str(drv->format_name)));
5967 QINCREF(bs->file->full_open_options);
5968 qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
5970 bs->full_open_options = opts;
5971 } else {
5972 QDECREF(opts);
5974 } else if (!bs->full_open_options && qdict_size(bs->options)) {
5975 /* There is no underlying file BDS (at least referenced by BDS.file),
5976 * so the full options QDict should be equal to the options given
5977 * specifically for this block device when it was opened (plus the
5978 * driver specification).
5979 * Because those options don't change, there is no need to update
5980 * full_open_options when it's already set. */
5982 opts = qdict_new();
5983 append_open_options(opts, bs);
5984 qdict_put_obj(opts, "driver",
5985 QOBJECT(qstring_from_str(drv->format_name)));
5987 if (bs->exact_filename[0]) {
5988 /* This may not work for all block protocol drivers (some may
5989 * require this filename to be parsed), but we have to find some
5990 * default solution here, so just include it. If some block driver
5991 * does not support pure options without any filename at all or
5992 * needs some special format of the options QDict, it needs to
5993 * implement the driver-specific bdrv_refresh_filename() function.
5995 qdict_put_obj(opts, "filename",
5996 QOBJECT(qstring_from_str(bs->exact_filename)));
5999 bs->full_open_options = opts;
6002 if (bs->exact_filename[0]) {
6003 pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6004 } else if (bs->full_open_options) {
6005 QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6006 snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6007 qstring_get_str(json));
6008 QDECREF(json);
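/*
 * For instance, a qcow2 image opened over a plain file whose options cannot
 * be expressed as a plain filename ends up with something like (an
 * illustrative sketch):
 *
 *     bs->filename == "json:{\"driver\": \"qcow2\", "
 *                     "\"file\": {\"driver\": \"file\", "
 *                     "\"filename\": \"test.qcow2\"}}"
 *
 * whereas whenever exact_filename could be determined it is copied to
 * bs->filename verbatim.
 */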
6012 /* The purpose of this accessor function is to allow the device models to
6013 * access the BlockAcctStats structure embedded inside a BlockDriverState
6014 * without being aware of the BlockDriverState structure layout.
6015 * It will go away when the BlockAcctStats structure is moved inside
6016 * the device models.
6016 * the device models.
6018 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6020 return &bs->stats;