block: New bdrv_nb_sectors()
[qemu/ar7.git] / block.c
blob 1239bb36610261fedbdda6d4ede1f5766747b35b
1 /*
2 * QEMU System Emulator block driver
4 * Copyright (c) 2003 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "block/block_int.h"
28 #include "block/blockjob.h"
29 #include "qemu/module.h"
30 #include "qapi/qmp/qjson.h"
31 #include "sysemu/sysemu.h"
32 #include "qemu/notify.h"
33 #include "block/coroutine.h"
34 #include "block/qapi.h"
35 #include "qmp-commands.h"
36 #include "qemu/timer.h"
37 #include "qapi-event.h"
39 #ifdef CONFIG_BSD
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #include <sys/ioctl.h>
43 #include <sys/queue.h>
44 #ifndef __DragonFly__
45 #include <sys/disk.h>
46 #endif
47 #endif
49 #ifdef _WIN32
50 #include <windows.h>
51 #endif
53 struct BdrvDirtyBitmap {
54 HBitmap *bitmap;
55 QLIST_ENTRY(BdrvDirtyBitmap) list;
58 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
60 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
61 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
63 BlockDriverCompletionFunc *cb, void *opaque);
64 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
66 BlockDriverCompletionFunc *cb, void *opaque);
67 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68 int64_t sector_num, int nb_sectors,
69 QEMUIOVector *iov);
70 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
73 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
75 BdrvRequestFlags flags);
76 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
78 BdrvRequestFlags flags);
79 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80 int64_t sector_num,
81 QEMUIOVector *qiov,
82 int nb_sectors,
83 BdrvRequestFlags flags,
84 BlockDriverCompletionFunc *cb,
85 void *opaque,
86 bool is_write);
87 static void coroutine_fn bdrv_co_do_rw(void *opaque);
88 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
89 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
91 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
94 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
97 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98 QLIST_HEAD_INITIALIZER(bdrv_drivers);
100 /* If non-zero, use only whitelisted block drivers */
101 static int use_bdrv_whitelist;
103 #ifdef _WIN32
104 static int is_windows_drive_prefix(const char *filename)
106 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
107 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
108 filename[1] == ':');
111 int is_windows_drive(const char *filename)
113 if (is_windows_drive_prefix(filename) &&
114 filename[2] == '\0')
115 return 1;
116 if (strstart(filename, "\\\\.\\", NULL) ||
117 strstart(filename, "//./", NULL))
118 return 1;
119 return 0;
121 #endif
123 /* throttling disk I/O limits */
124 void bdrv_set_io_limits(BlockDriverState *bs,
125 ThrottleConfig *cfg)
127 int i;
129 throttle_config(&bs->throttle_state, cfg);
131 for (i = 0; i < 2; i++) {
132 qemu_co_enter_next(&bs->throttled_reqs[i]);
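/*
 * Illustrative sketch (editor's annotation, not part of the original file):
 * how a caller might cap a drive at 10 MiB/s total bandwidth and 100 IOPS.
 * The ThrottleConfig/LeakyBucket field names follow qemu/throttle.h of this
 * era and should be treated as assumptions.
 */
static void example_limit_drive(BlockDriverState *bs)
{
    ThrottleConfig cfg;

    memset(&cfg, 0, sizeof(cfg));
    cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024; /* bytes/s */
    cfg.buckets[THROTTLE_OPS_TOTAL].avg = 100;              /* ops/s */

    if (!bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs); /* must precede bdrv_set_io_limits() */
    }
    bdrv_set_io_limits(bs, &cfg);
}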
136 /* this function drains all the throttled I/Os */
137 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
139 bool drained = false;
140 bool enabled = bs->io_limits_enabled;
141 int i;
143 bs->io_limits_enabled = false;
145 for (i = 0; i < 2; i++) {
146 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147 drained = true;
151 bs->io_limits_enabled = enabled;
153 return drained;
156 void bdrv_io_limits_disable(BlockDriverState *bs)
158 bs->io_limits_enabled = false;
160 bdrv_start_throttled_reqs(bs);
162 throttle_destroy(&bs->throttle_state);
165 static void bdrv_throttle_read_timer_cb(void *opaque)
167 BlockDriverState *bs = opaque;
168 qemu_co_enter_next(&bs->throttled_reqs[0]);
171 static void bdrv_throttle_write_timer_cb(void *opaque)
173 BlockDriverState *bs = opaque;
174 qemu_co_enter_next(&bs->throttled_reqs[1]);
177 /* should be called before bdrv_set_io_limits if a limit is set */
178 void bdrv_io_limits_enable(BlockDriverState *bs)
180 assert(!bs->io_limits_enabled);
181 throttle_init(&bs->throttle_state,
182 bdrv_get_aio_context(bs),
183 QEMU_CLOCK_VIRTUAL,
184 bdrv_throttle_read_timer_cb,
185 bdrv_throttle_write_timer_cb,
186 bs);
187 bs->io_limits_enabled = true;
190 /* This function makes an I/O request wait if needed
192 * @bytes: the size of the I/O in bytes
193 * @is_write: whether the I/O is a write
195 static void bdrv_io_limits_intercept(BlockDriverState *bs,
196 unsigned int bytes,
197 bool is_write)
199 /* must this I/O wait? */
200 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
202 /* if it must wait, or any request of this type is already throttled, queue this I/O */
203 if (must_wait ||
204 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
205 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
208 /* the I/O will be executed; do the accounting */
209 throttle_account(&bs->throttle_state, is_write, bytes);
212 /* if the next request must wait -> do nothing */
213 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
214 return;
217 /* else queue next request for execution */
218 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
221 size_t bdrv_opt_mem_align(BlockDriverState *bs)
223 if (!bs || !bs->drv) {
224 /* 4k should be on the safe side */
225 return 4096;
228 return bs->bl.opt_mem_alignment;
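/*
 * Illustrative sketch (editor's annotation, not part of the original file):
 * using the reported alignment to allocate an I/O buffer.  qemu_memalign()
 * is QEMU's aligned allocator; such buffers are released with qemu_vfree().
 */
static void *example_alloc_io_buffer(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}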
231 /* check if the path starts with "<protocol>:" */
232 static int path_has_protocol(const char *path)
234 const char *p;
236 #ifdef _WIN32
237 if (is_windows_drive(path) ||
238 is_windows_drive_prefix(path)) {
239 return 0;
241 p = path + strcspn(path, ":/\\");
242 #else
243 p = path + strcspn(path, ":/");
244 #endif
246 return *p == ':';
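/*
 * Illustrative examples (editor's annotation, not part of the original file):
 * path_has_protocol("nbd:localhost:10809") is true, while
 * path_has_protocol("/path/to:file") is false because the '/' is found
 * before the ':'.
 */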
249 int path_is_absolute(const char *path)
251 #ifdef _WIN32
252 /* specific case for names like: "\\.\d:" */
253 if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
254 return 1;
256 return (*path == '/' || *path == '\\');
257 #else
258 return (*path == '/');
259 #endif
262 /* if filename is absolute, just copy it to dest. Otherwise, build a
263 path to it by treating it as relative to base_path. URLs are
264 supported. */
265 void path_combine(char *dest, int dest_size,
266 const char *base_path,
267 const char *filename)
269 const char *p, *p1;
270 int len;
272 if (dest_size <= 0)
273 return;
274 if (path_is_absolute(filename)) {
275 pstrcpy(dest, dest_size, filename);
276 } else {
277 p = strchr(base_path, ':');
278 if (p)
279 p++;
280 else
281 p = base_path;
282 p1 = strrchr(base_path, '/');
283 #ifdef _WIN32
285 const char *p2;
286 p2 = strrchr(base_path, '\\');
287 if (!p1 || p2 > p1)
288 p1 = p2;
290 #endif
291 if (p1)
292 p1++;
293 else
294 p1 = base_path;
295 if (p1 > p)
296 p = p1;
297 len = p - base_path;
298 if (len > dest_size - 1)
299 len = dest_size - 1;
300 memcpy(dest, base_path, len);
301 dest[len] = '\0';
302 pstrcat(dest, dest_size, filename);
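/*
 * Illustrative examples (editor's annotation, not part of the original file)
 * of path_combine() resolving a backing file name:
 *
 *   base_path = "/images/base.qcow2", filename = "backing.qcow2"
 *       -> dest = "/images/backing.qcow2"
 *   base_path = "/images/base.qcow2", filename = "/abs/backing.qcow2"
 *       -> dest = "/abs/backing.qcow2" (absolute names are copied as-is)
 */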
306 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
308 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
309 pstrcpy(dest, sz, bs->backing_file);
310 } else {
311 path_combine(dest, sz, bs->filename, bs->backing_file);
315 void bdrv_register(BlockDriver *bdrv)
317 /* Block drivers without coroutine functions need emulation */
318 if (!bdrv->bdrv_co_readv) {
319 bdrv->bdrv_co_readv = bdrv_co_readv_em;
320 bdrv->bdrv_co_writev = bdrv_co_writev_em;
322 /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
323 * the block driver lacks aio we need to emulate that too.
325 if (!bdrv->bdrv_aio_readv) {
326 /* add AIO emulation layer */
327 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
328 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
332 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
335 /* create a new block device (by default it is empty) */
336 BlockDriverState *bdrv_new(const char *device_name, Error **errp)
338 BlockDriverState *bs;
339 int i;
341 if (bdrv_find(device_name)) {
342 error_setg(errp, "Device with id '%s' already exists",
343 device_name);
344 return NULL;
346 if (bdrv_find_node(device_name)) {
347 error_setg(errp, "Device with node-name '%s' already exists",
348 device_name);
349 return NULL;
352 bs = g_malloc0(sizeof(BlockDriverState));
353 QLIST_INIT(&bs->dirty_bitmaps);
354 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
355 if (device_name[0] != '\0') {
356 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
358 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
359 QLIST_INIT(&bs->op_blockers[i]);
361 bdrv_iostatus_disable(bs);
362 notifier_list_init(&bs->close_notifiers);
363 notifier_with_return_list_init(&bs->before_write_notifiers);
364 qemu_co_queue_init(&bs->throttled_reqs[0]);
365 qemu_co_queue_init(&bs->throttled_reqs[1]);
366 bs->refcnt = 1;
367 bs->aio_context = qemu_get_aio_context();
369 return bs;
372 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
374 notifier_list_add(&bs->close_notifiers, notify);
377 BlockDriver *bdrv_find_format(const char *format_name)
379 BlockDriver *drv1;
380 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
381 if (!strcmp(drv1->format_name, format_name)) {
382 return drv1;
385 return NULL;
388 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
390 static const char *whitelist_rw[] = {
391 CONFIG_BDRV_RW_WHITELIST
393 static const char *whitelist_ro[] = {
394 CONFIG_BDRV_RO_WHITELIST
396 const char **p;
398 if (!whitelist_rw[0] && !whitelist_ro[0]) {
399 return 1; /* no whitelist, anything goes */
402 for (p = whitelist_rw; *p; p++) {
403 if (!strcmp(drv->format_name, *p)) {
404 return 1;
407 if (read_only) {
408 for (p = whitelist_ro; *p; p++) {
409 if (!strcmp(drv->format_name, *p)) {
410 return 1;
414 return 0;
417 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
418 bool read_only)
420 BlockDriver *drv = bdrv_find_format(format_name);
421 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
424 typedef struct CreateCo {
425 BlockDriver *drv;
426 char *filename;
427 QemuOpts *opts;
428 int ret;
429 Error *err;
430 } CreateCo;
432 static void coroutine_fn bdrv_create_co_entry(void *opaque)
434 Error *local_err = NULL;
435 int ret;
437 CreateCo *cco = opaque;
438 assert(cco->drv);
440 ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
441 if (local_err) {
442 error_propagate(&cco->err, local_err);
444 cco->ret = ret;
447 int bdrv_create(BlockDriver *drv, const char* filename,
448 QemuOpts *opts, Error **errp)
450 int ret;
452 Coroutine *co;
453 CreateCo cco = {
454 .drv = drv,
455 .filename = g_strdup(filename),
456 .opts = opts,
457 .ret = NOT_DONE,
458 .err = NULL,
461 if (!drv->bdrv_create) {
462 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
463 ret = -ENOTSUP;
464 goto out;
467 if (qemu_in_coroutine()) {
468 /* Fast-path if already in coroutine context */
469 bdrv_create_co_entry(&cco);
470 } else {
471 co = qemu_coroutine_create(bdrv_create_co_entry);
472 qemu_coroutine_enter(co, &cco);
473 while (cco.ret == NOT_DONE) {
474 aio_poll(qemu_get_aio_context(), true);
478 ret = cco.ret;
479 if (ret < 0) {
480 if (cco.err) {
481 error_propagate(errp, cco.err);
482 } else {
483 error_setg_errno(errp, -ret, "Could not create image");
487 out:
488 g_free(cco.filename);
489 return ret;
492 int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
494 BlockDriver *drv;
495 Error *local_err = NULL;
496 int ret;
498 drv = bdrv_find_protocol(filename, true);
499 if (drv == NULL) {
500 error_setg(errp, "Could not find protocol for file '%s'", filename);
501 return -ENOENT;
504 ret = bdrv_create(drv, filename, opts, &local_err);
505 if (local_err) {
506 error_propagate(errp, local_err);
508 return ret;
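/*
 * Illustrative sketch (editor's annotation, not part of the original file):
 * creating a 1 GiB raw image.  The QemuOpts handling mirrors
 * bdrv_append_temp_snapshot() below; treat the exact option names as
 * assumptions.
 */
static int example_create_raw_image(const char *filename, Error **errp)
{
    BlockDriver *drv = bdrv_find_format("raw");
    QemuOpts *opts;
    int ret;

    if (!drv) {
        error_setg(errp, "raw driver not available");
        return -ENOENT;
    }
    opts = qemu_opts_create(drv->create_opts, NULL, 0, &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);
    ret = bdrv_create(drv, filename, opts, errp);
    qemu_opts_del(opts);
    return ret;
}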
511 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
513 BlockDriver *drv = bs->drv;
514 Error *local_err = NULL;
516 memset(&bs->bl, 0, sizeof(bs->bl));
518 if (!drv) {
519 return;
522 /* Take some limits from the children as a default */
523 if (bs->file) {
524 bdrv_refresh_limits(bs->file, &local_err);
525 if (local_err) {
526 error_propagate(errp, local_err);
527 return;
529 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
530 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
531 } else {
532 bs->bl.opt_mem_alignment = 512;
535 if (bs->backing_hd) {
536 bdrv_refresh_limits(bs->backing_hd, &local_err);
537 if (local_err) {
538 error_propagate(errp, local_err);
539 return;
541 bs->bl.opt_transfer_length =
542 MAX(bs->bl.opt_transfer_length,
543 bs->backing_hd->bl.opt_transfer_length);
544 bs->bl.opt_mem_alignment =
545 MAX(bs->bl.opt_mem_alignment,
546 bs->backing_hd->bl.opt_mem_alignment);
549 /* Then let the driver override it */
550 if (drv->bdrv_refresh_limits) {
551 drv->bdrv_refresh_limits(bs, errp);
556 * Create a uniquely-named empty temporary file.
557 * Return 0 upon success, otherwise a negative errno value.
559 int get_tmp_filename(char *filename, int size)
561 #ifdef _WIN32
562 char temp_dir[MAX_PATH];
563 /* GetTempFileName requires that its output buffer (4th param)
564 have length MAX_PATH or greater. */
565 assert(size >= MAX_PATH);
566 return (GetTempPath(MAX_PATH, temp_dir)
567 && GetTempFileName(temp_dir, "qem", 0, filename)
568 ? 0 : -GetLastError());
569 #else
570 int fd;
571 const char *tmpdir;
572 tmpdir = getenv("TMPDIR");
573 if (!tmpdir) {
574 tmpdir = "/var/tmp";
576 if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
577 return -EOVERFLOW;
579 fd = mkstemp(filename);
580 if (fd < 0) {
581 return -errno;
583 if (close(fd) != 0) {
584 unlink(filename);
585 return -errno;
587 return 0;
588 #endif
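/*
 * Illustrative sketch (editor's annotation, not part of the original file):
 * typical use of get_tmp_filename(), following the pattern of
 * bdrv_append_temp_snapshot() below.  The caller owns the returned name
 * and is expected to unlink() and g_free() it.
 */
static int example_make_tmp_file(char **out_name, Error **errp)
{
    char *name = g_malloc0(PATH_MAX + 1);
    int ret = get_tmp_filename(name, PATH_MAX + 1);

    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        g_free(name);
        return ret;
    }
    *out_name = name;
    return 0;
}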
592 * Detect host devices. By convention, /dev/cdrom[N] is always
593 * recognized as a host CDROM.
595 static BlockDriver *find_hdev_driver(const char *filename)
597 int score_max = 0, score;
598 BlockDriver *drv = NULL, *d;
600 QLIST_FOREACH(d, &bdrv_drivers, list) {
601 if (d->bdrv_probe_device) {
602 score = d->bdrv_probe_device(filename);
603 if (score > score_max) {
604 score_max = score;
605 drv = d;
610 return drv;
613 BlockDriver *bdrv_find_protocol(const char *filename,
614 bool allow_protocol_prefix)
616 BlockDriver *drv1;
617 char protocol[128];
618 int len;
619 const char *p;
621 /* TODO Drivers without bdrv_file_open must be specified explicitly */
624 * XXX(hch): we really should not let host device detection
625 * override an explicit protocol specification, but moving this
626 * later breaks access to device names with colons in them.
627 * Thanks to the brain-dead persistent naming schemes on udev-
628 * based Linux systems those actually are quite common.
630 drv1 = find_hdev_driver(filename);
631 if (drv1) {
632 return drv1;
635 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
636 return bdrv_find_format("file");
639 p = strchr(filename, ':');
640 assert(p != NULL);
641 len = p - filename;
642 if (len > sizeof(protocol) - 1)
643 len = sizeof(protocol) - 1;
644 memcpy(protocol, filename, len);
645 protocol[len] = '\0';
646 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
647 if (drv1->protocol_name &&
648 !strcmp(drv1->protocol_name, protocol)) {
649 return drv1;
652 return NULL;
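/*
 * Illustrative examples (editor's annotation, not part of the original file)
 * of how bdrv_find_protocol() resolves filenames:
 *
 *   "nbd:localhost:10809" -> the "nbd" protocol driver (if compiled in)
 *   "/tmp/disk.img"       -> the "file" driver (no protocol prefix)
 *   "c:\disk.img" (win32) -> the "file" driver (drive letter, not a protocol)
 */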
655 static int find_image_format(BlockDriverState *bs, const char *filename,
656 BlockDriver **pdrv, Error **errp)
658 int score, score_max;
659 BlockDriver *drv1, *drv;
660 uint8_t buf[2048];
661 int ret = 0;
663 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
664 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
665 drv = bdrv_find_format("raw");
666 if (!drv) {
667 error_setg(errp, "Could not find raw image format");
668 ret = -ENOENT;
670 *pdrv = drv;
671 return ret;
674 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
675 if (ret < 0) {
676 error_setg_errno(errp, -ret, "Could not read image for determining its "
677 "format");
678 *pdrv = NULL;
679 return ret;
682 score_max = 0;
683 drv = NULL;
684 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
685 if (drv1->bdrv_probe) {
686 score = drv1->bdrv_probe(buf, ret, filename);
687 if (score > score_max) {
688 score_max = score;
689 drv = drv1;
693 if (!drv) {
694 error_setg(errp, "Could not determine image format: No compatible "
695 "driver found");
696 ret = -ENOENT;
698 *pdrv = drv;
699 return ret;
703 * Set the current 'total_sectors' value
704 * Return 0 on success, -errno on error.
706 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
708 BlockDriver *drv = bs->drv;
710 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
711 if (bs->sg)
712 return 0;
714 /* query actual device if possible, otherwise just trust the hint */
715 if (drv->bdrv_getlength) {
716 int64_t length = drv->bdrv_getlength(bs);
717 if (length < 0) {
718 return length;
720 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
723 bs->total_sectors = hint;
724 return 0;
728 * Set open flags for a given discard mode
730 * Return 0 on success, -1 if the discard mode was invalid.
732 int bdrv_parse_discard_flags(const char *mode, int *flags)
734 *flags &= ~BDRV_O_UNMAP;
736 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
737 /* do nothing */
738 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
739 *flags |= BDRV_O_UNMAP;
740 } else {
741 return -1;
744 return 0;
748 * Set open flags for a given cache mode
750 * Return 0 on success, -1 if the cache mode was invalid.
752 int bdrv_parse_cache_flags(const char *mode, int *flags)
754 *flags &= ~BDRV_O_CACHE_MASK;
756 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
757 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
758 } else if (!strcmp(mode, "directsync")) {
759 *flags |= BDRV_O_NOCACHE;
760 } else if (!strcmp(mode, "writeback")) {
761 *flags |= BDRV_O_CACHE_WB;
762 } else if (!strcmp(mode, "unsafe")) {
763 *flags |= BDRV_O_CACHE_WB;
764 *flags |= BDRV_O_NO_FLUSH;
765 } else if (!strcmp(mode, "writethrough")) {
766 /* this is the default */
767 } else {
768 return -1;
771 return 0;
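/*
 * Illustrative sketch (editor's annotation, not part of the original file):
 * translating command-line "cache" and "discard" mode strings into open
 * flags for bdrv_open(), using the two parsers above.
 */
static int example_parse_open_flags(const char *cache, const char *discard,
                                    int *flags, Error **errp)
{
    if (bdrv_parse_cache_flags(cache, flags) < 0) {
        error_setg(errp, "Invalid cache option: %s", cache);
        return -EINVAL;
    }
    if (bdrv_parse_discard_flags(discard, flags) < 0) {
        error_setg(errp, "Invalid discard option: %s", discard);
        return -EINVAL;
    }
    return 0;
}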
775 * The copy-on-read flag is actually a reference count so multiple users may
776 * use the feature without worrying about clobbering its previous state.
777 * Copy-on-read stays enabled until all users have called to disable it.
779 void bdrv_enable_copy_on_read(BlockDriverState *bs)
781 bs->copy_on_read++;
784 void bdrv_disable_copy_on_read(BlockDriverState *bs)
786 assert(bs->copy_on_read > 0);
787 bs->copy_on_read--;
791 * Returns the flags that a temporary snapshot should get, based on the
792 * originally requested flags (the originally requested image will have flags
793 * like a backing file)
795 static int bdrv_temp_snapshot_flags(int flags)
797 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
801 * Returns the flags that bs->file should get, based on the given flags for
802 * the parent BDS
804 static int bdrv_inherited_flags(int flags)
806 /* Enable protocol handling, disable format probing for bs->file */
807 flags |= BDRV_O_PROTOCOL;
809 /* Our block drivers take care to send flushes and respect unmap policy,
810 * so we can enable both unconditionally on lower layers. */
811 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
813 /* Clear flags that only apply to the top layer */
814 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
816 return flags;
820 * Returns the flags that bs->backing_hd should get, based on the given flags
821 * for the parent BDS
823 static int bdrv_backing_flags(int flags)
825 /* backing files always opened read-only */
826 flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
828 /* snapshot=on is handled on the top layer */
829 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
831 return flags;
834 static int bdrv_open_flags(BlockDriverState *bs, int flags)
836 int open_flags = flags | BDRV_O_CACHE_WB;
839 * Clear flags that are internal to the block layer before opening the
840 * image.
842 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
845 * Snapshots should be writable.
847 if (flags & BDRV_O_TEMPORARY) {
848 open_flags |= BDRV_O_RDWR;
851 return open_flags;
854 static void bdrv_assign_node_name(BlockDriverState *bs,
855 const char *node_name,
856 Error **errp)
858 if (!node_name) {
859 return;
862 /* empty string node name is invalid */
863 if (node_name[0] == '\0') {
864 error_setg(errp, "Empty node name");
865 return;
868 /* takes care of avoiding namespace collisions */
869 if (bdrv_find(node_name)) {
870 error_setg(errp, "node-name=%s is conflicting with a device id",
871 node_name);
872 return;
875 /* takes care of avoiding duplicate node names */
876 if (bdrv_find_node(node_name)) {
877 error_setg(errp, "Duplicate node name");
878 return;
881 /* copy node name into the bs and insert it into the graph list */
882 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
883 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
887 * Common part for opening disk images and files
889 * Removes all processed options from *options.
891 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
892 QDict *options, int flags, BlockDriver *drv, Error **errp)
894 int ret, open_flags;
895 const char *filename;
896 const char *node_name = NULL;
897 Error *local_err = NULL;
899 assert(drv != NULL);
900 assert(bs->file == NULL);
901 assert(options != NULL && bs->options != options);
903 if (file != NULL) {
904 filename = file->filename;
905 } else {
906 filename = qdict_get_try_str(options, "filename");
909 if (drv->bdrv_needs_filename && !filename) {
910 error_setg(errp, "The '%s' block driver requires a file name",
911 drv->format_name);
912 return -EINVAL;
915 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
917 node_name = qdict_get_try_str(options, "node-name");
918 bdrv_assign_node_name(bs, node_name, &local_err);
919 if (local_err) {
920 error_propagate(errp, local_err);
921 return -EINVAL;
923 qdict_del(options, "node-name");
925 /* bdrv_open() was called with a protocol driver directly as drv. This layer
926 * is already opened, so assign it to bs (while file becomes a closed
927 * BlockDriverState) and return immediately. */
928 if (file != NULL && drv->bdrv_file_open) {
929 bdrv_swap(file, bs);
930 return 0;
933 bs->open_flags = flags;
934 bs->guest_block_size = 512;
935 bs->request_alignment = 512;
936 bs->zero_beyond_eof = true;
937 open_flags = bdrv_open_flags(bs, flags);
938 bs->read_only = !(open_flags & BDRV_O_RDWR);
939 bs->growable = !!(flags & BDRV_O_PROTOCOL);
941 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
942 error_setg(errp,
943 !bs->read_only && bdrv_is_whitelisted(drv, true)
944 ? "Driver '%s' can only be used for read-only devices"
945 : "Driver '%s' is not whitelisted",
946 drv->format_name);
947 return -ENOTSUP;
950 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
951 if (flags & BDRV_O_COPY_ON_READ) {
952 if (!bs->read_only) {
953 bdrv_enable_copy_on_read(bs);
954 } else {
955 error_setg(errp, "Can't use copy-on-read on read-only device");
956 return -EINVAL;
960 if (filename != NULL) {
961 pstrcpy(bs->filename, sizeof(bs->filename), filename);
962 } else {
963 bs->filename[0] = '\0';
966 bs->drv = drv;
967 bs->opaque = g_malloc0(drv->instance_size);
969 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
971 /* Open the image, either directly or using a protocol */
972 if (drv->bdrv_file_open) {
973 assert(file == NULL);
974 assert(!drv->bdrv_needs_filename || filename != NULL);
975 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
976 } else {
977 if (file == NULL) {
978 error_setg(errp, "Can't use '%s' as a block driver for the "
979 "protocol level", drv->format_name);
980 ret = -EINVAL;
981 goto free_and_fail;
983 bs->file = file;
984 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
987 if (ret < 0) {
988 if (local_err) {
989 error_propagate(errp, local_err);
990 } else if (bs->filename[0]) {
991 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
992 } else {
993 error_setg_errno(errp, -ret, "Could not open image");
995 goto free_and_fail;
998 ret = refresh_total_sectors(bs, bs->total_sectors);
999 if (ret < 0) {
1000 error_setg_errno(errp, -ret, "Could not refresh total sector count");
1001 goto free_and_fail;
1004 bdrv_refresh_limits(bs, &local_err);
1005 if (local_err) {
1006 error_propagate(errp, local_err);
1007 ret = -EINVAL;
1008 goto free_and_fail;
1011 assert(bdrv_opt_mem_align(bs) != 0);
1012 assert((bs->request_alignment != 0) || bs->sg);
1013 return 0;
1015 free_and_fail:
1016 bs->file = NULL;
1017 g_free(bs->opaque);
1018 bs->opaque = NULL;
1019 bs->drv = NULL;
1020 return ret;
1023 static QDict *parse_json_filename(const char *filename, Error **errp)
1025 QObject *options_obj;
1026 QDict *options;
1027 int ret;
1029 ret = strstart(filename, "json:", &filename);
1030 assert(ret);
1032 options_obj = qobject_from_json(filename);
1033 if (!options_obj) {
1034 error_setg(errp, "Could not parse the JSON options");
1035 return NULL;
1038 if (qobject_type(options_obj) != QTYPE_QDICT) {
1039 qobject_decref(options_obj);
1040 error_setg(errp, "Invalid JSON object given");
1041 return NULL;
1044 options = qobject_to_qdict(options_obj);
1045 qdict_flatten(options);
1047 return options;
1051 * Fills in default options for opening images and converts the legacy
1052 * filename/flags pair to option QDict entries.
1054 static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
1055 BlockDriver *drv, Error **errp)
1057 const char *filename = *pfilename;
1058 const char *drvname;
1059 bool protocol = flags & BDRV_O_PROTOCOL;
1060 bool parse_filename = false;
1061 Error *local_err = NULL;
1063 /* Parse json: pseudo-protocol */
1064 if (filename && g_str_has_prefix(filename, "json:")) {
1065 QDict *json_options = parse_json_filename(filename, &local_err);
1066 if (local_err) {
1067 error_propagate(errp, local_err);
1068 return -EINVAL;
1071 /* Options given in the filename have lower priority than options
1072 * specified directly */
1073 qdict_join(*options, json_options, false);
1074 QDECREF(json_options);
1075 *pfilename = filename = NULL;
1078 /* Fetch the file name from the options QDict if necessary */
1079 if (protocol && filename) {
1080 if (!qdict_haskey(*options, "filename")) {
1081 qdict_put(*options, "filename", qstring_from_str(filename));
1082 parse_filename = true;
1083 } else {
1084 error_setg(errp, "Can't specify 'file' and 'filename' options at "
1085 "the same time");
1086 return -EINVAL;
1090 /* Find the right block driver */
1091 filename = qdict_get_try_str(*options, "filename");
1092 drvname = qdict_get_try_str(*options, "driver");
1094 if (drv) {
1095 if (drvname) {
1096 error_setg(errp, "Driver specified twice");
1097 return -EINVAL;
1099 drvname = drv->format_name;
1100 qdict_put(*options, "driver", qstring_from_str(drvname));
1101 } else {
1102 if (!drvname && protocol) {
1103 if (filename) {
1104 drv = bdrv_find_protocol(filename, parse_filename);
1105 if (!drv) {
1106 error_setg(errp, "Unknown protocol");
1107 return -EINVAL;
1110 drvname = drv->format_name;
1111 qdict_put(*options, "driver", qstring_from_str(drvname));
1112 } else {
1113 error_setg(errp, "Must specify either driver or file");
1114 return -EINVAL;
1116 } else if (drvname) {
1117 drv = bdrv_find_format(drvname);
1118 if (!drv) {
1119 error_setg(errp, "Unknown driver '%s'", drvname);
1120 return -ENOENT;
1125 assert(drv || !protocol);
1127 /* Driver-specific filename parsing */
1128 if (drv && drv->bdrv_parse_filename && parse_filename) {
1129 drv->bdrv_parse_filename(filename, *options, &local_err);
1130 if (local_err) {
1131 error_propagate(errp, local_err);
1132 return -EINVAL;
1135 if (!drv->bdrv_needs_filename) {
1136 qdict_del(*options, "filename");
1140 return 0;
1143 void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1146 if (bs->backing_hd) {
1147 assert(bs->backing_blocker);
1148 bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1149 } else if (backing_hd) {
1150 error_setg(&bs->backing_blocker,
1151 "device is used as backing hd of '%s'",
1152 bs->device_name);
1155 bs->backing_hd = backing_hd;
1156 if (!backing_hd) {
1157 error_free(bs->backing_blocker);
1158 bs->backing_blocker = NULL;
1159 goto out;
1161 bs->open_flags &= ~BDRV_O_NO_BACKING;
1162 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1163 pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1164 backing_hd->drv ? backing_hd->drv->format_name : "");
1166 bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1167 /* Otherwise we won't be able to commit due to check in bdrv_commit */
1168 bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
1169 bs->backing_blocker);
1170 out:
1171 bdrv_refresh_limits(bs, NULL);
1175 * Opens the backing file for a BlockDriverState if not yet open
1177 * options is a QDict of options to pass to the block drivers, or NULL for an
1178 * empty set of options. The reference to the QDict is transferred to this
1179 * function (even on failure), so if the caller intends to reuse the dictionary,
1180 * it needs to use QINCREF() before calling this function.
1182 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1184 char *backing_filename = g_malloc0(PATH_MAX);
1185 int ret = 0;
1186 BlockDriver *back_drv = NULL;
1187 BlockDriverState *backing_hd;
1188 Error *local_err = NULL;
1190 if (bs->backing_hd != NULL) {
1191 QDECREF(options);
1192 goto free_exit;
1195 /* NULL means an empty set of options */
1196 if (options == NULL) {
1197 options = qdict_new();
1200 bs->open_flags &= ~BDRV_O_NO_BACKING;
1201 if (qdict_haskey(options, "file.filename")) {
1202 backing_filename[0] = '\0';
1203 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1204 QDECREF(options);
1205 goto free_exit;
1206 } else {
1207 bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
1210 if (!bs->drv || !bs->drv->supports_backing) {
1211 ret = -EINVAL;
1212 error_setg(errp, "Driver doesn't support backing files");
1213 QDECREF(options);
1214 goto free_exit;
1217 backing_hd = bdrv_new("", errp);
1219 if (bs->backing_format[0] != '\0') {
1220 back_drv = bdrv_find_format(bs->backing_format);
1223 assert(bs->backing_hd == NULL);
1224 ret = bdrv_open(&backing_hd,
1225 *backing_filename ? backing_filename : NULL, NULL, options,
1226 bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
1227 if (ret < 0) {
1228 bdrv_unref(backing_hd);
1229 backing_hd = NULL;
1230 bs->open_flags |= BDRV_O_NO_BACKING;
1231 error_setg(errp, "Could not open backing file: %s",
1232 error_get_pretty(local_err));
1233 error_free(local_err);
1234 goto free_exit;
1236 bdrv_set_backing_hd(bs, backing_hd);
1238 free_exit:
1239 g_free(backing_filename);
1240 return ret;
1244 * Opens a disk image whose options are given as BlockdevRef in another block
1245 * device's options.
1247 * If allow_none is true, no image will be opened if filename is NULL and no
1248 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1250 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
1251 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1252 * itself, all options starting with "${bdref_key}." are considered part of the
1253 * BlockdevRef.
1255 * The BlockdevRef will be removed from the options QDict.
1257 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
1259 int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1260 QDict *options, const char *bdref_key, int flags,
1261 bool allow_none, Error **errp)
1263 QDict *image_options;
1264 int ret;
1265 char *bdref_key_dot;
1266 const char *reference;
1268 assert(pbs);
1269 assert(*pbs == NULL);
1271 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1272 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1273 g_free(bdref_key_dot);
1275 reference = qdict_get_try_str(options, bdref_key);
1276 if (!filename && !reference && !qdict_size(image_options)) {
1277 if (allow_none) {
1278 ret = 0;
1279 } else {
1280 error_setg(errp, "A block device must be specified for \"%s\"",
1281 bdref_key);
1282 ret = -EINVAL;
1284 QDECREF(image_options);
1285 goto done;
1288 ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
1290 done:
1291 qdict_del(options, bdref_key);
1292 return ret;
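/*
 * Illustrative sketch (editor's annotation, not part of the original file):
 * a format driver opening its protocol layer through bdrv_open_image().
 * Given options like { "file.driver": "file", "file.filename": "/tmp/a.img" },
 * every "file."-prefixed key is extracted and used to open the child.
 */
static int example_open_file_child(BlockDriverState **file, QDict *options,
                                   int flags, Error **errp)
{
    return bdrv_open_image(file, NULL /* no legacy filename */, options,
                           "file", bdrv_inherited_flags(flags),
                           false /* the child is required */, errp);
}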
1295 int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
1297 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1298 char *tmp_filename = g_malloc0(PATH_MAX + 1);
1299 int64_t total_size;
1300 BlockDriver *bdrv_qcow2;
1301 QemuOpts *opts = NULL;
1302 QDict *snapshot_options;
1303 BlockDriverState *bs_snapshot;
1304 Error *local_err;
1305 int ret;
1307 /* if snapshot, we create a temporary backing file and open it
1308 instead of opening 'filename' directly */
1310 /* Get the required size from the image */
1311 total_size = bdrv_getlength(bs);
1312 if (total_size < 0) {
1313 ret = total_size;
1314 error_setg_errno(errp, -total_size, "Could not get image size");
1315 goto out;
1317 total_size &= BDRV_SECTOR_MASK;
1319 /* Create the temporary image */
1320 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
1321 if (ret < 0) {
1322 error_setg_errno(errp, -ret, "Could not get temporary filename");
1323 goto out;
1326 bdrv_qcow2 = bdrv_find_format("qcow2");
1327 opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
1328 &error_abort);
1329 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
1330 ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
1331 qemu_opts_del(opts);
1332 if (ret < 0) {
1333 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1334 "'%s': %s", tmp_filename,
1335 error_get_pretty(local_err));
1336 error_free(local_err);
1337 goto out;
1340 /* Prepare a new options QDict for the temporary file */
1341 snapshot_options = qdict_new();
1342 qdict_put(snapshot_options, "file.driver",
1343 qstring_from_str("file"));
1344 qdict_put(snapshot_options, "file.filename",
1345 qstring_from_str(tmp_filename));
1347 bs_snapshot = bdrv_new("", &error_abort);
1349 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
1350 flags, bdrv_qcow2, &local_err);
1351 if (ret < 0) {
1352 error_propagate(errp, local_err);
1353 goto out;
1356 bdrv_append(bs_snapshot, bs);
1358 out:
1359 g_free(tmp_filename);
1360 return ret;
1364 * Opens a disk image (raw, qcow2, vmdk, ...)
1366 * options is a QDict of options to pass to the block drivers, or NULL for an
1367 * empty set of options. The reference to the QDict belongs to the block layer
1368 * after the call (even on failure), so if the caller intends to reuse the
1369 * dictionary, it needs to use QINCREF() before calling bdrv_open.
1371 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1372 * If it is not NULL, the referenced BDS will be reused.
1374 * The reference parameter may be used to specify an existing block device which
1375 * should be opened. If specified, neither options nor a filename may be given,
1376 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
1378 int bdrv_open(BlockDriverState **pbs, const char *filename,
1379 const char *reference, QDict *options, int flags,
1380 BlockDriver *drv, Error **errp)
1382 int ret;
1383 BlockDriverState *file = NULL, *bs;
1384 const char *drvname;
1385 Error *local_err = NULL;
1386 int snapshot_flags = 0;
1388 assert(pbs);
1390 if (reference) {
1391 bool options_non_empty = options ? qdict_size(options) : false;
1392 QDECREF(options);
1394 if (*pbs) {
1395 error_setg(errp, "Cannot reuse an existing BDS when referencing "
1396 "another block device");
1397 return -EINVAL;
1400 if (filename || options_non_empty) {
1401 error_setg(errp, "Cannot reference an existing block device with "
1402 "additional options or a new filename");
1403 return -EINVAL;
1406 bs = bdrv_lookup_bs(reference, reference, errp);
1407 if (!bs) {
1408 return -ENODEV;
1410 bdrv_ref(bs);
1411 *pbs = bs;
1412 return 0;
1415 if (*pbs) {
1416 bs = *pbs;
1417 } else {
1418 bs = bdrv_new("", &error_abort);
1421 /* NULL means an empty set of options */
1422 if (options == NULL) {
1423 options = qdict_new();
1426 ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
1427 if (local_err) {
1428 goto fail;
1431 /* Find the right image format driver */
1432 drv = NULL;
1433 drvname = qdict_get_try_str(options, "driver");
1434 if (drvname) {
1435 drv = bdrv_find_format(drvname);
1436 qdict_del(options, "driver");
1437 if (!drv) {
1438 error_setg(errp, "Unknown driver: '%s'", drvname);
1439 ret = -EINVAL;
1440 goto fail;
1444 assert(drvname || !(flags & BDRV_O_PROTOCOL));
1445 if (drv && !drv->bdrv_file_open) {
1446 /* If the user explicitly wants a format driver here, we'll need to add
1447 * another layer for the protocol in bs->file */
1448 flags &= ~BDRV_O_PROTOCOL;
1451 bs->options = options;
1452 options = qdict_clone_shallow(options);
1454 /* Open image file without format layer */
1455 if ((flags & BDRV_O_PROTOCOL) == 0) {
1456 if (flags & BDRV_O_RDWR) {
1457 flags |= BDRV_O_ALLOW_RDWR;
1459 if (flags & BDRV_O_SNAPSHOT) {
1460 snapshot_flags = bdrv_temp_snapshot_flags(flags);
1461 flags = bdrv_backing_flags(flags);
1464 assert(file == NULL);
1465 ret = bdrv_open_image(&file, filename, options, "file",
1466 bdrv_inherited_flags(flags),
1467 true, &local_err);
1468 if (ret < 0) {
1469 goto fail;
1473 /* Image format probing */
1474 if (!drv && file) {
1475 ret = find_image_format(file, filename, &drv, &local_err);
1476 if (ret < 0) {
1477 goto fail;
1479 } else if (!drv) {
1480 error_setg(errp, "Must specify either driver or file");
1481 ret = -EINVAL;
1482 goto fail;
1485 /* Open the image */
1486 ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1487 if (ret < 0) {
1488 goto fail;
1491 if (file && (bs->file != file)) {
1492 bdrv_unref(file);
1493 file = NULL;
1496 /* If there is a backing file, use it */
1497 if ((flags & BDRV_O_NO_BACKING) == 0) {
1498 QDict *backing_options;
1500 qdict_extract_subqdict(options, &backing_options, "backing.");
1501 ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1502 if (ret < 0) {
1503 goto close_and_fail;
1507 /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1508 * temporary snapshot afterwards. */
1509 if (snapshot_flags) {
1510 ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
1511 if (local_err) {
1512 goto close_and_fail;
1516 /* Check if any unknown options were used */
1517 if (options && (qdict_size(options) != 0)) {
1518 const QDictEntry *entry = qdict_first(options);
1519 if (flags & BDRV_O_PROTOCOL) {
1520 error_setg(errp, "Block protocol '%s' doesn't support the option "
1521 "'%s'", drv->format_name, entry->key);
1522 } else {
1523 error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1524 "support the option '%s'", drv->format_name,
1525 bs->device_name, entry->key);
1528 ret = -EINVAL;
1529 goto close_and_fail;
1532 if (!bdrv_key_required(bs)) {
1533 bdrv_dev_change_media_cb(bs, true);
1534 } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1535 && !runstate_check(RUN_STATE_INMIGRATE)
1536 && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1537 error_setg(errp,
1538 "Guest must be stopped for opening of encrypted image");
1539 ret = -EBUSY;
1540 goto close_and_fail;
1543 QDECREF(options);
1544 *pbs = bs;
1545 return 0;
1547 fail:
1548 if (file != NULL) {
1549 bdrv_unref(file);
1551 QDECREF(bs->options);
1552 QDECREF(options);
1553 bs->options = NULL;
1554 if (!*pbs) {
1555 /* If *pbs is NULL, a new BDS has been created in this function and
1556 needs to be freed now. Otherwise, it does not need to be closed,
1557 since it has not really been opened yet. */
1558 bdrv_unref(bs);
1560 if (local_err) {
1561 error_propagate(errp, local_err);
1563 return ret;
1565 close_and_fail:
1566 /* See fail path, but now the BDS has to be always closed */
1567 if (*pbs) {
1568 bdrv_close(bs);
1569 } else {
1570 bdrv_unref(bs);
1572 QDECREF(options);
1573 if (local_err) {
1574 error_propagate(errp, local_err);
1576 return ret;
1579 typedef struct BlockReopenQueueEntry {
1580 bool prepared;
1581 BDRVReopenState state;
1582 QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1583 } BlockReopenQueueEntry;
1586 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1587 * reopen of multiple devices.
1589 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
1590 * has already been performed, or alternatively may be NULL, in which case a
1591 * new BlockReopenQueue will be created and initialized. It should then be
1592 * passed back in for subsequent calls that are intended to be of the same
1593 * atomic 'set'.
1595 * bs is the BlockDriverState to add to the reopen queue.
1597 * flags contains the open flags for the associated bs
1599 * returns a pointer to bs_queue, which is either the newly allocated
1600 * bs_queue, or the existing bs_queue being used.
1603 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1604 BlockDriverState *bs, int flags)
1606 assert(bs != NULL);
1608 BlockReopenQueueEntry *bs_entry;
1609 if (bs_queue == NULL) {
1610 bs_queue = g_new0(BlockReopenQueue, 1);
1611 QSIMPLEQ_INIT(bs_queue);
1614 /* bdrv_open() masks this flag out */
1615 flags &= ~BDRV_O_PROTOCOL;
1617 if (bs->file) {
1618 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
1621 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1622 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1624 bs_entry->state.bs = bs;
1625 bs_entry->state.flags = flags;
1627 return bs_queue;
1631 * Reopen multiple BlockDriverStates atomically & transactionally.
1633 * The queue passed in (bs_queue) must have been built up previously
1634 * via bdrv_reopen_queue().
1636 * Reopens all BDS specified in the queue, with the appropriate
1637 * flags. All devices are prepared for reopen, and failure of any
1638 * device will cause all device changes to be abandoned, and intermediate
1639 * data cleaned up.
1641 * If all devices prepare successfully, then the changes are committed
1642 * to all devices.
1645 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1647 int ret = -1;
1648 BlockReopenQueueEntry *bs_entry, *next;
1649 Error *local_err = NULL;
1651 assert(bs_queue != NULL);
1653 bdrv_drain_all();
1655 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1656 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1657 error_propagate(errp, local_err);
1658 goto cleanup;
1660 bs_entry->prepared = true;
1663 /* If we reach this point, we have success and just need to apply the
1664 * changes
1666 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1667 bdrv_reopen_commit(&bs_entry->state);
1670 ret = 0;
1672 cleanup:
1673 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1674 if (ret && bs_entry->prepared) {
1675 bdrv_reopen_abort(&bs_entry->state);
1677 g_free(bs_entry);
1679 g_free(bs_queue);
1680 return ret;
1684 /* Reopen a single BlockDriverState with the specified flags. */
1685 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1687 int ret = -1;
1688 Error *local_err = NULL;
1689 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1691 ret = bdrv_reopen_multiple(queue, &local_err);
1692 if (local_err != NULL) {
1693 error_propagate(errp, local_err);
1695 return ret;
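/*
 * Illustrative sketch (editor's annotation, not part of the original file):
 * reopening two devices read-only in a single atomic transaction.  Either
 * both switch or neither does, per bdrv_reopen_multiple() above, which
 * also frees the queue.
 */
static int example_reopen_two_readonly(BlockDriverState *a,
                                       BlockDriverState *b, Error **errp)
{
    BlockReopenQueue *queue;

    queue = bdrv_reopen_queue(NULL, a, a->open_flags & ~BDRV_O_RDWR);
    queue = bdrv_reopen_queue(queue, b, b->open_flags & ~BDRV_O_RDWR);
    return bdrv_reopen_multiple(queue, errp);
}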
1700 * Prepares a BlockDriverState for reopen. All changes are staged in the
1701 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1702 * the block driver's .bdrv_reopen_prepare() callback
1704 * bs is the BlockDriverState to reopen
1705 * flags are the new open flags
1706 * queue is the reopen queue
1708 * Returns 0 on success, non-zero on error. On error errp will be set
1709 * as well.
1711 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1712 * It is the responsibility of the caller to then call the abort() or
1713 * commit() for any other BDS that have been left in a prepare() state
1716 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1717 Error **errp)
1719 int ret = -1;
1720 Error *local_err = NULL;
1721 BlockDriver *drv;
1723 assert(reopen_state != NULL);
1724 assert(reopen_state->bs->drv != NULL);
1725 drv = reopen_state->bs->drv;
1727 /* if we are to stay read-only, do not allow permission change
1728 * to r/w */
1729 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1730 reopen_state->flags & BDRV_O_RDWR) {
1731 error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1732 reopen_state->bs->device_name);
1733 goto error;
1737 ret = bdrv_flush(reopen_state->bs);
1738 if (ret) {
1739 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1740 strerror(-ret));
1741 goto error;
1744 if (drv->bdrv_reopen_prepare) {
1745 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1746 if (ret) {
1747 if (local_err != NULL) {
1748 error_propagate(errp, local_err);
1749 } else {
1750 error_setg(errp, "failed while preparing to reopen image '%s'",
1751 reopen_state->bs->filename);
1753 goto error;
1755 } else {
1756 /* It is currently mandatory to have a bdrv_reopen_prepare()
1757 * handler for each supported drv. */
1758 error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1759 drv->format_name, reopen_state->bs->device_name,
1760 "reopening of file");
1761 ret = -1;
1762 goto error;
1765 ret = 0;
1767 error:
1768 return ret;
1772 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1773 * makes them final by swapping the staging BlockDriverState contents into
1774 * the active BlockDriverState contents.
1776 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1778 BlockDriver *drv;
1780 assert(reopen_state != NULL);
1781 drv = reopen_state->bs->drv;
1782 assert(drv != NULL);
1784 /* If there are any driver level actions to take */
1785 if (drv->bdrv_reopen_commit) {
1786 drv->bdrv_reopen_commit(reopen_state);
1789 /* set BDS specific flags now */
1790 reopen_state->bs->open_flags = reopen_state->flags;
1791 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1792 BDRV_O_CACHE_WB);
1793 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1795 bdrv_refresh_limits(reopen_state->bs, NULL);
1799 * Abort the reopen, and delete and free the staged changes in
1800 * reopen_state
1802 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1804 BlockDriver *drv;
1806 assert(reopen_state != NULL);
1807 drv = reopen_state->bs->drv;
1808 assert(drv != NULL);
1810 if (drv->bdrv_reopen_abort) {
1811 drv->bdrv_reopen_abort(reopen_state);
1816 void bdrv_close(BlockDriverState *bs)
1818 if (bs->job) {
1819 block_job_cancel_sync(bs->job);
1821 bdrv_drain_all(); /* complete I/O */
1822 bdrv_flush(bs);
1823 bdrv_drain_all(); /* in case flush left pending I/O */
1824 notifier_list_notify(&bs->close_notifiers, bs);
1826 if (bs->drv) {
1827 if (bs->backing_hd) {
1828 BlockDriverState *backing_hd = bs->backing_hd;
1829 bdrv_set_backing_hd(bs, NULL);
1830 bdrv_unref(backing_hd);
1832 bs->drv->bdrv_close(bs);
1833 g_free(bs->opaque);
1834 bs->opaque = NULL;
1835 bs->drv = NULL;
1836 bs->copy_on_read = 0;
1837 bs->backing_file[0] = '\0';
1838 bs->backing_format[0] = '\0';
1839 bs->total_sectors = 0;
1840 bs->encrypted = 0;
1841 bs->valid_key = 0;
1842 bs->sg = 0;
1843 bs->growable = 0;
1844 bs->zero_beyond_eof = false;
1845 QDECREF(bs->options);
1846 bs->options = NULL;
1848 if (bs->file != NULL) {
1849 bdrv_unref(bs->file);
1850 bs->file = NULL;
1854 bdrv_dev_change_media_cb(bs, false);
1856 /* throttling disk I/O limits */
1857 if (bs->io_limits_enabled) {
1858 bdrv_io_limits_disable(bs);
1862 void bdrv_close_all(void)
1864 BlockDriverState *bs;
1866 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1867 AioContext *aio_context = bdrv_get_aio_context(bs);
1869 aio_context_acquire(aio_context);
1870 bdrv_close(bs);
1871 aio_context_release(aio_context);
1875 /* Check if any requests are in-flight (including throttled requests) */
1876 static bool bdrv_requests_pending(BlockDriverState *bs)
1878 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1879 return true;
1881 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1882 return true;
1884 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1885 return true;
1887 if (bs->file && bdrv_requests_pending(bs->file)) {
1888 return true;
1890 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1891 return true;
1893 return false;
1897 * Wait for pending requests to complete across all BlockDriverStates
1899 * This function does not flush data to disk, use bdrv_flush_all() for that
1900 * after calling this function.
1902 * Note that completion of an asynchronous I/O operation can trigger any
1903 * number of other I/O operations on other devices---for example a coroutine
1904 * can be arbitrarily complex and a constant flow of I/O can come until the
1905 * coroutine is complete. Because of this, it is not possible to have a
1906 * function to drain a single device's I/O queue.
1908 void bdrv_drain_all(void)
1910 /* Always run first iteration so any pending completion BHs run */
1911 bool busy = true;
1912 BlockDriverState *bs;
1914 while (busy) {
1915 busy = false;
1917 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1918 AioContext *aio_context = bdrv_get_aio_context(bs);
1919 bool bs_busy;
1921 aio_context_acquire(aio_context);
1922 bdrv_flush_io_queue(bs);
1923 bdrv_start_throttled_reqs(bs);
1924 bs_busy = bdrv_requests_pending(bs);
1925 bs_busy |= aio_poll(aio_context, bs_busy);
1926 aio_context_release(aio_context);
1928 busy |= bs_busy;
1933 /* make a BlockDriverState anonymous by removing it from the bdrv_states and
1934 * graph_bdrv_states lists.
1935 Also, NUL-terminate the device_name to prevent a double remove */
1936 void bdrv_make_anon(BlockDriverState *bs)
1938 if (bs->device_name[0] != '\0') {
1939 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1941 bs->device_name[0] = '\0';
1942 if (bs->node_name[0] != '\0') {
1943 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1945 bs->node_name[0] = '\0';
1948 static void bdrv_rebind(BlockDriverState *bs)
1950 if (bs->drv && bs->drv->bdrv_rebind) {
1951 bs->drv->bdrv_rebind(bs);
1955 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1956 BlockDriverState *bs_src)
1958 /* move some fields that need to stay attached to the device */
1960 /* dev info */
1961 bs_dest->dev_ops = bs_src->dev_ops;
1962 bs_dest->dev_opaque = bs_src->dev_opaque;
1963 bs_dest->dev = bs_src->dev;
1964 bs_dest->guest_block_size = bs_src->guest_block_size;
1965 bs_dest->copy_on_read = bs_src->copy_on_read;
1967 bs_dest->enable_write_cache = bs_src->enable_write_cache;
1969 /* i/o throttled req */
1970 memcpy(&bs_dest->throttle_state,
1971 &bs_src->throttle_state,
1972 sizeof(ThrottleState));
1973 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
1974 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
1975 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
1977 /* r/w error */
1978 bs_dest->on_read_error = bs_src->on_read_error;
1979 bs_dest->on_write_error = bs_src->on_write_error;
1981 /* i/o status */
1982 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
1983 bs_dest->iostatus = bs_src->iostatus;
1985 /* dirty bitmap */
1986 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
1988 /* reference count */
1989 bs_dest->refcnt = bs_src->refcnt;
1991 /* job */
1992 bs_dest->job = bs_src->job;
1994 /* keep the same entry in bdrv_states */
1995 pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1996 bs_src->device_name);
1997 bs_dest->device_list = bs_src->device_list;
1998 memcpy(bs_dest->op_blockers, bs_src->op_blockers,
1999 sizeof(bs_dest->op_blockers));
2003 * Swap bs contents for two image chains while they are live,
2004 * while keeping required fields on the BlockDriverState that is
2005 * actually attached to a device.
2007 * This will modify the BlockDriverState fields, and swap contents
2008 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2010 * bs_new is required to be anonymous.
2012 * This function does not create any image files.
2014 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2016 BlockDriverState tmp;
2018 /* The code needs to swap the node_name but simply swapping node_list won't
2019 * work so first remove the nodes from the graph list, do the swap then
2020 * insert them back if needed.
2022 if (bs_new->node_name[0] != '\0') {
2023 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2025 if (bs_old->node_name[0] != '\0') {
2026 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2029 /* bs_new must be anonymous and shouldn't have anything fancy enabled */
2030 assert(bs_new->device_name[0] == '\0');
2031 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2032 assert(bs_new->job == NULL);
2033 assert(bs_new->dev == NULL);
2034 assert(bs_new->io_limits_enabled == false);
2035 assert(!throttle_have_timer(&bs_new->throttle_state));
2037 tmp = *bs_new;
2038 *bs_new = *bs_old;
2039 *bs_old = tmp;
2041 /* there are some fields that should not be swapped, move them back */
2042 bdrv_move_feature_fields(&tmp, bs_old);
2043 bdrv_move_feature_fields(bs_old, bs_new);
2044 bdrv_move_feature_fields(bs_new, &tmp);
2046 /* bs_new shouldn't be in bdrv_states even after the swap! */
2047 assert(bs_new->device_name[0] == '\0');
2049 /* Check a few fields that should remain attached to the device */
2050 assert(bs_new->dev == NULL);
2051 assert(bs_new->job == NULL);
2052 assert(bs_new->io_limits_enabled == false);
2053 assert(!throttle_have_timer(&bs_new->throttle_state));
2055 /* insert the nodes back into the graph node list if needed */
2056 if (bs_new->node_name[0] != '\0') {
2057 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2059 if (bs_old->node_name[0] != '\0') {
2060 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2063 bdrv_rebind(bs_new);
2064 bdrv_rebind(bs_old);
2068 * Add new bs contents at the top of an image chain while the chain is
2069 * live, while keeping required fields on the top layer.
2071 * This will modify the BlockDriverState fields, and swap contents
2072 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2074 * bs_new is required to be anonymous.
2076 * This function does not create any image files.
2078 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2080 bdrv_swap(bs_new, bs_top);
2082 /* The contents of 'tmp' will become bs_top, as we are
2083 * swapping bs_new and bs_top contents. */
2084 bdrv_set_backing_hd(bs_top, bs_new);
2087 static void bdrv_delete(BlockDriverState *bs)
2089 assert(!bs->dev);
2090 assert(!bs->job);
2091 assert(bdrv_op_blocker_is_empty(bs));
2092 assert(!bs->refcnt);
2093 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2095 bdrv_close(bs);
2097 /* remove from list, if necessary */
2098 bdrv_make_anon(bs);
2100 g_free(bs);
2103 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2104 /* TODO change to DeviceState *dev when all users are qdevified */
2106 if (bs->dev) {
2107 return -EBUSY;
2109 bs->dev = dev;
2110 bdrv_iostatus_reset(bs);
2111 return 0;
2114 /* TODO qdevified devices don't use this, remove when devices are qdevified */
2115 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
2117 if (bdrv_attach_dev(bs, dev) < 0) {
2118 abort();
2122 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2123 /* TODO change to DeviceState *dev when all users are qdevified */
2125 assert(bs->dev == dev);
2126 bs->dev = NULL;
2127 bs->dev_ops = NULL;
2128 bs->dev_opaque = NULL;
2129 bs->guest_block_size = 512;
2132 /* TODO change to return DeviceState * when all users are qdevified */
2133 void *bdrv_get_attached_dev(BlockDriverState *bs)
2135 return bs->dev;
2138 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2139 void *opaque)
2141 bs->dev_ops = ops;
2142 bs->dev_opaque = opaque;
2145 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2147 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2148 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2149 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2150 if (tray_was_closed) {
2151 /* tray open */
2152 qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
2153 true, &error_abort);
2155 if (load) {
2156 /* tray close */
2157 qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
2158 false, &error_abort);
2163 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2165 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2168 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2170 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2171 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2175 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2177 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2178 return bs->dev_ops->is_tray_open(bs->dev_opaque);
2180 return false;
2183 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2185 if (bs->dev_ops && bs->dev_ops->resize_cb) {
2186 bs->dev_ops->resize_cb(bs->dev_opaque);
2190 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2192 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2193 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2195 return false;
2199 * Run consistency checks on an image
2201 * Returns 0 if the check could be completed (it doesn't mean that the image is
2202 * free of errors) or -errno when an internal error occurred. The results of the
2203 * check are stored in res.
2205 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2207 if (bs->drv->bdrv_check == NULL) {
2208 return -ENOTSUP;
2211 memset(res, 0, sizeof(*res));
2212 return bs->drv->bdrv_check(bs, res, fix);
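/*
 * Illustrative sketch, not part of the original file: running a check and
 * distinguishing "the check ran" from "the image is clean". The
 * BdrvCheckResult field names used here follow block.h of this era.
 */
static int example_check_image(BlockDriverState *bs)
{
    BdrvCheckResult res;
    int ret = bdrv_check(bs, &res, 0); /* 0 = report only, fix nothing */

    if (ret < 0) {
        return ret; /* the check itself could not be completed */
    }
    /* ret == 0 only means the check ran; the verdict is in res */
    return (res.corruptions || res.leaks) ? -EIO : 0;
}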
2215 #define COMMIT_BUF_SECTORS 2048
2217 /* commit the COW image contents into its backing file */
2218 int bdrv_commit(BlockDriverState *bs)
2220 BlockDriver *drv = bs->drv;
2221 int64_t sector, total_sectors, length, backing_length;
2222 int n, ro, open_flags;
2223 int ret = 0;
2224 uint8_t *buf = NULL;
2225 char filename[PATH_MAX];
2227 if (!drv)
2228 return -ENOMEDIUM;
2230 if (!bs->backing_hd) {
2231 return -ENOTSUP;
2234 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2235 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2236 return -EBUSY;
2239 ro = bs->backing_hd->read_only;
2240 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2241 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2242 open_flags = bs->backing_hd->open_flags;
2244 if (ro) {
2245 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2246 return -EACCES;
2250 length = bdrv_getlength(bs);
2251 if (length < 0) {
2252 ret = length;
2253 goto ro_cleanup;
2256 backing_length = bdrv_getlength(bs->backing_hd);
2257 if (backing_length < 0) {
2258 ret = backing_length;
2259 goto ro_cleanup;
2262 /* If our top snapshot is larger than the backing file image,
2263 * grow the backing file if possible. If that is not possible,
2264 * we must return an error. */
2265 if (length > backing_length) {
2266 ret = bdrv_truncate(bs->backing_hd, length);
2267 if (ret < 0) {
2268 goto ro_cleanup;
2272 total_sectors = length >> BDRV_SECTOR_BITS;
2273 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2275 for (sector = 0; sector < total_sectors; sector += n) {
2276 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2277 if (ret < 0) {
2278 goto ro_cleanup;
2280 if (ret) {
2281 ret = bdrv_read(bs, sector, buf, n);
2282 if (ret < 0) {
2283 goto ro_cleanup;
2286 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2287 if (ret < 0) {
2288 goto ro_cleanup;
2293 if (drv->bdrv_make_empty) {
2294 ret = drv->bdrv_make_empty(bs);
2295 if (ret < 0) {
2296 goto ro_cleanup;
2298 bdrv_flush(bs);
2302 * Make sure all data we wrote to the backing device is actually
2303 * stable on disk.
2305 if (bs->backing_hd) {
2306 bdrv_flush(bs->backing_hd);
2309 ret = 0;
2310 ro_cleanup:
2311 g_free(buf);
2313 if (ro) {
2314 /* ignoring error return here */
2315 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2318 return ret;
2321 int bdrv_commit_all(void)
2323 BlockDriverState *bs;
2325 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2326 AioContext *aio_context = bdrv_get_aio_context(bs);
2328 aio_context_acquire(aio_context);
2329 if (bs->drv && bs->backing_hd) {
2330 int ret = bdrv_commit(bs);
2331 if (ret < 0) {
2332 aio_context_release(aio_context);
2333 return ret;
2336 aio_context_release(aio_context);
2338 return 0;
2342 * Remove an active request from the tracked requests list
2344 * This function should be called when a tracked request is completing.
2346 static void tracked_request_end(BdrvTrackedRequest *req)
2348 if (req->serialising) {
2349 req->bs->serialising_in_flight--;
2352 QLIST_REMOVE(req, list);
2353 qemu_co_queue_restart_all(&req->wait_queue);
2357 * Add an active request to the tracked requests list
2359 static void tracked_request_begin(BdrvTrackedRequest *req,
2360 BlockDriverState *bs,
2361 int64_t offset,
2362 unsigned int bytes, bool is_write)
2364 *req = (BdrvTrackedRequest){
2365 .bs = bs,
2366 .offset = offset,
2367 .bytes = bytes,
2368 .is_write = is_write,
2369 .co = qemu_coroutine_self(),
2370 .serialising = false,
2371 .overlap_offset = offset,
2372 .overlap_bytes = bytes,
2375 qemu_co_queue_init(&req->wait_queue);
2377 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2380 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2382 int64_t overlap_offset = req->offset & ~(align - 1);
2383 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2384 - overlap_offset;
2386 if (!req->serialising) {
2387 req->bs->serialising_in_flight++;
2388 req->serialising = true;
2391 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2392 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
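/*
 * Worked example (illustrative): with offset = 4608, bytes = 1024 and
 * align = 4096, the serialised region is widened to whole alignment units:
 *
 *   overlap_offset = 4608 & ~4095                       = 4096
 *   overlap_bytes  = ROUND_UP(4608 + 1024, 4096) - 4096 = 4096
 *
 * so the request serialises against anything touching [4096, 8192).
 */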
2396 * Round a region to cluster boundaries
2398 void bdrv_round_to_clusters(BlockDriverState *bs,
2399 int64_t sector_num, int nb_sectors,
2400 int64_t *cluster_sector_num,
2401 int *cluster_nb_sectors)
2403 BlockDriverInfo bdi;
2405 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2406 *cluster_sector_num = sector_num;
2407 *cluster_nb_sectors = nb_sectors;
2408 } else {
2409 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2410 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2411 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2412 nb_sectors, c);
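/*
 * Worked example (illustrative): with a 64 KiB cluster size,
 * c = 65536 / 512 = 128 sectors, and a request for sectors [100, 160):
 *
 *   *cluster_sector_num = QEMU_ALIGN_DOWN(100, 128)        = 0
 *   *cluster_nb_sectors = QEMU_ALIGN_UP(100 - 0 + 60, 128) = 256
 *
 * i.e. the region is widened to the two whole clusters covering [0, 256).
 */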
2416 static int bdrv_get_cluster_size(BlockDriverState *bs)
2418 BlockDriverInfo bdi;
2419 int ret;
2421 ret = bdrv_get_info(bs, &bdi);
2422 if (ret < 0 || bdi.cluster_size == 0) {
2423 return bs->request_alignment;
2424 } else {
2425 return bdi.cluster_size;
2429 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2430 int64_t offset, unsigned int bytes)
2432 /* req (aaaa) ends at or before the queried range (bbbb) starts */
2433 if (offset >= req->overlap_offset + req->overlap_bytes) {
2434 return false;
2436 /* the queried range (bbbb) ends at or before req (aaaa) starts */
2437 if (req->overlap_offset >= offset + bytes) {
2438 return false;
2440 return true;
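/*
 * Worked example (illustrative): for req covering [4096, 8192), a range
 * starting at 8192 trips the first early return (it begins exactly where
 * req ends), and a range [0, 4096) trips the second. Both intervals are
 * half-open, so only ranges sharing at least one byte reach "return true".
 */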
2443 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2445 BlockDriverState *bs = self->bs;
2446 BdrvTrackedRequest *req;
2447 bool retry;
2448 bool waited = false;
2450 if (!bs->serialising_in_flight) {
2451 return false;
2454 do {
2455 retry = false;
2456 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2457 if (req == self || (!req->serialising && !self->serialising)) {
2458 continue;
2460 if (tracked_request_overlaps(req, self->overlap_offset,
2461 self->overlap_bytes))
2463 /* Hitting this means there was a reentrant request, for
2464 * example, a block driver issuing nested requests. This must
2465 * never happen since it would deadlock.
2467 assert(qemu_coroutine_self() != req->co);
2469 /* If the request is already (indirectly) waiting for us, or
2470 * will wait for us as soon as it wakes up, then just go on
2471 * (instead of producing a deadlock in the former case). */
2472 if (!req->waiting_for) {
2473 self->waiting_for = req;
2474 qemu_co_queue_wait(&req->wait_queue);
2475 self->waiting_for = NULL;
2476 retry = true;
2477 waited = true;
2478 break;
2482 } while (retry);
2484 return waited;
2488 * Return values:
2489 * 0 - success
2490 * -EINVAL - backing format specified, but no file
2491 * -ENOSPC - can't update the backing file because no space is left in the
2492 * image file header
2493 * -ENOTSUP - format driver doesn't support changing the backing file
2495 int bdrv_change_backing_file(BlockDriverState *bs,
2496 const char *backing_file, const char *backing_fmt)
2498 BlockDriver *drv = bs->drv;
2499 int ret;
2501 /* Backing file format doesn't make sense without a backing file */
2502 if (backing_fmt && !backing_file) {
2503 return -EINVAL;
2506 if (drv->bdrv_change_backing_file != NULL) {
2507 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2508 } else {
2509 ret = -ENOTSUP;
2512 if (ret == 0) {
2513 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2514 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2516 return ret;
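/*
 * Illustrative sketch, not part of the original file: rewriting the
 * backing file reference recorded in an image header. The file name and
 * format are hypothetical; 'bs' must be writable and its format driver
 * must implement bdrv_change_backing_file (e.g. qcow2).
 */
static int example_rebase_header(BlockDriverState *bs)
{
    int ret = bdrv_change_backing_file(bs, "new-base.qcow2", "qcow2");

    if (ret == -ENOTSUP) {
        /* the format driver cannot rewrite its header */
    }
    return ret;
}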
2520 * Finds the image layer in the chain that has 'bs' as its backing file.
2522 * active is the current topmost image.
2524 * Returns NULL if bs is not found in active's image chain,
2525 * or if active == bs.
2527 * Returns the bottommost base image if bs == NULL.
2529 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2530 BlockDriverState *bs)
2532 while (active && bs != active->backing_hd) {
2533 active = active->backing_hd;
2536 return active;
2539 /* Given a BDS, searches for the base layer. */
2540 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2542 return bdrv_find_overlay(bs, NULL);
2545 typedef struct BlkIntermediateStates {
2546 BlockDriverState *bs;
2547 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2548 } BlkIntermediateStates;
2552 * Drops images above 'base' up to and including 'top', and sets the image
2553 * above 'top' to have base as its backing file.
2555 * Requires that the overlay of 'top' is opened r/w, so that its backing
2556 * file information can be properly updated.
2558 * E.g., this will convert the following chain:
2559 * bottom <- base <- intermediate <- top <- active
2561 * to
2563 * bottom <- base <- active
2565 * It is allowed for bottom==base, in which case it converts:
2567 * base <- intermediate <- top <- active
2569 * to
2571 * base <- active
2573 * If backing_file_str is non-NULL, it will be used when modifying top's
2574 * overlay image metadata.
2576 * Error conditions:
2577 * if active == top, that is considered an error
2580 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2581 BlockDriverState *base, const char *backing_file_str)
2583 BlockDriverState *intermediate;
2584 BlockDriverState *base_bs = NULL;
2585 BlockDriverState *new_top_bs = NULL;
2586 BlkIntermediateStates *intermediate_state, *next;
2587 int ret = -EIO;
2589 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2590 QSIMPLEQ_INIT(&states_to_delete);
2592 if (!top->drv || !base->drv) {
2593 goto exit;
2596 new_top_bs = bdrv_find_overlay(active, top);
2598 if (new_top_bs == NULL) {
2599 /* we could not find the image above 'top', this is an error */
2600 goto exit;
2603 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2604 * to do, no intermediate images */
2605 if (new_top_bs->backing_hd == base) {
2606 ret = 0;
2607 goto exit;
2610 intermediate = top;
2612 /* now we will go down through the list, and add each BDS we find
2613 * into our deletion queue, until we hit the 'base'
2615 while (intermediate) {
2616 intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2617 intermediate_state->bs = intermediate;
2618 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2620 if (intermediate->backing_hd == base) {
2621 base_bs = intermediate->backing_hd;
2622 break;
2624 intermediate = intermediate->backing_hd;
2626 if (base_bs == NULL) {
2627 /* Something went wrong: we did not end at the base. Safely
2628 * unravel everything and exit with an error. */
2629 goto exit;
2632 /* success - we can delete the intermediate states, and link top->base */
2633 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2634 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2635 base_bs->drv ? base_bs->drv->format_name : "");
2636 if (ret) {
2637 goto exit;
2639 bdrv_set_backing_hd(new_top_bs, base_bs);
2641 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2642 /* so that bdrv_close() does not recursively close the chain */
2643 bdrv_set_backing_hd(intermediate_state->bs, NULL);
2644 bdrv_unref(intermediate_state->bs);
2646 ret = 0;
2648 exit:
2649 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2650 g_free(intermediate_state);
2652 return ret;
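/*
 * Illustrative sketch, not part of the original file: collapsing
 * base <- mid <- top <- active down to base <- active. Passing NULL for
 * backing_file_str lets the function fall back to base's filename when
 * updating the overlay's metadata.
 */
static int example_drop_mid(BlockDriverState *active,
                            BlockDriverState *top,
                            BlockDriverState *base)
{
    return bdrv_drop_intermediate(active, top, base, NULL);
}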
2656 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2657 size_t size)
2659 int64_t len;
2661 if (size > INT_MAX) {
2662 return -EIO;
2665 if (!bdrv_is_inserted(bs))
2666 return -ENOMEDIUM;
2668 if (bs->growable)
2669 return 0;
2671 len = bdrv_getlength(bs);
2673 if (offset < 0)
2674 return -EIO;
2676 if ((offset > len) || (len - offset < size))
2677 return -EIO;
2679 return 0;
2682 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2683 int nb_sectors)
2685 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2686 return -EIO;
2689 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2690 nb_sectors * BDRV_SECTOR_SIZE);
2693 typedef struct RwCo {
2694 BlockDriverState *bs;
2695 int64_t offset;
2696 QEMUIOVector *qiov;
2697 bool is_write;
2698 int ret;
2699 BdrvRequestFlags flags;
2700 } RwCo;
2702 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2704 RwCo *rwco = opaque;
2706 if (!rwco->is_write) {
2707 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2708 rwco->qiov->size, rwco->qiov,
2709 rwco->flags);
2710 } else {
2711 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2712 rwco->qiov->size, rwco->qiov,
2713 rwco->flags);
2718 * Process a vectored synchronous request using coroutines
2720 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2721 QEMUIOVector *qiov, bool is_write,
2722 BdrvRequestFlags flags)
2724 Coroutine *co;
2725 RwCo rwco = {
2726 .bs = bs,
2727 .offset = offset,
2728 .qiov = qiov,
2729 .is_write = is_write,
2730 .ret = NOT_DONE,
2731 .flags = flags,
2735 * In a synchronous call context the vCPU is blocked, so this throttling
2736 * timer will not fire; the I/O throttling function therefore has to be
2737 * disabled here if it has been enabled.
2739 if (bs->io_limits_enabled) {
2740 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2741 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2742 bdrv_io_limits_disable(bs);
2745 if (qemu_in_coroutine()) {
2746 /* Fast-path if already in coroutine context */
2747 bdrv_rw_co_entry(&rwco);
2748 } else {
2749 AioContext *aio_context = bdrv_get_aio_context(bs);
2751 co = qemu_coroutine_create(bdrv_rw_co_entry);
2752 qemu_coroutine_enter(co, &rwco);
2753 while (rwco.ret == NOT_DONE) {
2754 aio_poll(aio_context, true);
2757 return rwco.ret;
2761 * Process a synchronous request using coroutines
2763 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2764 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2766 QEMUIOVector qiov;
2767 struct iovec iov = {
2768 .iov_base = (void *)buf,
2769 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2772 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2773 return -EINVAL;
2776 qemu_iovec_init_external(&qiov, &iov, 1);
2777 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2778 &qiov, is_write, flags);
2781 /* return < 0 if error. See bdrv_write() for the return codes */
2782 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2783 uint8_t *buf, int nb_sectors)
2785 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2788 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2789 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2790 uint8_t *buf, int nb_sectors)
2792 bool enabled;
2793 int ret;
2795 enabled = bs->io_limits_enabled;
2796 bs->io_limits_enabled = false;
2797 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2798 bs->io_limits_enabled = enabled;
2799 return ret;
2802 /* Return < 0 if error. Important errors are:
2803 -EIO generic I/O error (may happen for all errors)
2804 -ENOMEDIUM No media inserted.
2805 -EINVAL Invalid sector number or nb_sectors
2806 -EACCES Trying to write to a read-only device
2808 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2809 const uint8_t *buf, int nb_sectors)
2811 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2814 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2815 int nb_sectors, BdrvRequestFlags flags)
2817 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2818 BDRV_REQ_ZERO_WRITE | flags);
2822 * Completely zero out a block device with the help of bdrv_write_zeroes.
2823 * The operation is sped up by checking the block status and only writing
2824 * zeroes to regions that do not already read back as zeroes. Optional
2825 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2827 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2829 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2831 int64_t target_size;
2832 int64_t ret, nb_sectors, sector_num = 0;
2833 int n;
2835 target_size = bdrv_getlength(bs);
2836 if (target_size < 0) {
2837 return target_size;
2839 target_size /= BDRV_SECTOR_SIZE;
2841 for (;;) {
2842 nb_sectors = target_size - sector_num;
2843 if (nb_sectors <= 0) {
2844 return 0;
2846 if (nb_sectors > INT_MAX) {
2847 nb_sectors = INT_MAX;
2849 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2850 if (ret < 0) {
2851 error_report("error getting block status at sector %" PRId64 ": %s",
2852 sector_num, strerror(-ret));
2853 return ret;
2855 if (ret & BDRV_BLOCK_ZERO) {
2856 sector_num += n;
2857 continue;
2859 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2860 if (ret < 0) {
2861 error_report("error writing zeroes at sector %" PRId64 ": %s",
2862 sector_num, strerror(-ret));
2863 return ret;
2865 sector_num += n;
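/*
 * Illustrative sketch, not part of the original file: fully zeroing a
 * scratch device, letting drivers unmap instead of writing literal zeroes
 * where BDRV_O_UNMAP and the driver allow it.
 */
static int example_zero_device(BlockDriverState *bs)
{
    int ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);

    if (ret < 0) {
        error_report("failed to zero device: %s", strerror(-ret));
    }
    return ret;
}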
2869 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2871 QEMUIOVector qiov;
2872 struct iovec iov = {
2873 .iov_base = (void *)buf,
2874 .iov_len = bytes,
2876 int ret;
2878 if (bytes < 0) {
2879 return -EINVAL;
2882 qemu_iovec_init_external(&qiov, &iov, 1);
2883 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2884 if (ret < 0) {
2885 return ret;
2888 return bytes;
2891 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2893 int ret;
2895 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2896 if (ret < 0) {
2897 return ret;
2900 return qiov->size;
2903 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2904 const void *buf, int bytes)
2906 QEMUIOVector qiov;
2907 struct iovec iov = {
2908 .iov_base = (void *) buf,
2909 .iov_len = bytes,
2912 if (bytes < 0) {
2913 return -EINVAL;
2916 qemu_iovec_init_external(&qiov, &iov, 1);
2917 return bdrv_pwritev(bs, offset, &qiov);
2921 * Writes to the file and ensures that no writes are reordered across this
2922 * request (acts as a barrier)
2924 * Returns 0 on success, -errno in error cases.
2926 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2927 const void *buf, int count)
2929 int ret;
2931 ret = bdrv_pwrite(bs, offset, buf, count);
2932 if (ret < 0) {
2933 return ret;
2936 /* No flush needed for cache modes that already do it */
2937 if (bs->enable_write_cache) {
2938 bdrv_flush(bs);
2941 return 0;
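/*
 * Illustrative sketch, not part of the original file: the pattern format
 * drivers use for metadata updates that must not be overtaken by later
 * writes. 'header' and 'len' are hypothetical.
 */
static int example_update_header(BlockDriverState *bs,
                                 const void *header, int len)
{
    /* bdrv_pwrite_sync() flushes after the write on cache modes that
     * need it, giving barrier semantics for the header update */
    return bdrv_pwrite_sync(bs, 0, header, len);
}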
2944 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2945 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2947 /* Perform I/O through a temporary buffer so that users who scribble over
2948 * their read buffer while the operation is in progress do not end up
2949 * modifying the image file. This is critical for zero-copy guest I/O
2950 * where anything might happen inside guest memory.
2952 void *bounce_buffer;
2954 BlockDriver *drv = bs->drv;
2955 struct iovec iov;
2956 QEMUIOVector bounce_qiov;
2957 int64_t cluster_sector_num;
2958 int cluster_nb_sectors;
2959 size_t skip_bytes;
2960 int ret;
2962 /* Cover the entire cluster so no additional backing file I/O is required
2963 * when allocating the cluster in the image file.
2965 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2966 &cluster_sector_num, &cluster_nb_sectors);
2968 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2969 cluster_sector_num, cluster_nb_sectors);
2971 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2972 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2973 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2975 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2976 &bounce_qiov);
2977 if (ret < 0) {
2978 goto err;
2981 if (drv->bdrv_co_write_zeroes &&
2982 buffer_is_zero(bounce_buffer, iov.iov_len)) {
2983 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2984 cluster_nb_sectors, 0);
2985 } else {
2986 /* This does not change the data on the disk, so it is not necessary
2987 * to flush even in cache=writethrough mode.
2989 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2990 &bounce_qiov);
2993 if (ret < 0) {
2994 /* It might be okay to ignore write errors for guest requests. If this
2995 * is a deliberate copy-on-read then we don't want to ignore the error.
2996 * Simply report it in all cases.
2998 goto err;
3001 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
3002 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3003 nb_sectors * BDRV_SECTOR_SIZE);
3005 err:
3006 qemu_vfree(bounce_buffer);
3007 return ret;
3011 * Forwards an already correctly aligned request to the BlockDriver. This
3012 * handles copy on read and zeroing after EOF; any other features must be
3013 * implemented by the caller.
3015 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
3016 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3017 int64_t align, QEMUIOVector *qiov, int flags)
3019 BlockDriver *drv = bs->drv;
3020 int ret;
3022 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3023 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3025 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3026 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3027 assert(!qiov || bytes == qiov->size);
3029 /* Handle Copy on Read and associated serialisation */
3030 if (flags & BDRV_REQ_COPY_ON_READ) {
3031 /* If we touch the same cluster it counts as an overlap. This
3032 * guarantees that allocating writes will be serialized and not race
3033 * with each other for the same cluster. For example, in copy-on-read
3034 * it ensures that the CoR read and write operations are atomic and
3035 * guest writes cannot interleave between them. */
3036 mark_request_serialising(req, bdrv_get_cluster_size(bs));
3039 wait_serialising_requests(req);
3041 if (flags & BDRV_REQ_COPY_ON_READ) {
3042 int pnum;
3044 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3045 if (ret < 0) {
3046 goto out;
3049 if (!ret || pnum != nb_sectors) {
3050 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3051 goto out;
3055 /* Forward the request to the BlockDriver */
3056 if (!(bs->zero_beyond_eof && bs->growable)) {
3057 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3058 } else {
3059 /* Read zeroes after EOF of growable BDSes */
3060 int64_t len, total_sectors, max_nb_sectors;
3062 len = bdrv_getlength(bs);
3063 if (len < 0) {
3064 ret = len;
3065 goto out;
3068 total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
3069 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3070 align >> BDRV_SECTOR_BITS);
3071 if (max_nb_sectors > 0) {
3072 QEMUIOVector local_qiov;
3073 size_t local_sectors;
3075 max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_BITS);
3076 local_sectors = MIN(max_nb_sectors, nb_sectors);
3078 qemu_iovec_init(&local_qiov, qiov->niov);
3079 qemu_iovec_concat(&local_qiov, qiov, 0,
3080 local_sectors * BDRV_SECTOR_SIZE);
3082 ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3083 &local_qiov);
3085 qemu_iovec_destroy(&local_qiov);
3086 } else {
3087 ret = 0;
3090 /* Reading beyond end of file is supposed to produce zeroes */
3091 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3092 uint64_t offset = MAX(0, total_sectors - sector_num);
3093 uint64_t bytes = (sector_num + nb_sectors - offset) *
3094 BDRV_SECTOR_SIZE;
3095 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3099 out:
3100 return ret;
3104 * Handle a read request in coroutine context
3106 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3107 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3108 BdrvRequestFlags flags)
3110 BlockDriver *drv = bs->drv;
3111 BdrvTrackedRequest req;
3113 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3114 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3115 uint8_t *head_buf = NULL;
3116 uint8_t *tail_buf = NULL;
3117 QEMUIOVector local_qiov;
3118 bool use_local_qiov = false;
3119 int ret;
3121 if (!drv) {
3122 return -ENOMEDIUM;
3124 if (bdrv_check_byte_request(bs, offset, bytes)) {
3125 return -EIO;
3128 if (bs->copy_on_read) {
3129 flags |= BDRV_REQ_COPY_ON_READ;
3132 /* throttling disk I/O */
3133 if (bs->io_limits_enabled) {
3134 bdrv_io_limits_intercept(bs, bytes, false);
3137 /* Align read if necessary by padding qiov */
3138 if (offset & (align - 1)) {
3139 head_buf = qemu_blockalign(bs, align);
3140 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3141 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3142 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3143 use_local_qiov = true;
3145 bytes += offset & (align - 1);
3146 offset = offset & ~(align - 1);
3149 if ((offset + bytes) & (align - 1)) {
3150 if (!use_local_qiov) {
3151 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3152 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3153 use_local_qiov = true;
3155 tail_buf = qemu_blockalign(bs, align);
3156 qemu_iovec_add(&local_qiov, tail_buf,
3157 align - ((offset + bytes) & (align - 1)));
3159 bytes = ROUND_UP(bytes, align);
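    /*
     * Worked example (illustrative): with align = 4096, a request for
     * offset = 5120, bytes = 512 becomes offset = 4096 and
     * bytes = ROUND_UP(512 + 1024, 4096) = 4096: a single aligned read,
     * padded with 1024 bytes in head_buf and 2560 bytes in tail_buf.
     */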
3162 tracked_request_begin(&req, bs, offset, bytes, false);
3163 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3164 use_local_qiov ? &local_qiov : qiov,
3165 flags);
3166 tracked_request_end(&req);
3168 if (use_local_qiov) {
3169 qemu_iovec_destroy(&local_qiov);
3170 qemu_vfree(head_buf);
3171 qemu_vfree(tail_buf);
3174 return ret;
3177 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3178 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3179 BdrvRequestFlags flags)
3181 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3182 return -EINVAL;
3185 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3186 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3189 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3190 int nb_sectors, QEMUIOVector *qiov)
3192 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3194 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3197 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3198 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3200 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3202 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3203 BDRV_REQ_COPY_ON_READ);
3206 /* If no limit is specified in the BlockLimits, use a default
3207 * of 32768 512-byte sectors (16 MiB) per request.
3209 #define MAX_WRITE_ZEROES_DEFAULT 32768
3211 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3212 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3214 BlockDriver *drv = bs->drv;
3215 QEMUIOVector qiov;
3216 struct iovec iov = {0};
3217 int ret = 0;
3219 int max_write_zeroes = bs->bl.max_write_zeroes ?
3220 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3222 while (nb_sectors > 0 && !ret) {
3223 int num = nb_sectors;
3225 /* Align request. Block drivers can expect the "bulk" of the request
3226 * to be aligned.
3228 if (bs->bl.write_zeroes_alignment
3229 && num > bs->bl.write_zeroes_alignment) {
3230 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3231 /* Make a small request up to the first aligned sector. */
3232 num = bs->bl.write_zeroes_alignment;
3233 num -= sector_num % bs->bl.write_zeroes_alignment;
3234 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3235 /* Shorten the request to the last aligned sector. num cannot
3236 * underflow because num > bs->bl.write_zeroes_alignment.
3238 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3242 /* limit request size */
3243 if (num > max_write_zeroes) {
3244 num = max_write_zeroes;
3247 ret = -ENOTSUP;
3248 /* First try the efficient write zeroes operation */
3249 if (drv->bdrv_co_write_zeroes) {
3250 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3253 if (ret == -ENOTSUP) {
3254 /* Fall back to bounce buffer if write zeroes is unsupported */
3255 iov.iov_len = num * BDRV_SECTOR_SIZE;
3256 if (iov.iov_base == NULL) {
3257 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3258 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3260 qemu_iovec_init_external(&qiov, &iov, 1);
3262 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3264 /* Keep the bounce buffer around if it is big enough for
3265 * all future requests.
3267 if (num < max_write_zeroes) {
3268 qemu_vfree(iov.iov_base);
3269 iov.iov_base = NULL;
3273 sector_num += num;
3274 nb_sectors -= num;
3277 qemu_vfree(iov.iov_base);
3278 return ret;
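/*
 * Worked example (illustrative): with write_zeroes_alignment = 8 and a
 * request for sectors [5, 25), the loop above issues three pieces:
 * [5, 8) to reach alignment, [8, 24) as the aligned bulk, and [24, 25)
 * as the unaligned tail.
 */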
3282 * Forwards an already correctly aligned write request to the BlockDriver.
3284 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3285 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3286 QEMUIOVector *qiov, int flags)
3288 BlockDriver *drv = bs->drv;
3289 bool waited;
3290 int ret;
3292 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3293 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3295 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3296 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3297 assert(!qiov || bytes == qiov->size);
3299 waited = wait_serialising_requests(req);
3300 assert(!waited || !req->serialising);
3301 assert(req->overlap_offset <= offset);
3302 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3304 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3306 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3307 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3308 qemu_iovec_is_zero(qiov)) {
3309 flags |= BDRV_REQ_ZERO_WRITE;
3310 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3311 flags |= BDRV_REQ_MAY_UNMAP;
3315 if (ret < 0) {
3316 /* Do nothing, write notifier decided to fail this request */
3317 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3318 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3319 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3320 } else {
3321 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3322 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3324 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3326 if (ret == 0 && !bs->enable_write_cache) {
3327 ret = bdrv_co_flush(bs);
3330 bdrv_set_dirty(bs, sector_num, nb_sectors);
3332 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3333 bs->wr_highest_sector = sector_num + nb_sectors - 1;
3335 if (bs->growable && ret >= 0) {
3336 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3339 return ret;
3343 * Handle a write request in coroutine context
3345 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3346 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3347 BdrvRequestFlags flags)
3349 BdrvTrackedRequest req;
3350 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3351 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3352 uint8_t *head_buf = NULL;
3353 uint8_t *tail_buf = NULL;
3354 QEMUIOVector local_qiov;
3355 bool use_local_qiov = false;
3356 int ret;
3358 if (!bs->drv) {
3359 return -ENOMEDIUM;
3361 if (bs->read_only) {
3362 return -EACCES;
3364 if (bdrv_check_byte_request(bs, offset, bytes)) {
3365 return -EIO;
3368 /* throttling disk I/O */
3369 if (bs->io_limits_enabled) {
3370 bdrv_io_limits_intercept(bs, bytes, true);
3374 * Align write if necessary by performing a read-modify-write cycle.
3375 * Pad qiov with the read parts and be sure to have a tracked request not
3376 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3378 tracked_request_begin(&req, bs, offset, bytes, true);
3380 if (offset & (align - 1)) {
3381 QEMUIOVector head_qiov;
3382 struct iovec head_iov;
3384 mark_request_serialising(&req, align);
3385 wait_serialising_requests(&req);
3387 head_buf = qemu_blockalign(bs, align);
3388 head_iov = (struct iovec) {
3389 .iov_base = head_buf,
3390 .iov_len = align,
3392 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3394 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3395 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3396 align, &head_qiov, 0);
3397 if (ret < 0) {
3398 goto fail;
3400 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3402 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3403 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3404 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3405 use_local_qiov = true;
3407 bytes += offset & (align - 1);
3408 offset = offset & ~(align - 1);
3411 if ((offset + bytes) & (align - 1)) {
3412 QEMUIOVector tail_qiov;
3413 struct iovec tail_iov;
3414 size_t tail_bytes;
3415 bool waited;
3417 mark_request_serialising(&req, align);
3418 waited = wait_serialising_requests(&req);
3419 assert(!waited || !use_local_qiov);
3421 tail_buf = qemu_blockalign(bs, align);
3422 tail_iov = (struct iovec) {
3423 .iov_base = tail_buf,
3424 .iov_len = align,
3426 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3428 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3429 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3430 align, &tail_qiov, 0);
3431 if (ret < 0) {
3432 goto fail;
3434 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3436 if (!use_local_qiov) {
3437 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3438 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3439 use_local_qiov = true;
3442 tail_bytes = (offset + bytes) & (align - 1);
3443 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3445 bytes = ROUND_UP(bytes, align);
3448 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3449 use_local_qiov ? &local_qiov : qiov,
3450 flags);
3452 fail:
3453 tracked_request_end(&req);
3455 if (use_local_qiov) {
3456 qemu_iovec_destroy(&local_qiov);
3458 qemu_vfree(head_buf);
3459 qemu_vfree(tail_buf);
3461 return ret;
3464 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3465 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3466 BdrvRequestFlags flags)
3468 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3469 return -EINVAL;
3472 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3473 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3476 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3477 int nb_sectors, QEMUIOVector *qiov)
3479 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3481 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3484 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3485 int64_t sector_num, int nb_sectors,
3486 BdrvRequestFlags flags)
3488 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3490 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3491 flags &= ~BDRV_REQ_MAY_UNMAP;
3494 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3495 BDRV_REQ_ZERO_WRITE | flags);
3499 * Truncate file to 'offset' bytes (needed only for file protocols)
3501 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3503 BlockDriver *drv = bs->drv;
3504 int ret;
3505 if (!drv)
3506 return -ENOMEDIUM;
3507 if (!drv->bdrv_truncate)
3508 return -ENOTSUP;
3509 if (bs->read_only)
3510 return -EACCES;
3512 ret = drv->bdrv_truncate(bs, offset);
3513 if (ret == 0) {
3514 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3515 bdrv_dev_resize_cb(bs);
3517 return ret;
3521 * Length of an allocated file in bytes. Sparse files are counted by actual
3522 * allocated space. Return < 0 if error or unknown.
3524 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3526 BlockDriver *drv = bs->drv;
3527 if (!drv) {
3528 return -ENOMEDIUM;
3530 if (drv->bdrv_get_allocated_file_size) {
3531 return drv->bdrv_get_allocated_file_size(bs);
3533 if (bs->file) {
3534 return bdrv_get_allocated_file_size(bs->file);
3536 return -ENOTSUP;
3540 * Return number of sectors on success, -errno on error.
3542 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3544 BlockDriver *drv = bs->drv;
3546 if (!drv)
3547 return -ENOMEDIUM;
3549 if (drv->has_variable_length) {
3550 int ret = refresh_total_sectors(bs, bs->total_sectors);
3551 if (ret < 0) {
3552 return ret;
3555 return bs->total_sectors;
3559 * Return length in bytes on success, -errno on error.
3560 * The length is always a multiple of BDRV_SECTOR_SIZE.
3562 int64_t bdrv_getlength(BlockDriverState *bs)
3564 int64_t ret = bdrv_nb_sectors(bs);
3566 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3569 /* Return 0 as the number of sectors if no device is present or on error */
3570 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3572 int64_t nb_sectors = bdrv_nb_sectors(bs);
3574 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3577 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3578 BlockdevOnError on_write_error)
3580 bs->on_read_error = on_read_error;
3581 bs->on_write_error = on_write_error;
3584 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3586 return is_read ? bs->on_read_error : bs->on_write_error;
3589 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3591 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3593 switch (on_err) {
3594 case BLOCKDEV_ON_ERROR_ENOSPC:
3595 return (error == ENOSPC) ?
3596 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3597 case BLOCKDEV_ON_ERROR_STOP:
3598 return BLOCK_ERROR_ACTION_STOP;
3599 case BLOCKDEV_ON_ERROR_REPORT:
3600 return BLOCK_ERROR_ACTION_REPORT;
3601 case BLOCKDEV_ON_ERROR_IGNORE:
3602 return BLOCK_ERROR_ACTION_IGNORE;
3603 default:
3604 abort();
3608 /* This is done by device models because, while the block layer knows
3609 * about the error, it does not know whether an operation comes from
3610 * the device or the block layer (from a job, for example).
3612 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3613 bool is_read, int error)
3615 assert(error >= 0);
3617 if (action == BLOCK_ERROR_ACTION_STOP) {
3618 /* First set the iostatus, so that "info block" returns an iostatus
3619 * that matches the events raised so far (an additional error iostatus
3620 * is fine, but not a lost one).
3622 bdrv_iostatus_set_err(bs, error);
3624 /* Then raise the request to stop the VM and the event.
3625 * qemu_system_vmstop_request_prepare has two effects. First,
3626 * it ensures that the STOP event always comes after the
3627 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3628 * can observe the STOP event and do a "cont" before the STOP
3629 * event is issued, the VM will not stop. In this case, vm_start()
3630 * also ensures that the STOP/RESUME pair of events is emitted.
3632 qemu_system_vmstop_request_prepare();
3633 qapi_event_send_block_io_error(bdrv_get_device_name(bs),
3634 is_read ? IO_OPERATION_TYPE_READ :
3635 IO_OPERATION_TYPE_WRITE,
3636 action, &error_abort);
3637 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3638 } else {
3639 qapi_event_send_block_io_error(bdrv_get_device_name(bs),
3640 is_read ? IO_OPERATION_TYPE_READ :
3641 IO_OPERATION_TYPE_WRITE,
3642 action, &error_abort);
3646 int bdrv_is_read_only(BlockDriverState *bs)
3648 return bs->read_only;
3651 int bdrv_is_sg(BlockDriverState *bs)
3653 return bs->sg;
3656 int bdrv_enable_write_cache(BlockDriverState *bs)
3658 return bs->enable_write_cache;
3661 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3663 bs->enable_write_cache = wce;
3665 /* so a reopen() will preserve wce */
3666 if (wce) {
3667 bs->open_flags |= BDRV_O_CACHE_WB;
3668 } else {
3669 bs->open_flags &= ~BDRV_O_CACHE_WB;
3673 int bdrv_is_encrypted(BlockDriverState *bs)
3675 if (bs->backing_hd && bs->backing_hd->encrypted)
3676 return 1;
3677 return bs->encrypted;
3680 int bdrv_key_required(BlockDriverState *bs)
3682 BlockDriverState *backing_hd = bs->backing_hd;
3684 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3685 return 1;
3686 return (bs->encrypted && !bs->valid_key);
3689 int bdrv_set_key(BlockDriverState *bs, const char *key)
3691 int ret;
3692 if (bs->backing_hd && bs->backing_hd->encrypted) {
3693 ret = bdrv_set_key(bs->backing_hd, key);
3694 if (ret < 0)
3695 return ret;
3696 if (!bs->encrypted)
3697 return 0;
3699 if (!bs->encrypted) {
3700 return -EINVAL;
3701 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3702 return -ENOMEDIUM;
3704 ret = bs->drv->bdrv_set_key(bs, key);
3705 if (ret < 0) {
3706 bs->valid_key = 0;
3707 } else if (!bs->valid_key) {
3708 bs->valid_key = 1;
3709 /* call the change callback now, we skipped it on open */
3710 bdrv_dev_change_media_cb(bs, true);
3712 return ret;
3715 const char *bdrv_get_format_name(BlockDriverState *bs)
3717 return bs->drv ? bs->drv->format_name : NULL;
3720 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3721 void *opaque)
3723 BlockDriver *drv;
3724 int count = 0;
3725 const char **formats = NULL;
3727 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3728 if (drv->format_name) {
3729 bool found = false;
3730 int i = count;
3731 while (formats && i && !found) {
3732 found = !strcmp(formats[--i], drv->format_name);
3735 if (!found) {
3736 formats = g_realloc(formats, (count + 1) * sizeof(char *));
3737 formats[count++] = drv->format_name;
3738 it(opaque, drv->format_name);
3742 g_free(formats);
3745 /* Find a block backend's BlockDriverState by its device name */
3746 BlockDriverState *bdrv_find(const char *name)
3748 BlockDriverState *bs;
3750 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3751 if (!strcmp(name, bs->device_name)) {
3752 return bs;
3755 return NULL;
3758 /* Find a node in the BDS graph by its node name */
3759 BlockDriverState *bdrv_find_node(const char *node_name)
3761 BlockDriverState *bs;
3763 assert(node_name);
3765 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3766 if (!strcmp(node_name, bs->node_name)) {
3767 return bs;
3770 return NULL;
3773 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3774 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3776 BlockDeviceInfoList *list, *entry;
3777 BlockDriverState *bs;
3779 list = NULL;
3780 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3781 entry = g_malloc0(sizeof(*entry));
3782 entry->value = bdrv_block_device_info(bs);
3783 entry->next = list;
3784 list = entry;
3787 return list;
3790 BlockDriverState *bdrv_lookup_bs(const char *device,
3791 const char *node_name,
3792 Error **errp)
3794 BlockDriverState *bs = NULL;
3796 if (device) {
3797 bs = bdrv_find(device);
3799 if (bs) {
3800 return bs;
3804 if (node_name) {
3805 bs = bdrv_find_node(node_name);
3807 if (bs) {
3808 return bs;
3812 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3813 device ? device : "",
3814 node_name ? node_name : "");
3815 return NULL;
3818 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3819 * return false. If either argument is NULL, return false. */
3820 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3822 while (top && top != base) {
3823 top = top->backing_hd;
3826 return top != NULL;
3829 BlockDriverState *bdrv_next(BlockDriverState *bs)
3831 if (!bs) {
3832 return QTAILQ_FIRST(&bdrv_states);
3834 return QTAILQ_NEXT(bs, device_list);
3837 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3839 BlockDriverState *bs;
3841 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3842 it(opaque, bs);
3846 const char *bdrv_get_device_name(BlockDriverState *bs)
3848 return bs->device_name;
3851 int bdrv_get_flags(BlockDriverState *bs)
3853 return bs->open_flags;
3856 int bdrv_flush_all(void)
3858 BlockDriverState *bs;
3859 int result = 0;
3861 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3862 AioContext *aio_context = bdrv_get_aio_context(bs);
3863 int ret;
3865 aio_context_acquire(aio_context);
3866 ret = bdrv_flush(bs);
3867 if (ret < 0 && !result) {
3868 result = ret;
3870 aio_context_release(aio_context);
3873 return result;
3876 int bdrv_has_zero_init_1(BlockDriverState *bs)
3878 return 1;
3881 int bdrv_has_zero_init(BlockDriverState *bs)
3883 assert(bs->drv);
3885 /* If BS is a copy-on-write image, it is initialized to
3886 the contents of the base image, which may not be zeroes. */
3887 if (bs->backing_hd) {
3888 return 0;
3890 if (bs->drv->bdrv_has_zero_init) {
3891 return bs->drv->bdrv_has_zero_init(bs);
3894 /* safe default */
3895 return 0;
3898 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3900 BlockDriverInfo bdi;
3902 if (bs->backing_hd) {
3903 return false;
3906 if (bdrv_get_info(bs, &bdi) == 0) {
3907 return bdi.unallocated_blocks_are_zero;
3910 return false;
3913 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3915 BlockDriverInfo bdi;
3917 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3918 return false;
3921 if (bdrv_get_info(bs, &bdi) == 0) {
3922 return bdi.can_write_zeroes_with_unmap;
3925 return false;
3928 typedef struct BdrvCoGetBlockStatusData {
3929 BlockDriverState *bs;
3930 BlockDriverState *base;
3931 int64_t sector_num;
3932 int nb_sectors;
3933 int *pnum;
3934 int64_t ret;
3935 bool done;
3936 } BdrvCoGetBlockStatusData;
3939 * Returns the BDRV_BLOCK_* allocation status of the specified sectors. Drivers
3940 * not implementing the functionality are assumed to not support backing files,
3941 * hence all their sectors are reported as allocated.
3943 * If 'sector_num' is beyond the end of the disk image the return value is 0
3944 * and 'pnum' is set to 0.
3946 * 'pnum' is set to the number of sectors (including and immediately following
3947 * the specified sector) that are known to be in the same
3948 * allocated/unallocated state.
3950 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3951 * beyond the end of the disk image it will be clamped.
3953 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3954 int64_t sector_num,
3955 int nb_sectors, int *pnum)
3957 int64_t length;
3958 int64_t n;
3959 int64_t ret, ret2;
3961 length = bdrv_getlength(bs);
3962 if (length < 0) {
3963 return length;
3966 if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3967 *pnum = 0;
3968 return 0;
3971 n = bs->total_sectors - sector_num;
3972 if (n < nb_sectors) {
3973 nb_sectors = n;
3976 if (!bs->drv->bdrv_co_get_block_status) {
3977 *pnum = nb_sectors;
3978 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3979 if (bs->drv->protocol_name) {
3980 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3982 return ret;
3985 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3986 if (ret < 0) {
3987 *pnum = 0;
3988 return ret;
3991 if (ret & BDRV_BLOCK_RAW) {
3992 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3993 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3994 *pnum, pnum);
3997 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
3998 ret |= BDRV_BLOCK_ALLOCATED;
4001 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4002 if (bdrv_unallocated_blocks_are_zero(bs)) {
4003 ret |= BDRV_BLOCK_ZERO;
4004 } else if (bs->backing_hd) {
4005 BlockDriverState *bs2 = bs->backing_hd;
4006 int64_t length2 = bdrv_getlength(bs2);
4007 if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
4008 ret |= BDRV_BLOCK_ZERO;
4013 if (bs->file &&
4014 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4015 (ret & BDRV_BLOCK_OFFSET_VALID)) {
4016 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4017 *pnum, pnum);
4018 if (ret2 >= 0) {
4019 /* Ignore errors. This just provides extra information; it
4020 * is useful but not necessary.
4022 ret |= (ret2 & BDRV_BLOCK_ZERO);
4026 return ret;
4029 /* Coroutine wrapper for bdrv_get_block_status() */
4030 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4032 BdrvCoGetBlockStatusData *data = opaque;
4033 BlockDriverState *bs = data->bs;
4035 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4036 data->pnum);
4037 data->done = true;
4041 * Synchronous wrapper around bdrv_co_get_block_status().
4043 * See bdrv_co_get_block_status() for details.
4045 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4046 int nb_sectors, int *pnum)
4048 Coroutine *co;
4049 BdrvCoGetBlockStatusData data = {
4050 .bs = bs,
4051 .sector_num = sector_num,
4052 .nb_sectors = nb_sectors,
4053 .pnum = pnum,
4054 .done = false,
4057 if (qemu_in_coroutine()) {
4058 /* Fast-path if already in coroutine context */
4059 bdrv_get_block_status_co_entry(&data);
4060 } else {
4061 AioContext *aio_context = bdrv_get_aio_context(bs);
4063 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4064 qemu_coroutine_enter(co, &data);
4065 while (!data.done) {
4066 aio_poll(aio_context, true);
4069 return data.ret;
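/*
 * Illustrative sketch, not part of the original file: walking a whole
 * image with bdrv_get_block_status(), much as bdrv_make_zero() does
 * above, here counting the sectors that already read back as zeroes.
 */
static int64_t example_count_zero_sectors(BlockDriverState *bs)
{
    int64_t zeroes = 0, sector_num = 0;
    int64_t total = bdrv_nb_sectors(bs);
    int n;

    if (total < 0) {
        return total;
    }
    while (sector_num < total) {
        int64_t ret = bdrv_get_block_status(bs, sector_num,
                                            MIN(total - sector_num, INT_MAX),
                                            &n);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            zeroes += n;
        }
        sector_num += n;
    }
    return zeroes;
}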
4072 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4073 int nb_sectors, int *pnum)
4075 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4076 if (ret < 0) {
4077 return ret;
4079 return !!(ret & BDRV_BLOCK_ALLOCATED);
4083 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4085 * Return true if the given sector is allocated in any image between
4086 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4087 * sector is allocated in any image of the chain. Return false otherwise.
4089 * 'pnum' is set to the number of sectors (including and immediately following
4090 * the specified sector) that are known to be in the same
4091 * allocated/unallocated state.
4094 int bdrv_is_allocated_above(BlockDriverState *top,
4095 BlockDriverState *base,
4096 int64_t sector_num,
4097 int nb_sectors, int *pnum)
4099 BlockDriverState *intermediate;
4100 int ret, n = nb_sectors;
4102 intermediate = top;
4103 while (intermediate && intermediate != base) {
4104 int pnum_inter;
4105 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4106 &pnum_inter);
4107 if (ret < 0) {
4108 return ret;
4109 } else if (ret) {
4110 *pnum = pnum_inter;
4111 return 1;
4115 * [sector_num, sector_num + nb_sectors) is unallocated on top but an
4116 * intermediate image might have
4118 * [sector_num + x, sector_num + nb_sectors) allocated.
4120 if (n > pnum_inter &&
4121 (intermediate == top ||
4122 sector_num + pnum_inter < intermediate->total_sectors)) {
4123 n = pnum_inter;
4126 intermediate = intermediate->backing_hd;
4129 *pnum = n;
4130 return 0;
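/*
 * Illustrative sketch, not part of the original file: deciding whether a
 * sector must be copied while streaming data from 'base' into 'top'.
 */
static bool example_needs_copy(BlockDriverState *top,
                               BlockDriverState *base,
                               int64_t sector_num)
{
    int pnum;

    return bdrv_is_allocated_above(top, base, sector_num, 1, &pnum) > 0;
}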
4133 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4135 if (bs->backing_hd && bs->backing_hd->encrypted)
4136 return bs->backing_file;
4137 else if (bs->encrypted)
4138 return bs->filename;
4139 else
4140 return NULL;
4143 void bdrv_get_backing_filename(BlockDriverState *bs,
4144 char *filename, int filename_size)
4146 pstrcpy(filename, filename_size, bs->backing_file);
4149 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4150 const uint8_t *buf, int nb_sectors)
4152 BlockDriver *drv = bs->drv;
4153 if (!drv)
4154 return -ENOMEDIUM;
4155 if (!drv->bdrv_write_compressed)
4156 return -ENOTSUP;
4157 if (bdrv_check_request(bs, sector_num, nb_sectors))
4158 return -EIO;
4160 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4162 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4165 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4167 BlockDriver *drv = bs->drv;
4168 if (!drv)
4169 return -ENOMEDIUM;
4170 if (!drv->bdrv_get_info)
4171 return -ENOTSUP;
4172 memset(bdi, 0, sizeof(*bdi));
4173 return drv->bdrv_get_info(bs, bdi);
4176 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4178 BlockDriver *drv = bs->drv;
4179 if (drv && drv->bdrv_get_specific_info) {
4180 return drv->bdrv_get_specific_info(bs);
4182 return NULL;
4185 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4186 int64_t pos, int size)
4188 QEMUIOVector qiov;
4189 struct iovec iov = {
4190 .iov_base = (void *) buf,
4191 .iov_len = size,
4194 qemu_iovec_init_external(&qiov, &iov, 1);
4195 return bdrv_writev_vmstate(bs, &qiov, pos);
4198 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4200 BlockDriver *drv = bs->drv;
4202 if (!drv) {
4203 return -ENOMEDIUM;
4204 } else if (drv->bdrv_save_vmstate) {
4205 return drv->bdrv_save_vmstate(bs, qiov, pos);
4206 } else if (bs->file) {
4207 return bdrv_writev_vmstate(bs->file, qiov, pos);
4210 return -ENOTSUP;
4213 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4214 int64_t pos, int size)
4216 BlockDriver *drv = bs->drv;
4217 if (!drv)
4218 return -ENOMEDIUM;
4219 if (drv->bdrv_load_vmstate)
4220 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4221 if (bs->file)
4222 return bdrv_load_vmstate(bs->file, buf, pos, size);
4223 return -ENOTSUP;
4226 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4228 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4229 return;
4232 bs->drv->bdrv_debug_event(bs, event);
4235 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4236 const char *tag)
4238 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4239 bs = bs->file;
4242 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4243 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4246 return -ENOTSUP;
4249 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4251 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4252 bs = bs->file;
4255 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4256 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4259 return -ENOTSUP;
4262 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4264 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4265 bs = bs->file;
4268 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4269 return bs->drv->bdrv_debug_resume(bs, tag);
4272 return -ENOTSUP;
4275 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4277 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4278 bs = bs->file;
4281 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4282 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4285 return false;
4288 int bdrv_is_snapshot(BlockDriverState *bs)
4290 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4293 /* backing_file can be relative, absolute, or a protocol. If it is
4294 * relative, it must be relative to the chain. So, passing in bs->filename
4295 * from a BDS as backing_file should not be done, as that may be relative to
4296 * the CWD rather than the chain. */
4297 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4298 const char *backing_file)
4300 char *filename_full = NULL;
4301 char *backing_file_full = NULL;
4302 char *filename_tmp = NULL;
4303 int is_protocol = 0;
4304 BlockDriverState *curr_bs = NULL;
4305 BlockDriverState *retval = NULL;
4307 if (!bs || !bs->drv || !backing_file) {
4308 return NULL;
4311 filename_full = g_malloc(PATH_MAX);
4312 backing_file_full = g_malloc(PATH_MAX);
4313 filename_tmp = g_malloc(PATH_MAX);
4315 is_protocol = path_has_protocol(backing_file);
4317 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4319 /* If either of the filename paths is actually a protocol, then
4320 * compare unmodified paths; otherwise make paths relative */
4321 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4322 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4323 retval = curr_bs->backing_hd;
4324 break;
4326 } else {
4327 /* If not an absolute filename path, make it relative to the current
4328 * image's filename path */
4329 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4330 backing_file);
4332 /* We are going to compare absolute pathnames */
4333 if (!realpath(filename_tmp, filename_full)) {
4334 continue;
4337 /* We need to make sure the backing filename we are comparing against
4338 * is relative to the current image filename (or absolute) */
4339 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4340 curr_bs->backing_file);
4342 if (!realpath(filename_tmp, backing_file_full)) {
4343 continue;
4346 if (strcmp(backing_file_full, filename_full) == 0) {
4347 retval = curr_bs->backing_hd;
4348 break;
4353 g_free(filename_full);
4354 g_free(backing_file_full);
4355 g_free(filename_tmp);
4356 return retval;
4359 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4361 if (!bs->drv) {
4362 return 0;
4365 if (!bs->backing_hd) {
4366 return 0;
4369 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);

/**************************************************************/
/* async I/Os */

BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}
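
/*
 * Usage sketch (hypothetical caller): reading 8 sectors asynchronously.
 * 'done_cb' and 'buf' are illustrative; the buffer must stay valid until
 * the callback fires.  The coroutine-based implementation above always
 * allocates an ACB, so the return value is never NULL here.
 *
 *     static void done_cb(void *opaque, int ret)
 *     {
 *         if (ret < 0) {
 *             error_report("read failed: %s", strerror(-ret));
 *         }
 *     }
 *
 *     QEMUIOVector qiov;
 *     struct iovec iov = {
 *         .iov_base = buf,
 *         .iov_len = 8 * BDRV_SECTOR_SIZE,
 *     };
 *
 *     qemu_iovec_init_external(&qiov, &iov, 1);
 *     bdrv_aio_readv(bs, 0, &qiov, 8, done_cb, NULL);
 */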

typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
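
/*
 * Worked example (illustrative numbers): two sorted requests covering
 * sectors [0, 8) and [8, 16) are exactly sequential, so multiwrite_merge()
 * combines them into a single request covering [0, 16) whose qiov simply
 * concatenates the two original vectors.  A request starting at sector 9
 * instead would leave a one-sector hole after [0, 8) and therefore stay
 * unmerged, while one starting at sector 4 overlaps and the first four
 * sectors of the second request replace the tail of the first.
 */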

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. On error, this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that
 * the callback will be called for some of the requests, for others it won't.
 * The caller must check the error field of the BlockRequest to wait for the
 * right callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}
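
/*
 * Usage sketch (hypothetical caller): submitting two writes in one batch.
 * Fields not shown (flags, error) are assumed zero-initialized; each
 * request's cb fires once its (possibly merged) write completes.
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = done_cb, .opaque = NULL },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov2,
 *           .cb = done_cb, .opaque = NULL },
 *     };
 *
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         ... check reqs[i].error to see which callbacks will still fire ...
 *     }
 */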

void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size = sizeof(BlockDriverAIOCBSync),
    .cancel = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)
{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}

typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    bool *done;
    QEMUBH *bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    AioContext *aio_context = bdrv_get_aio_context(blockacb->bs);
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        aio_poll(aio_context, true);
    }
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
    .cancel = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_flush(bs, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}
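
/*
 * Sketch of the ACB lifecycle these two helpers implement ('MyAIOCB' and
 * 'my_cancel_fn' are hypothetical): a driver declares an AIOCBInfo for its
 * ACB type, allocates with qemu_aio_get() on submission, and frees with
 * qemu_aio_release() after the completion callback has run (or from its
 * ->cancel hook).
 *
 *     static const AIOCBInfo my_aiocb_info = {
 *         .aiocb_size = sizeof(MyAIOCB),
 *         .cancel = my_cancel_fn,
 *     };
 *
 *     MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);
 *     ... complete the request, invoke acb->common.cb ...
 *     qemu_aio_release(acb);
 */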

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}

void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
{
    Error *local_err = NULL;
    int ret;

    if (!bs->drv) {
        return;
    }

    if (bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs, &local_err);
    } else if (bs->file) {
        bdrv_invalidate_cache(bs->file, &local_err);
    }
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        return;
    }
}

void bdrv_invalidate_cache_all(Error **errp)
{
    BlockDriverState *bs;
    Error *local_err = NULL;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_invalidate_cache(bs, &local_err);
        aio_context_release(aio_context);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
        aio_context_release(aio_context);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard;

    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockDriverAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /* Pass the clamped 'num', not the full 'nb_sectors', so that
             * the AIO path also honours the alignment and size limits
             * computed above. */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}
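
/*
 * Usage sketch (hypothetical caller): discarding the first megabyte of the
 * device.  Sector math assumes the fixed 512-byte BDRV_SECTOR_SIZE used
 * throughout this file; bdrv_co_discard() re-splits the range internally
 * to honour bs->bl.max_discard and bs->bl.discard_alignment, and a driver
 * returning -ENOTSUP is silently treated as success.
 *
 *     int ret = bdrv_discard(bs, 0, (1024 * 1024) / BDRV_SECTOR_SIZE);
 *     if (ret < 0) {
 *         error_report("discard failed: %s", strerror(-ret));
 *     }
 */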

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know. Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
                                          eject_flag, &error_abort);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_opt_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}
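
/*
 * Usage sketch: building a vector that satisfies bdrv_qiov_is_aligned().
 * qemu_blockalign() returns memory aligned to bdrv_opt_mem_align(bs), so a
 * single iovec of a suitably aligned length passes the check (useful e.g.
 * when deciding whether a bounce buffer is needed for O_DIRECT files).
 *
 *     void *buf = qemu_blockalign(bs, 4096);
 *     QEMUIOVector qiov;
 *     struct iovec iov = { .iov_base = buf, .iov_len = 4096 };
 *
 *     qemu_iovec_init_external(&qiov, &iov, 1);
 *     assert(bdrv_qiov_is_aligned(bs, &qiov));
 */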

BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
                                          Error **errp)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = bdrv_getlength(bs);
    if (bitmap_size < 0) {
        error_setg_errno(errp, -bitmap_size, "could not get length of device");
        errno = -bitmap_size;
        return NULL;
    }
    bitmap_size >>= BDRV_SECTOR_BITS;
    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}

void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if (bm == bitmap) {
            QLIST_REMOVE(bitmap, list);
            hbitmap_free(bitmap->bitmap);
            g_free(bitmap);
            return;
        }
    }
}

BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BdrvDirtyBitmap *bm;
    BlockDirtyInfoList *list = NULL;
    BlockDirtyInfoList **plist = &list;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
        BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
        entry->value = info;
        *plist = entry;
        plist = &entry->next;
    }

    return list;
}

int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
{
    if (bitmap) {
        return hbitmap_get(bitmap->bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}
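
/*
 * Usage sketch (hypothetical caller, e.g. a mirroring job): track writes at
 * 64 KiB granularity, then walk the dirty sectors.
 *
 *     BdrvDirtyBitmap *bitmap = bdrv_create_dirty_bitmap(bs, 65536, NULL);
 *     HBitmapIter hbi;
 *     int64_t sector;
 *
 *     ... guest writes mark sectors via bdrv_set_dirty() ...
 *
 *     bdrv_dirty_iter_init(bs, bitmap, &hbi);
 *     while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
 *         ... copy out the data around 'sector' ...
 *         bdrv_reset_dirty(bs, sector, 1);
 *     }
 *     bdrv_release_dirty_bitmap(bs, bitmap);
 */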

/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}

/* Release a previously grabbed reference to bs.
 * If, after releasing, the reference count is zero, the BlockDriverState
 * is deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}

struct BdrvOpBlocker {
    Error *reason;
    QLIST_ENTRY(BdrvOpBlocker) list;
};

bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
    if (!QLIST_EMPTY(&bs->op_blockers[op])) {
        blocker = QLIST_FIRST(&bs->op_blockers[op]);
        if (errp) {
            error_setg(errp, "Device '%s' is busy: %s",
                       bs->device_name, error_get_pretty(blocker->reason));
        }
        return true;
    }
    return false;
}

void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);

    blocker = g_malloc0(sizeof(BdrvOpBlocker));
    blocker->reason = reason;
    QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
}

void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
{
    BdrvOpBlocker *blocker, *next;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
    QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
        if (blocker->reason == reason) {
            QLIST_REMOVE(blocker, list);
            g_free(blocker);
        }
    }
}

void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_block(bs, i, reason);
    }
}

void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_unblock(bs, i, reason);
    }
}

bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
{
    int i;

    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        if (!QLIST_EMPTY(&bs->op_blockers[i])) {
            return false;
        }
    }
    return true;
}
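
/*
 * Usage sketch: a block job protecting its device.  The same Error object
 * must be passed to bdrv_op_unblock()/bdrv_op_unblock_all(), since blockers
 * are matched by reason pointer, not by message text.
 *
 *     Error *blocker = NULL;
 *
 *     error_setg(&blocker, "block device is in use by a job");
 *     bdrv_op_block_all(bs, blocker);
 *     ... run the job ...
 *     bdrv_op_unblock_all(bs, blocker);
 *     error_free(blocker);
 */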

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
                     int64_t bytes, enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}

void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QemuOptsList *create_opts = NULL;
    QemuOpts *opts = NULL;
    const char *backing_fmt, *backing_file;
    int64_t size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_opts = qemu_opts_append(create_opts, drv->create_opts);
    create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);

    /* Create parameter list with default values */
    opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        if (qemu_opts_do_parse(opts, options, NULL) != 0) {
            error_setg(errp, "Invalid options for file format '%s'", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
    if (backing_file) {
        if (!strcmp(filename, backing_file)) {
            error_setg(errp, "Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt) {
        backing_drv = bdrv_find_format(backing_fmt);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
    if (size == -1) {
        if (backing_file) {
            BlockDriverState *bs;
            uint64_t size;
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = NULL;
            ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        qemu_opts_print(opts);
        puts("");
    }

    ret = bdrv_create(drv, filename, opts, &local_err);

    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    qemu_opts_del(opts);
    qemu_opts_free(create_opts);
    if (local_err) {
        error_propagate(errp, local_err);
    }
}
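
/*
 * Usage sketch (mirroring what qemu-img create does): make a 1 GiB qcow2
 * image; the flags value and quiet setting are illustrative.
 *
 *     Error *err = NULL;
 *
 *     bdrv_img_create("disk.qcow2", "qcow2", NULL, NULL, NULL,
 *                     1024 * 1024 * 1024, 0, &err, true);
 *     if (err) {
 *         error_report("%s", error_get_pretty(err));
 *         error_free(err);
 *     }
 */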

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    return bs->aio_context;
}

void bdrv_detach_aio_context(BlockDriverState *bs)
{
    if (!bs->drv) {
        return;
    }

    if (bs->io_limits_enabled) {
        throttle_detach_aio_context(&bs->throttle_state);
    }
    if (bs->drv->bdrv_detach_aio_context) {
        bs->drv->bdrv_detach_aio_context(bs);
    }
    if (bs->file) {
        bdrv_detach_aio_context(bs->file);
    }
    if (bs->backing_hd) {
        bdrv_detach_aio_context(bs->backing_hd);
    }

    bs->aio_context = NULL;
}

void bdrv_attach_aio_context(BlockDriverState *bs,
                             AioContext *new_context)
{
    if (!bs->drv) {
        return;
    }

    bs->aio_context = new_context;

    if (bs->backing_hd) {
        bdrv_attach_aio_context(bs->backing_hd, new_context);
    }
    if (bs->file) {
        bdrv_attach_aio_context(bs->file, new_context);
    }
    if (bs->drv->bdrv_attach_aio_context) {
        bs->drv->bdrv_attach_aio_context(bs, new_context);
    }
    if (bs->io_limits_enabled) {
        throttle_attach_aio_context(&bs->throttle_state, new_context);
    }
}

void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
{
    bdrv_drain_all(); /* ensure there are no in-flight requests */

    bdrv_detach_aio_context(bs);

    /* This function executes in the old AioContext so acquire the new one in
     * case it runs in a different thread.
     */
    aio_context_acquire(new_context);
    bdrv_attach_aio_context(bs, new_context);
    aio_context_release(new_context);
}
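
/*
 * Usage sketch (how dataplane-style code moves a device into an IOThread;
 * 'iothread' is a hypothetical IOThread object):
 *
 *     AioContext *ctx = iothread_get_aio_context(iothread);
 *
 *     bdrv_set_aio_context(bs, ctx);
 *
 * From then on, callers outside that context must wrap accesses to bs in
 * aio_context_acquire(ctx)/aio_context_release(ctx).
 */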

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts)
{
    if (!bs->drv->bdrv_amend_options) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, opts);
}

/* This function will be called by the bdrv_recurse_is_first_non_filter method
 * of block filters and by bdrv_is_first_non_filter.
 * It is used to test if the given bs is the candidate or to recurse further
 * into the node graph.
 */
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    /* return false if basic checks fail */
    if (!bs || !bs->drv) {
        return false;
    }

    /* the code reached a non block filter driver -> check if the bs is
     * the same as the candidate. It's the recursion termination condition.
     */
    if (!bs->drv->is_filter) {
        return bs == candidate;
    }
    /* Down this path the driver is a block filter driver */

    /* If the block filter recursion method is defined use it to recurse down
     * the node graph.
     */
    if (bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    /* the driver is a block filter but doesn't allow recursion -> return false
     */
    return false;
}

/* This function checks if the candidate is the first non filter bs down its
 * bs chain. Since we don't have pointers to parents it explores all bs chains
 * from the top. Some filters can choose not to pass down the recursion.
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        /* try to recurse in this top level bs */
        perm = bdrv_recurse_is_first_non_filter(bs, candidate);

        /* candidate is the first non filter */
        if (perm) {
            return true;
        }
    }

    return false;
}

BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
{
    BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
    if (!to_replace_bs) {
        error_setg(errp, "Node name '%s' not found", node_name);
        return NULL;
    }

    if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
        return NULL;
    }

    /* We don't want an arbitrary node of the BDS chain to be replaced; only
     * the topmost non-filter node may be, in order to prevent data corruption.
     * Another benefit is that this test excludes backing files, which are
     * blocked by the backing blockers.
     */
    if (!bdrv_is_first_non_filter(to_replace_bs)) {
        error_setg(errp, "Only top most non filter can be replaced");
        return NULL;
    }

    return to_replace_bs;
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file);
    }
}