block: Factor bdrv_probe_all() out of find_image_format()
[qemu/ar7.git] / block.c
blob aa7edf89117b04c3ffaab8204475340c8e5bc11f
1 /*
2 * QEMU System Emulator block driver
4 * Copyright (c) 2003 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "block/block_int.h"
28 #include "block/blockjob.h"
29 #include "qemu/module.h"
30 #include "qapi/qmp/qjson.h"
31 #include "sysemu/block-backend.h"
32 #include "sysemu/sysemu.h"
33 #include "qemu/notify.h"
34 #include "block/coroutine.h"
35 #include "block/qapi.h"
36 #include "qmp-commands.h"
37 #include "qemu/timer.h"
38 #include "qapi-event.h"
40 #ifdef CONFIG_BSD
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <sys/ioctl.h>
44 #include <sys/queue.h>
45 #ifndef __DragonFly__
46 #include <sys/disk.h>
47 #endif
48 #endif
50 #ifdef _WIN32
51 #include <windows.h>
52 #endif
54 struct BdrvDirtyBitmap {
55 HBitmap *bitmap;
56 QLIST_ENTRY(BdrvDirtyBitmap) list;
59 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
61 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
63 BlockCompletionFunc *cb, void *opaque);
64 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
66 BlockCompletionFunc *cb, void *opaque);
67 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68 int64_t sector_num, int nb_sectors,
69 QEMUIOVector *iov);
70 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
73 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
75 BdrvRequestFlags flags);
76 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
78 BdrvRequestFlags flags);
79 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80 int64_t sector_num,
81 QEMUIOVector *qiov,
82 int nb_sectors,
83 BdrvRequestFlags flags,
84 BlockCompletionFunc *cb,
85 void *opaque,
86 bool is_write);
87 static void coroutine_fn bdrv_co_do_rw(void *opaque);
88 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
89 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
91 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
94 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
97 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98 QLIST_HEAD_INITIALIZER(bdrv_drivers);
100 /* If non-zero, use only whitelisted block drivers */
101 static int use_bdrv_whitelist;
103 #ifdef _WIN32
104 static int is_windows_drive_prefix(const char *filename)
106 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
107 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
108 filename[1] == ':');
111 int is_windows_drive(const char *filename)
113 if (is_windows_drive_prefix(filename) &&
114 filename[2] == '\0')
115 return 1;
116 if (strstart(filename, "\\\\.\\", NULL) ||
117 strstart(filename, "//./", NULL))
118 return 1;
119 return 0;
121 #endif
123 /* throttling disk I/O limits */
124 void bdrv_set_io_limits(BlockDriverState *bs,
125 ThrottleConfig *cfg)
127 int i;
129 throttle_config(&bs->throttle_state, cfg);
131 for (i = 0; i < 2; i++) {
132 qemu_co_enter_next(&bs->throttled_reqs[i]);
136 /* this function drains all the throttled I/Os */
137 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
139 bool drained = false;
140 bool enabled = bs->io_limits_enabled;
141 int i;
143 bs->io_limits_enabled = false;
145 for (i = 0; i < 2; i++) {
146 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147 drained = true;
151 bs->io_limits_enabled = enabled;
153 return drained;
156 void bdrv_io_limits_disable(BlockDriverState *bs)
158 bs->io_limits_enabled = false;
160 bdrv_start_throttled_reqs(bs);
162 throttle_destroy(&bs->throttle_state);
165 static void bdrv_throttle_read_timer_cb(void *opaque)
167 BlockDriverState *bs = opaque;
168 qemu_co_enter_next(&bs->throttled_reqs[0]);
171 static void bdrv_throttle_write_timer_cb(void *opaque)
173 BlockDriverState *bs = opaque;
174 qemu_co_enter_next(&bs->throttled_reqs[1]);
177 /* should be called before bdrv_set_io_limits if a limit is set */
178 void bdrv_io_limits_enable(BlockDriverState *bs)
180 assert(!bs->io_limits_enabled);
181 throttle_init(&bs->throttle_state,
182 bdrv_get_aio_context(bs),
183 QEMU_CLOCK_VIRTUAL,
184 bdrv_throttle_read_timer_cb,
185 bdrv_throttle_write_timer_cb,
186 bs);
187 bs->io_limits_enabled = true;
190 /* This function makes an I/O request wait if needed
192 * @bytes: the size of the I/O in bytes
193 * @is_write: true if the I/O is a write
195 static void bdrv_io_limits_intercept(BlockDriverState *bs,
196 unsigned int bytes,
197 bool is_write)
199 /* does this I/O have to wait? */
200 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
202 /* if it must wait, or any request of this type is already throttled, queue the I/O */
203 if (must_wait ||
204 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
205 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
208 /* the I/O will be executed, do the accounting */
209 throttle_account(&bs->throttle_state, is_write, bytes);
212 /* if the next request must wait -> do nothing */
213 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
214 return;
217 /* else queue next request for execution */
218 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
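/*
 * Illustrative usage (a sketch; the actual read/write paths are not part
 * of this excerpt): a request path accounts its bytes before issuing I/O:
 *
 *     if (bs->io_limits_enabled) {
 *         bdrv_io_limits_intercept(bs, bytes, false);
 *     }
 */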
221 size_t bdrv_opt_mem_align(BlockDriverState *bs)
223 if (!bs || !bs->drv) {
224 /* 4k should be on the safe side */
225 return 4096;
228 return bs->bl.opt_mem_alignment;
231 /* check if the path starts with "<protocol>:" */
232 static int path_has_protocol(const char *path)
234 const char *p;
236 #ifdef _WIN32
237 if (is_windows_drive(path) ||
238 is_windows_drive_prefix(path)) {
239 return 0;
241 p = path + strcspn(path, ":/\\");
242 #else
243 p = path + strcspn(path, ":/");
244 #endif
246 return *p == ':';
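/*
 * Illustrative examples (derived from the logic above):
 *
 *     path_has_protocol("nbd:localhost:10809")  -> true  (':' before any '/')
 *     path_has_protocol("/tmp/a:b")             -> false ('/' comes first)
 *     path_has_protocol("c:\\disk.img")         -> false on Win32 (drive prefix)
 */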
249 int path_is_absolute(const char *path)
251 #ifdef _WIN32
252 /* specific case for names like: "\\.\d:" */
253 if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
254 return 1;
256 return (*path == '/' || *path == '\\');
257 #else
258 return (*path == '/');
259 #endif
262 /* if filename is absolute, just copy it to dest. Otherwise, build a
263 path to it by considering it relative to base_path. URLs are
264 supported. */
265 void path_combine(char *dest, int dest_size,
266 const char *base_path,
267 const char *filename)
269 const char *p, *p1;
270 int len;
272 if (dest_size <= 0)
273 return;
274 if (path_is_absolute(filename)) {
275 pstrcpy(dest, dest_size, filename);
276 } else {
277 p = strchr(base_path, ':');
278 if (p)
279 p++;
280 else
281 p = base_path;
282 p1 = strrchr(base_path, '/');
283 #ifdef _WIN32
285 const char *p2;
286 p2 = strrchr(base_path, '\\');
287 if (!p1 || p2 > p1)
288 p1 = p2;
290 #endif
291 if (p1)
292 p1++;
293 else
294 p1 = base_path;
295 if (p1 > p)
296 p = p1;
297 len = p - base_path;
298 if (len > dest_size - 1)
299 len = dest_size - 1;
300 memcpy(dest, base_path, len);
301 dest[len] = '\0';
302 pstrcat(dest, dest_size, filename);
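/*
 * Illustrative examples (assumed inputs):
 *
 *     path_combine(dest, sz, "/img/base.qcow2", "backing.qcow2")
 *         -> "/img/backing.qcow2"
 *     path_combine(dest, sz, "/img/base.qcow2", "/abs/backing.qcow2")
 *         -> "/abs/backing.qcow2"   (absolute names are copied verbatim)
 */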
306 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
308 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
309 pstrcpy(dest, sz, bs->backing_file);
310 } else {
311 path_combine(dest, sz, bs->filename, bs->backing_file);
315 void bdrv_register(BlockDriver *bdrv)
317 /* Block drivers without coroutine functions need emulation */
318 if (!bdrv->bdrv_co_readv) {
319 bdrv->bdrv_co_readv = bdrv_co_readv_em;
320 bdrv->bdrv_co_writev = bdrv_co_writev_em;
322 /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
323 * the block driver lacks aio we need to emulate that too.
325 if (!bdrv->bdrv_aio_readv) {
326 /* add AIO emulation layer */
327 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
328 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
332 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
335 BlockDriverState *bdrv_new_root(void)
337 BlockDriverState *bs = bdrv_new();
339 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
340 return bs;
343 BlockDriverState *bdrv_new(void)
345 BlockDriverState *bs;
346 int i;
348 bs = g_new0(BlockDriverState, 1);
349 QLIST_INIT(&bs->dirty_bitmaps);
350 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
351 QLIST_INIT(&bs->op_blockers[i]);
353 bdrv_iostatus_disable(bs);
354 notifier_list_init(&bs->close_notifiers);
355 notifier_with_return_list_init(&bs->before_write_notifiers);
356 qemu_co_queue_init(&bs->throttled_reqs[0]);
357 qemu_co_queue_init(&bs->throttled_reqs[1]);
358 bs->refcnt = 1;
359 bs->aio_context = qemu_get_aio_context();
361 return bs;
364 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
366 notifier_list_add(&bs->close_notifiers, notify);
369 BlockDriver *bdrv_find_format(const char *format_name)
371 BlockDriver *drv1;
372 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
373 if (!strcmp(drv1->format_name, format_name)) {
374 return drv1;
377 return NULL;
380 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
382 static const char *whitelist_rw[] = {
383 CONFIG_BDRV_RW_WHITELIST
385 static const char *whitelist_ro[] = {
386 CONFIG_BDRV_RO_WHITELIST
388 const char **p;
390 if (!whitelist_rw[0] && !whitelist_ro[0]) {
391 return 1; /* no whitelist, anything goes */
394 for (p = whitelist_rw; *p; p++) {
395 if (!strcmp(drv->format_name, *p)) {
396 return 1;
399 if (read_only) {
400 for (p = whitelist_ro; *p; p++) {
401 if (!strcmp(drv->format_name, *p)) {
402 return 1;
406 return 0;
409 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
410 bool read_only)
412 BlockDriver *drv = bdrv_find_format(format_name);
413 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
416 typedef struct CreateCo {
417 BlockDriver *drv;
418 char *filename;
419 QemuOpts *opts;
420 int ret;
421 Error *err;
422 } CreateCo;
424 static void coroutine_fn bdrv_create_co_entry(void *opaque)
426 Error *local_err = NULL;
427 int ret;
429 CreateCo *cco = opaque;
430 assert(cco->drv);
432 ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
433 if (local_err) {
434 error_propagate(&cco->err, local_err);
436 cco->ret = ret;
439 int bdrv_create(BlockDriver *drv, const char* filename,
440 QemuOpts *opts, Error **errp)
442 int ret;
444 Coroutine *co;
445 CreateCo cco = {
446 .drv = drv,
447 .filename = g_strdup(filename),
448 .opts = opts,
449 .ret = NOT_DONE,
450 .err = NULL,
453 if (!drv->bdrv_create) {
454 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
455 ret = -ENOTSUP;
456 goto out;
459 if (qemu_in_coroutine()) {
460 /* Fast-path if already in coroutine context */
461 bdrv_create_co_entry(&cco);
462 } else {
463 co = qemu_coroutine_create(bdrv_create_co_entry);
464 qemu_coroutine_enter(co, &cco);
465 while (cco.ret == NOT_DONE) {
466 aio_poll(qemu_get_aio_context(), true);
470 ret = cco.ret;
471 if (ret < 0) {
472 if (cco.err) {
473 error_propagate(errp, cco.err);
474 } else {
475 error_setg_errno(errp, -ret, "Could not create image");
479 out:
480 g_free(cco.filename);
481 return ret;
484 int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
486 BlockDriver *drv;
487 Error *local_err = NULL;
488 int ret;
490 drv = bdrv_find_protocol(filename, true);
491 if (drv == NULL) {
492 error_setg(errp, "Could not find protocol for file '%s'", filename);
493 return -ENOENT;
496 ret = bdrv_create(drv, filename, opts, &local_err);
497 if (local_err) {
498 error_propagate(errp, local_err);
500 return ret;
503 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
505 BlockDriver *drv = bs->drv;
506 Error *local_err = NULL;
508 memset(&bs->bl, 0, sizeof(bs->bl));
510 if (!drv) {
511 return;
514 /* Take some limits from the children as a default */
515 if (bs->file) {
516 bdrv_refresh_limits(bs->file, &local_err);
517 if (local_err) {
518 error_propagate(errp, local_err);
519 return;
521 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
522 bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
523 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
524 } else {
525 bs->bl.opt_mem_alignment = 512;
528 if (bs->backing_hd) {
529 bdrv_refresh_limits(bs->backing_hd, &local_err);
530 if (local_err) {
531 error_propagate(errp, local_err);
532 return;
534 bs->bl.opt_transfer_length =
535 MAX(bs->bl.opt_transfer_length,
536 bs->backing_hd->bl.opt_transfer_length);
537 bs->bl.max_transfer_length =
538 MIN_NON_ZERO(bs->bl.max_transfer_length,
539 bs->backing_hd->bl.max_transfer_length);
540 bs->bl.opt_mem_alignment =
541 MAX(bs->bl.opt_mem_alignment,
542 bs->backing_hd->bl.opt_mem_alignment);
545 /* Then let the driver override it */
546 if (drv->bdrv_refresh_limits) {
547 drv->bdrv_refresh_limits(bs, errp);
552 * Create a uniquely-named empty temporary file.
553 * Return 0 upon success, otherwise a negative errno value.
555 int get_tmp_filename(char *filename, int size)
557 #ifdef _WIN32
558 char temp_dir[MAX_PATH];
559 /* GetTempFileName requires that its output buffer (4th param)
560 have length MAX_PATH or greater. */
561 assert(size >= MAX_PATH);
562 return (GetTempPath(MAX_PATH, temp_dir)
563 && GetTempFileName(temp_dir, "qem", 0, filename)
564 ? 0 : -GetLastError());
565 #else
566 int fd;
567 const char *tmpdir;
568 tmpdir = getenv("TMPDIR");
569 if (!tmpdir) {
570 tmpdir = "/var/tmp";
572 if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
573 return -EOVERFLOW;
575 fd = mkstemp(filename);
576 if (fd < 0) {
577 return -errno;
579 if (close(fd) != 0) {
580 unlink(filename);
581 return -errno;
583 return 0;
584 #endif
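/*
 * Illustrative usage (a sketch; bdrv_append_temp_snapshot() below does
 * essentially this):
 *
 *     char *tmp_filename = g_malloc0(PATH_MAX + 1);
 *     ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
 */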
588 * Detect host devices. By convention, /dev/cdrom[N] is always
589 * recognized as a host CDROM.
591 static BlockDriver *find_hdev_driver(const char *filename)
593 int score_max = 0, score;
594 BlockDriver *drv = NULL, *d;
596 QLIST_FOREACH(d, &bdrv_drivers, list) {
597 if (d->bdrv_probe_device) {
598 score = d->bdrv_probe_device(filename);
599 if (score > score_max) {
600 score_max = score;
601 drv = d;
606 return drv;
609 BlockDriver *bdrv_find_protocol(const char *filename,
610 bool allow_protocol_prefix)
612 BlockDriver *drv1;
613 char protocol[128];
614 int len;
615 const char *p;
617 /* TODO Drivers without bdrv_file_open must be specified explicitly */
620 * XXX(hch): we really should not let host device detection
621 * override an explicit protocol specification, but moving this
622 * later breaks access to device names with colons in them.
623 * Thanks to the brain-dead persistent naming schemes on udev-
624 * based Linux systems those actually are quite common.
626 drv1 = find_hdev_driver(filename);
627 if (drv1) {
628 return drv1;
631 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
632 return bdrv_find_format("file");
635 p = strchr(filename, ':');
636 assert(p != NULL);
637 len = p - filename;
638 if (len > sizeof(protocol) - 1)
639 len = sizeof(protocol) - 1;
640 memcpy(protocol, filename, len);
641 protocol[len] = '\0';
642 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
643 if (drv1->protocol_name &&
644 !strcmp(drv1->protocol_name, protocol)) {
645 return drv1;
648 return NULL;
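/*
 * Illustrative examples (assuming the usual driver set is registered):
 *
 *     bdrv_find_protocol("nbd:localhost:10809", true)  -> the "nbd" driver
 *     bdrv_find_protocol("/tmp/test.qcow2", true)      -> the "file" driver
 */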
652 * Guess image format by probing its contents.
653 * This is not a good idea when your image is raw (CVE-2008-2004), but
654 * we do it anyway for backward compatibility.
656 * @buf contains the image's first @buf_size bytes.
657 * @buf_size is the buffer size in bytes (generally 2048, but can be smaller
658 * if the image file is smaller)
659 * @filename is its filename.
661 * For each block driver, call its bdrv_probe() method to get a
662 * probing score.
663 * Return the first block driver with the highest probing score.
665 static BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
666 const char *filename)
668 int score_max = 0, score;
669 BlockDriver *drv = NULL, *d;
671 QLIST_FOREACH(d, &bdrv_drivers, list) {
672 if (d->bdrv_probe) {
673 score = d->bdrv_probe(buf, buf_size, filename);
674 if (score > score_max) {
675 score_max = score;
676 drv = d;
681 return drv;
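/*
 * Illustrative usage (a sketch of what find_image_format() does below):
 *
 *     uint8_t buf[2048];
 *     int n = bdrv_pread(bs, 0, buf, sizeof(buf));
 *     BlockDriver *drv = (n < 0) ? NULL : bdrv_probe_all(buf, n, filename);
 */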
684 static int find_image_format(BlockDriverState *bs, const char *filename,
685 BlockDriver **pdrv, Error **errp)
687 BlockDriver *drv;
688 uint8_t buf[2048];
689 int ret = 0;
691 /* Return the raw BlockDriver * for scsi-generic devices or empty drives */
692 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
693 drv = bdrv_find_format("raw");
694 if (!drv) {
695 error_setg(errp, "Could not find raw image format");
696 ret = -ENOENT;
698 *pdrv = drv;
699 return ret;
702 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
703 if (ret < 0) {
704 error_setg_errno(errp, -ret, "Could not read image for determining its "
705 "format");
706 *pdrv = NULL;
707 return ret;
710 drv = bdrv_probe_all(buf, ret, filename);
711 if (!drv) {
712 error_setg(errp, "Could not determine image format: No compatible "
713 "driver found");
714 ret = -ENOENT;
716 *pdrv = drv;
717 return ret;
721 * Set the current 'total_sectors' value
722 * Return 0 on success, -errno on error.
724 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
726 BlockDriver *drv = bs->drv;
728 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
729 if (bs->sg)
730 return 0;
732 /* query actual device if possible, otherwise just trust the hint */
733 if (drv->bdrv_getlength) {
734 int64_t length = drv->bdrv_getlength(bs);
735 if (length < 0) {
736 return length;
738 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
741 bs->total_sectors = hint;
742 return 0;
746 * Set open flags for a given discard mode
748 * Return 0 on success, -1 if the discard mode was invalid.
750 int bdrv_parse_discard_flags(const char *mode, int *flags)
752 *flags &= ~BDRV_O_UNMAP;
754 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
755 /* do nothing */
756 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
757 *flags |= BDRV_O_UNMAP;
758 } else {
759 return -1;
762 return 0;
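/*
 * Illustrative mapping (derived from the checks above):
 *
 *     "off" / "ignore"  -> BDRV_O_UNMAP cleared
 *     "on"  / "unmap"   -> BDRV_O_UNMAP set
 *     anything else     -> -1 (invalid mode)
 */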
766 * Set open flags for a given cache mode
768 * Return 0 on success, -1 if the cache mode was invalid.
770 int bdrv_parse_cache_flags(const char *mode, int *flags)
772 *flags &= ~BDRV_O_CACHE_MASK;
774 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
775 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
776 } else if (!strcmp(mode, "directsync")) {
777 *flags |= BDRV_O_NOCACHE;
778 } else if (!strcmp(mode, "writeback")) {
779 *flags |= BDRV_O_CACHE_WB;
780 } else if (!strcmp(mode, "unsafe")) {
781 *flags |= BDRV_O_CACHE_WB;
782 *flags |= BDRV_O_NO_FLUSH;
783 } else if (!strcmp(mode, "writethrough")) {
784 /* this is the default */
785 } else {
786 return -1;
789 return 0;
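/*
 * Illustrative mapping (derived from the checks above):
 *
 *     "off" / "none"   -> BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *     "directsync"     -> BDRV_O_NOCACHE
 *     "writeback"      -> BDRV_O_CACHE_WB
 *     "unsafe"         -> BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 *     "writethrough"   -> no flags set (the default)
 */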
793 * The copy-on-read flag is actually a reference count so multiple users may
794 * use the feature without worrying about clobbering its previous state.
795 * Copy-on-read stays enabled until all users have called to disable it.
797 void bdrv_enable_copy_on_read(BlockDriverState *bs)
799 bs->copy_on_read++;
802 void bdrv_disable_copy_on_read(BlockDriverState *bs)
804 assert(bs->copy_on_read > 0);
805 bs->copy_on_read--;
809 * Returns the flags that a temporary snapshot should get, based on the
810 * originally requested flags (the originally requested image will have flags
811 * like a backing file)
813 static int bdrv_temp_snapshot_flags(int flags)
815 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
819 * Returns the flags that bs->file should get, based on the given flags for
820 * the parent BDS
822 static int bdrv_inherited_flags(int flags)
824 /* Enable protocol handling, disable format probing for bs->file */
825 flags |= BDRV_O_PROTOCOL;
827 /* Our block drivers take care to send flushes and respect unmap policy,
828 * so we can enable both unconditionally on lower layers. */
829 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
831 /* Clear flags that only apply to the top layer */
832 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
834 return flags;
838 * Returns the flags that bs->backing_hd should get, based on the given flags
839 * for the parent BDS
841 static int bdrv_backing_flags(int flags)
843 /* backing files are always opened read-only */
844 flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
846 /* snapshot=on is handled on the top layer */
847 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
849 return flags;
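/*
 * Illustrative example (assumed starting flags): for a top layer opened
 * with f = BDRV_O_RDWR | BDRV_O_SNAPSHOT, the helpers above yield:
 *
 *     bdrv_temp_snapshot_flags(f) -> BDRV_O_RDWR | BDRV_O_TEMPORARY
 *     bdrv_backing_flags(f)       -> 0 (RDWR and SNAPSHOT cleared)
 *     bdrv_inherited_flags(f)     -> BDRV_O_RDWR | BDRV_O_PROTOCOL |
 *                                    BDRV_O_CACHE_WB | BDRV_O_UNMAP
 */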
852 static int bdrv_open_flags(BlockDriverState *bs, int flags)
854 int open_flags = flags | BDRV_O_CACHE_WB;
857 * Clear flags that are internal to the block layer before opening the
858 * image.
860 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
863 * Snapshots should be writable.
865 if (flags & BDRV_O_TEMPORARY) {
866 open_flags |= BDRV_O_RDWR;
869 return open_flags;
872 static void bdrv_assign_node_name(BlockDriverState *bs,
873 const char *node_name,
874 Error **errp)
876 if (!node_name) {
877 return;
880 /* Check for empty string or invalid characters */
881 if (!id_wellformed(node_name)) {
882 error_setg(errp, "Invalid node name");
883 return;
886 /* takes care of avoiding namespace collisions */
887 if (blk_by_name(node_name)) {
888 error_setg(errp, "node-name=%s is conflicting with a device id",
889 node_name);
890 return;
893 /* takes care of avoiding duplicate node names */
894 if (bdrv_find_node(node_name)) {
895 error_setg(errp, "Duplicate node name");
896 return;
899 /* copy node name into the bs and insert it into the graph list */
900 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
901 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
905 * Common part for opening disk images and files
907 * Removes all processed options from *options.
909 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
910 QDict *options, int flags, BlockDriver *drv, Error **errp)
912 int ret, open_flags;
913 const char *filename;
914 const char *node_name = NULL;
915 Error *local_err = NULL;
917 assert(drv != NULL);
918 assert(bs->file == NULL);
919 assert(options != NULL && bs->options != options);
921 if (file != NULL) {
922 filename = file->filename;
923 } else {
924 filename = qdict_get_try_str(options, "filename");
927 if (drv->bdrv_needs_filename && !filename) {
928 error_setg(errp, "The '%s' block driver requires a file name",
929 drv->format_name);
930 return -EINVAL;
933 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
935 node_name = qdict_get_try_str(options, "node-name");
936 bdrv_assign_node_name(bs, node_name, &local_err);
937 if (local_err) {
938 error_propagate(errp, local_err);
939 return -EINVAL;
941 qdict_del(options, "node-name");
943 /* bdrv_open() was called directly with a protocol driver as drv. This layer
944 * is already opened, so assign it to bs (while file becomes a closed
945 * BlockDriverState) and return immediately. */
946 if (file != NULL && drv->bdrv_file_open) {
947 bdrv_swap(file, bs);
948 return 0;
951 bs->open_flags = flags;
952 bs->guest_block_size = 512;
953 bs->request_alignment = 512;
954 bs->zero_beyond_eof = true;
955 open_flags = bdrv_open_flags(bs, flags);
956 bs->read_only = !(open_flags & BDRV_O_RDWR);
957 bs->growable = !!(flags & BDRV_O_PROTOCOL);
959 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
960 error_setg(errp,
961 !bs->read_only && bdrv_is_whitelisted(drv, true)
962 ? "Driver '%s' can only be used for read-only devices"
963 : "Driver '%s' is not whitelisted",
964 drv->format_name);
965 return -ENOTSUP;
968 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
969 if (flags & BDRV_O_COPY_ON_READ) {
970 if (!bs->read_only) {
971 bdrv_enable_copy_on_read(bs);
972 } else {
973 error_setg(errp, "Can't use copy-on-read on read-only device");
974 return -EINVAL;
978 if (filename != NULL) {
979 pstrcpy(bs->filename, sizeof(bs->filename), filename);
980 } else {
981 bs->filename[0] = '\0';
983 pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
985 bs->drv = drv;
986 bs->opaque = g_malloc0(drv->instance_size);
988 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
990 /* Open the image, either directly or using a protocol */
991 if (drv->bdrv_file_open) {
992 assert(file == NULL);
993 assert(!drv->bdrv_needs_filename || filename != NULL);
994 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
995 } else {
996 if (file == NULL) {
997 error_setg(errp, "Can't use '%s' as a block driver for the "
998 "protocol level", drv->format_name);
999 ret = -EINVAL;
1000 goto free_and_fail;
1002 bs->file = file;
1003 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
1006 if (ret < 0) {
1007 if (local_err) {
1008 error_propagate(errp, local_err);
1009 } else if (bs->filename[0]) {
1010 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
1011 } else {
1012 error_setg_errno(errp, -ret, "Could not open image");
1014 goto free_and_fail;
1017 ret = refresh_total_sectors(bs, bs->total_sectors);
1018 if (ret < 0) {
1019 error_setg_errno(errp, -ret, "Could not refresh total sector count");
1020 goto free_and_fail;
1023 bdrv_refresh_limits(bs, &local_err);
1024 if (local_err) {
1025 error_propagate(errp, local_err);
1026 ret = -EINVAL;
1027 goto free_and_fail;
1030 assert(bdrv_opt_mem_align(bs) != 0);
1031 assert((bs->request_alignment != 0) || bs->sg);
1032 return 0;
1034 free_and_fail:
1035 bs->file = NULL;
1036 g_free(bs->opaque);
1037 bs->opaque = NULL;
1038 bs->drv = NULL;
1039 return ret;
1042 static QDict *parse_json_filename(const char *filename, Error **errp)
1044 QObject *options_obj;
1045 QDict *options;
1046 int ret;
1048 ret = strstart(filename, "json:", &filename);
1049 assert(ret);
1051 options_obj = qobject_from_json(filename);
1052 if (!options_obj) {
1053 error_setg(errp, "Could not parse the JSON options");
1054 return NULL;
1057 if (qobject_type(options_obj) != QTYPE_QDICT) {
1058 qobject_decref(options_obj);
1059 error_setg(errp, "Invalid JSON object given");
1060 return NULL;
1063 options = qobject_to_qdict(options_obj);
1064 qdict_flatten(options);
1066 return options;
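/*
 * Illustrative input (a sketch): a filename such as
 *
 *     json:{"driver": "qcow2",
 *           "file": {"driver": "file", "filename": "/tmp/test.qcow2"}}
 *
 * becomes a flattened QDict with the keys "driver", "file.driver" and
 * "file.filename".
 */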
1070 * Fills in default options for opening images and converts the legacy
1071 * filename/flags pair to option QDict entries.
1073 static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
1074 BlockDriver *drv, Error **errp)
1076 const char *filename = *pfilename;
1077 const char *drvname;
1078 bool protocol = flags & BDRV_O_PROTOCOL;
1079 bool parse_filename = false;
1080 Error *local_err = NULL;
1082 /* Parse json: pseudo-protocol */
1083 if (filename && g_str_has_prefix(filename, "json:")) {
1084 QDict *json_options = parse_json_filename(filename, &local_err);
1085 if (local_err) {
1086 error_propagate(errp, local_err);
1087 return -EINVAL;
1090 /* Options given in the filename have lower priority than options
1091 * specified directly */
1092 qdict_join(*options, json_options, false);
1093 QDECREF(json_options);
1094 *pfilename = filename = NULL;
1097 /* Fetch the file name from the options QDict if necessary */
1098 if (protocol && filename) {
1099 if (!qdict_haskey(*options, "filename")) {
1100 qdict_put(*options, "filename", qstring_from_str(filename));
1101 parse_filename = true;
1102 } else {
1103 error_setg(errp, "Can't specify 'file' and 'filename' options at "
1104 "the same time");
1105 return -EINVAL;
1109 /* Find the right block driver */
1110 filename = qdict_get_try_str(*options, "filename");
1111 drvname = qdict_get_try_str(*options, "driver");
1113 if (drv) {
1114 if (drvname) {
1115 error_setg(errp, "Driver specified twice");
1116 return -EINVAL;
1118 drvname = drv->format_name;
1119 qdict_put(*options, "driver", qstring_from_str(drvname));
1120 } else {
1121 if (!drvname && protocol) {
1122 if (filename) {
1123 drv = bdrv_find_protocol(filename, parse_filename);
1124 if (!drv) {
1125 error_setg(errp, "Unknown protocol");
1126 return -EINVAL;
1129 drvname = drv->format_name;
1130 qdict_put(*options, "driver", qstring_from_str(drvname));
1131 } else {
1132 error_setg(errp, "Must specify either driver or file");
1133 return -EINVAL;
1135 } else if (drvname) {
1136 drv = bdrv_find_format(drvname);
1137 if (!drv) {
1138 error_setg(errp, "Unknown driver '%s'", drvname);
1139 return -ENOENT;
1144 assert(drv || !protocol);
1146 /* Driver-specific filename parsing */
1147 if (drv && drv->bdrv_parse_filename && parse_filename) {
1148 drv->bdrv_parse_filename(filename, *options, &local_err);
1149 if (local_err) {
1150 error_propagate(errp, local_err);
1151 return -EINVAL;
1154 if (!drv->bdrv_needs_filename) {
1155 qdict_del(*options, "filename");
1159 return 0;
1162 void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1165 if (bs->backing_hd) {
1166 assert(bs->backing_blocker);
1167 bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1168 } else if (backing_hd) {
1169 error_setg(&bs->backing_blocker,
1170 "device is used as backing hd of '%s'",
1171 bdrv_get_device_name(bs));
1174 bs->backing_hd = backing_hd;
1175 if (!backing_hd) {
1176 error_free(bs->backing_blocker);
1177 bs->backing_blocker = NULL;
1178 goto out;
1180 bs->open_flags &= ~BDRV_O_NO_BACKING;
1181 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1182 pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1183 backing_hd->drv ? backing_hd->drv->format_name : "");
1185 bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1186 /* Otherwise we won't be able to commit due to check in bdrv_commit */
1187 bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
1188 bs->backing_blocker);
1189 out:
1190 bdrv_refresh_limits(bs, NULL);
1194 * Opens the backing file for a BlockDriverState if not yet open
1196 * options is a QDict of options to pass to the block drivers, or NULL for an
1197 * empty set of options. The reference to the QDict is transferred to this
1198 * function (even on failure), so if the caller intends to reuse the dictionary,
1199 * it needs to use QINCREF() before calling bdrv_open_backing_file().
1201 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1203 char *backing_filename = g_malloc0(PATH_MAX);
1204 int ret = 0;
1205 BlockDriver *back_drv = NULL;
1206 BlockDriverState *backing_hd;
1207 Error *local_err = NULL;
1209 if (bs->backing_hd != NULL) {
1210 QDECREF(options);
1211 goto free_exit;
1214 /* NULL means an empty set of options */
1215 if (options == NULL) {
1216 options = qdict_new();
1219 bs->open_flags &= ~BDRV_O_NO_BACKING;
1220 if (qdict_haskey(options, "file.filename")) {
1221 backing_filename[0] = '\0';
1222 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1223 QDECREF(options);
1224 goto free_exit;
1225 } else {
1226 bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
1229 if (!bs->drv || !bs->drv->supports_backing) {
1230 ret = -EINVAL;
1231 error_setg(errp, "Driver doesn't support backing files");
1232 QDECREF(options);
1233 goto free_exit;
1236 backing_hd = bdrv_new();
1238 if (bs->backing_format[0] != '\0') {
1239 back_drv = bdrv_find_format(bs->backing_format);
1242 assert(bs->backing_hd == NULL);
1243 ret = bdrv_open(&backing_hd,
1244 *backing_filename ? backing_filename : NULL, NULL, options,
1245 bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
1246 if (ret < 0) {
1247 bdrv_unref(backing_hd);
1248 backing_hd = NULL;
1249 bs->open_flags |= BDRV_O_NO_BACKING;
1250 error_setg(errp, "Could not open backing file: %s",
1251 error_get_pretty(local_err));
1252 error_free(local_err);
1253 goto free_exit;
1255 bdrv_set_backing_hd(bs, backing_hd);
1257 free_exit:
1258 g_free(backing_filename);
1259 return ret;
1263 * Opens a disk image whose options are given as BlockdevRef in another block
1264 * device's options.
1266 * If allow_none is true, no image will be opened if filename is NULL and no
1267 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1269 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
1270 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1271 * itself, all options starting with "${bdref_key}." are considered part of the
1272 * BlockdevRef.
1274 * The BlockdevRef will be removed from the options QDict.
1276 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
1278 int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1279 QDict *options, const char *bdref_key, int flags,
1280 bool allow_none, Error **errp)
1282 QDict *image_options;
1283 int ret;
1284 char *bdref_key_dot;
1285 const char *reference;
1287 assert(pbs);
1288 assert(*pbs == NULL);
1290 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1291 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1292 g_free(bdref_key_dot);
1294 reference = qdict_get_try_str(options, bdref_key);
1295 if (!filename && !reference && !qdict_size(image_options)) {
1296 if (allow_none) {
1297 ret = 0;
1298 } else {
1299 error_setg(errp, "A block device must be specified for \"%s\"",
1300 bdref_key);
1301 ret = -EINVAL;
1303 QDECREF(image_options);
1304 goto done;
1307 ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
1309 done:
1310 qdict_del(options, bdref_key);
1311 return ret;
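/*
 * Illustrative usage (a sketch; bdrv_open() below opens its protocol layer
 * this way):
 *
 *     ret = bdrv_open_image(&file, filename, options, "file",
 *                           bdrv_inherited_flags(flags), true, &local_err);
 */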
1314 int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
1316 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1317 char *tmp_filename = g_malloc0(PATH_MAX + 1);
1318 int64_t total_size;
1319 BlockDriver *bdrv_qcow2;
1320 QemuOpts *opts = NULL;
1321 QDict *snapshot_options;
1322 BlockDriverState *bs_snapshot;
1323 Error *local_err;
1324 int ret;
1326 /* if snapshot, we create a temporary backing file and open it
1327 instead of opening 'filename' directly */
1329 /* Get the required size from the image */
1330 total_size = bdrv_getlength(bs);
1331 if (total_size < 0) {
1332 ret = total_size;
1333 error_setg_errno(errp, -total_size, "Could not get image size");
1334 goto out;
1337 /* Create the temporary image */
1338 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
1339 if (ret < 0) {
1340 error_setg_errno(errp, -ret, "Could not get temporary filename");
1341 goto out;
1344 bdrv_qcow2 = bdrv_find_format("qcow2");
1345 opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
1346 &error_abort);
1347 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
1348 ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
1349 qemu_opts_del(opts);
1350 if (ret < 0) {
1351 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1352 "'%s': %s", tmp_filename,
1353 error_get_pretty(local_err));
1354 error_free(local_err);
1355 goto out;
1358 /* Prepare a new options QDict for the temporary file */
1359 snapshot_options = qdict_new();
1360 qdict_put(snapshot_options, "file.driver",
1361 qstring_from_str("file"));
1362 qdict_put(snapshot_options, "file.filename",
1363 qstring_from_str(tmp_filename));
1365 bs_snapshot = bdrv_new();
1367 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
1368 flags, bdrv_qcow2, &local_err);
1369 if (ret < 0) {
1370 error_propagate(errp, local_err);
1371 goto out;
1374 bdrv_append(bs_snapshot, bs);
1376 out:
1377 g_free(tmp_filename);
1378 return ret;
1382 * Opens a disk image (raw, qcow2, vmdk, ...)
1384 * options is a QDict of options to pass to the block drivers, or NULL for an
1385 * empty set of options. The reference to the QDict belongs to the block layer
1386 * after the call (even on failure), so if the caller intends to reuse the
1387 * dictionary, it needs to use QINCREF() before calling bdrv_open.
1389 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1390 * If it is not NULL, the referenced BDS will be reused.
1392 * The reference parameter may be used to specify an existing block device which
1393 * should be opened. If specified, neither options nor a filename may be given,
1394 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
1396 int bdrv_open(BlockDriverState **pbs, const char *filename,
1397 const char *reference, QDict *options, int flags,
1398 BlockDriver *drv, Error **errp)
1400 int ret;
1401 BlockDriverState *file = NULL, *bs;
1402 const char *drvname;
1403 Error *local_err = NULL;
1404 int snapshot_flags = 0;
1406 assert(pbs);
1408 if (reference) {
1409 bool options_non_empty = options ? qdict_size(options) : false;
1410 QDECREF(options);
1412 if (*pbs) {
1413 error_setg(errp, "Cannot reuse an existing BDS when referencing "
1414 "another block device");
1415 return -EINVAL;
1418 if (filename || options_non_empty) {
1419 error_setg(errp, "Cannot reference an existing block device with "
1420 "additional options or a new filename");
1421 return -EINVAL;
1424 bs = bdrv_lookup_bs(reference, reference, errp);
1425 if (!bs) {
1426 return -ENODEV;
1428 bdrv_ref(bs);
1429 *pbs = bs;
1430 return 0;
1433 if (*pbs) {
1434 bs = *pbs;
1435 } else {
1436 bs = bdrv_new();
1439 /* NULL means an empty set of options */
1440 if (options == NULL) {
1441 options = qdict_new();
1444 ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
1445 if (local_err) {
1446 goto fail;
1449 /* Find the right image format driver */
1450 drv = NULL;
1451 drvname = qdict_get_try_str(options, "driver");
1452 if (drvname) {
1453 drv = bdrv_find_format(drvname);
1454 qdict_del(options, "driver");
1455 if (!drv) {
1456 error_setg(errp, "Unknown driver: '%s'", drvname);
1457 ret = -EINVAL;
1458 goto fail;
1462 assert(drvname || !(flags & BDRV_O_PROTOCOL));
1463 if (drv && !drv->bdrv_file_open) {
1464 /* If the user explicitly wants a format driver here, we'll need to add
1465 * another layer for the protocol in bs->file */
1466 flags &= ~BDRV_O_PROTOCOL;
1469 bs->options = options;
1470 options = qdict_clone_shallow(options);
1472 /* Open image file without format layer */
1473 if ((flags & BDRV_O_PROTOCOL) == 0) {
1474 if (flags & BDRV_O_RDWR) {
1475 flags |= BDRV_O_ALLOW_RDWR;
1477 if (flags & BDRV_O_SNAPSHOT) {
1478 snapshot_flags = bdrv_temp_snapshot_flags(flags);
1479 flags = bdrv_backing_flags(flags);
1482 assert(file == NULL);
1483 ret = bdrv_open_image(&file, filename, options, "file",
1484 bdrv_inherited_flags(flags),
1485 true, &local_err);
1486 if (ret < 0) {
1487 goto fail;
1491 /* Image format probing */
1492 if (!drv && file) {
1493 ret = find_image_format(file, filename, &drv, &local_err);
1494 if (ret < 0) {
1495 goto fail;
1497 } else if (!drv) {
1498 error_setg(errp, "Must specify either driver or file");
1499 ret = -EINVAL;
1500 goto fail;
1503 /* Open the image */
1504 ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1505 if (ret < 0) {
1506 goto fail;
1509 if (file && (bs->file != file)) {
1510 bdrv_unref(file);
1511 file = NULL;
1514 /* If there is a backing file, use it */
1515 if ((flags & BDRV_O_NO_BACKING) == 0) {
1516 QDict *backing_options;
1518 qdict_extract_subqdict(options, &backing_options, "backing.");
1519 ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1520 if (ret < 0) {
1521 goto close_and_fail;
1525 bdrv_refresh_filename(bs);
1527 /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1528 * temporary snapshot afterwards. */
1529 if (snapshot_flags) {
1530 ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
1531 if (local_err) {
1532 goto close_and_fail;
1536 /* Check if any unknown options were used */
1537 if (options && (qdict_size(options) != 0)) {
1538 const QDictEntry *entry = qdict_first(options);
1539 if (flags & BDRV_O_PROTOCOL) {
1540 error_setg(errp, "Block protocol '%s' doesn't support the option "
1541 "'%s'", drv->format_name, entry->key);
1542 } else {
1543 error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1544 "support the option '%s'", drv->format_name,
1545 bdrv_get_device_name(bs), entry->key);
1548 ret = -EINVAL;
1549 goto close_and_fail;
1552 if (!bdrv_key_required(bs)) {
1553 if (bs->blk) {
1554 blk_dev_change_media_cb(bs->blk, true);
1556 } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1557 && !runstate_check(RUN_STATE_INMIGRATE)
1558 && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1559 error_setg(errp,
1560 "Guest must be stopped for opening of encrypted image");
1561 ret = -EBUSY;
1562 goto close_and_fail;
1565 QDECREF(options);
1566 *pbs = bs;
1567 return 0;
1569 fail:
1570 if (file != NULL) {
1571 bdrv_unref(file);
1573 QDECREF(bs->options);
1574 QDECREF(options);
1575 bs->options = NULL;
1576 if (!*pbs) {
1577 /* If *pbs is NULL, a new BDS has been created in this function and
1578 needs to be freed now. Otherwise, it does not need to be closed,
1579 since it has not really been opened yet. */
1580 bdrv_unref(bs);
1582 if (local_err) {
1583 error_propagate(errp, local_err);
1585 return ret;
1587 close_and_fail:
1588 /* See fail path, but now the BDS has to be always closed */
1589 if (*pbs) {
1590 bdrv_close(bs);
1591 } else {
1592 bdrv_unref(bs);
1594 QDECREF(options);
1595 if (local_err) {
1596 error_propagate(errp, local_err);
1598 return ret;
1601 typedef struct BlockReopenQueueEntry {
1602 bool prepared;
1603 BDRVReopenState state;
1604 QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1605 } BlockReopenQueueEntry;
1608 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1609 * reopen of multiple devices.
1611 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
1612 * has already been performed, or it may be NULL, in which case a new
1613 * BlockReopenQueue will be created and initialized. It should then be
1614 * passed back in for subsequent calls that are intended to be of the same
1615 * atomic 'set'.
1617 * bs is the BlockDriverState to add to the reopen queue.
1619 * flags contains the open flags for the associated bs
1621 * returns a pointer to bs_queue, which is either the newly allocated
1622 * bs_queue, or the existing bs_queue being used.
1625 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1626 BlockDriverState *bs, int flags)
1628 assert(bs != NULL);
1630 BlockReopenQueueEntry *bs_entry;
1631 if (bs_queue == NULL) {
1632 bs_queue = g_new0(BlockReopenQueue, 1);
1633 QSIMPLEQ_INIT(bs_queue);
1636 /* bdrv_open() masks this flag out */
1637 flags &= ~BDRV_O_PROTOCOL;
1639 if (bs->file) {
1640 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
1643 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1644 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1646 bs_entry->state.bs = bs;
1647 bs_entry->state.flags = flags;
1649 return bs_queue;
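/*
 * Illustrative usage (a sketch; bdrv_reopen() below does exactly this for
 * the single-BDS case):
 *
 *     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
 *     ret = bdrv_reopen_multiple(queue, &local_err);
 */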
1653 * Reopen multiple BlockDriverStates atomically & transactionally.
1655 * The queue passed in (bs_queue) must have been built up previously
1656 * via bdrv_reopen_queue().
1658 * Reopens all BDS specified in the queue, with the appropriate
1659 * flags. All devices are prepared for reopen, and failure of any
1660 * device will cause all device changes to be abandoned, and intermediate
1661 * data cleaned up.
1663 * If all devices prepare successfully, then the changes are committed
1664 * to all devices.
1667 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1669 int ret = -1;
1670 BlockReopenQueueEntry *bs_entry, *next;
1671 Error *local_err = NULL;
1673 assert(bs_queue != NULL);
1675 bdrv_drain_all();
1677 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1678 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1679 error_propagate(errp, local_err);
1680 goto cleanup;
1682 bs_entry->prepared = true;
1685 /* If we reach this point, we have success and just need to apply the
1686 * changes
1688 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1689 bdrv_reopen_commit(&bs_entry->state);
1692 ret = 0;
1694 cleanup:
1695 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1696 if (ret && bs_entry->prepared) {
1697 bdrv_reopen_abort(&bs_entry->state);
1699 g_free(bs_entry);
1701 g_free(bs_queue);
1702 return ret;
1706 /* Reopen a single BlockDriverState with the specified flags. */
1707 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1709 int ret = -1;
1710 Error *local_err = NULL;
1711 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1713 ret = bdrv_reopen_multiple(queue, &local_err);
1714 if (local_err != NULL) {
1715 error_propagate(errp, local_err);
1717 return ret;
1722 * Prepares a BlockDriverState for reopen. All changes are staged in the
1723 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1724 * the block driver layer's .bdrv_reopen_prepare()
1726 * bs is the BlockDriverState to reopen
1727 * flags are the new open flags
1728 * queue is the reopen queue
1730 * Returns 0 on success, non-zero on error. On error errp will be set
1731 * as well.
1733 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1734 * It is the responsibility of the caller to then call the abort() or
1735 * commit() for any other BDS that have been left in a prepare() state.
1738 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1739 Error **errp)
1741 int ret = -1;
1742 Error *local_err = NULL;
1743 BlockDriver *drv;
1745 assert(reopen_state != NULL);
1746 assert(reopen_state->bs->drv != NULL);
1747 drv = reopen_state->bs->drv;
1749 /* if we are to stay read-only, do not allow permission change
1750 * to r/w */
1751 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1752 reopen_state->flags & BDRV_O_RDWR) {
1753 error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1754 bdrv_get_device_name(reopen_state->bs));
1755 goto error;
1759 ret = bdrv_flush(reopen_state->bs);
1760 if (ret) {
1761 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1762 strerror(-ret));
1763 goto error;
1766 if (drv->bdrv_reopen_prepare) {
1767 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1768 if (ret) {
1769 if (local_err != NULL) {
1770 error_propagate(errp, local_err);
1771 } else {
1772 error_setg(errp, "failed while preparing to reopen image '%s'",
1773 reopen_state->bs->filename);
1775 goto error;
1777 } else {
1778 /* It is currently mandatory to have a bdrv_reopen_prepare()
1779 * handler for each supported drv. */
1780 error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1781 drv->format_name, bdrv_get_device_name(reopen_state->bs),
1782 "reopening of file");
1783 ret = -1;
1784 goto error;
1787 ret = 0;
1789 error:
1790 return ret;
1794 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1795 * makes them final by swapping the staging BlockDriverState contents into
1796 * the active BlockDriverState contents.
1798 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1800 BlockDriver *drv;
1802 assert(reopen_state != NULL);
1803 drv = reopen_state->bs->drv;
1804 assert(drv != NULL);
1806 /* If there are any driver level actions to take */
1807 if (drv->bdrv_reopen_commit) {
1808 drv->bdrv_reopen_commit(reopen_state);
1811 /* set BDS specific flags now */
1812 reopen_state->bs->open_flags = reopen_state->flags;
1813 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1814 BDRV_O_CACHE_WB);
1815 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1817 bdrv_refresh_limits(reopen_state->bs, NULL);
1821 * Abort the reopen, and delete and free the staged changes in
1822 * reopen_state
1824 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1826 BlockDriver *drv;
1828 assert(reopen_state != NULL);
1829 drv = reopen_state->bs->drv;
1830 assert(drv != NULL);
1832 if (drv->bdrv_reopen_abort) {
1833 drv->bdrv_reopen_abort(reopen_state);
1838 void bdrv_close(BlockDriverState *bs)
1840 BdrvAioNotifier *ban, *ban_next;
1842 if (bs->job) {
1843 block_job_cancel_sync(bs->job);
1845 bdrv_drain_all(); /* complete I/O */
1846 bdrv_flush(bs);
1847 bdrv_drain_all(); /* in case flush left pending I/O */
1848 notifier_list_notify(&bs->close_notifiers, bs);
1850 if (bs->drv) {
1851 if (bs->backing_hd) {
1852 BlockDriverState *backing_hd = bs->backing_hd;
1853 bdrv_set_backing_hd(bs, NULL);
1854 bdrv_unref(backing_hd);
1856 bs->drv->bdrv_close(bs);
1857 g_free(bs->opaque);
1858 bs->opaque = NULL;
1859 bs->drv = NULL;
1860 bs->copy_on_read = 0;
1861 bs->backing_file[0] = '\0';
1862 bs->backing_format[0] = '\0';
1863 bs->total_sectors = 0;
1864 bs->encrypted = 0;
1865 bs->valid_key = 0;
1866 bs->sg = 0;
1867 bs->growable = 0;
1868 bs->zero_beyond_eof = false;
1869 QDECREF(bs->options);
1870 bs->options = NULL;
1871 QDECREF(bs->full_open_options);
1872 bs->full_open_options = NULL;
1874 if (bs->file != NULL) {
1875 bdrv_unref(bs->file);
1876 bs->file = NULL;
1880 if (bs->blk) {
1881 blk_dev_change_media_cb(bs->blk, false);
1884 /* throttling disk I/O limits */
1885 if (bs->io_limits_enabled) {
1886 bdrv_io_limits_disable(bs);
1889 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1890 g_free(ban);
1892 QLIST_INIT(&bs->aio_notifiers);
1895 void bdrv_close_all(void)
1897 BlockDriverState *bs;
1899 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1900 AioContext *aio_context = bdrv_get_aio_context(bs);
1902 aio_context_acquire(aio_context);
1903 bdrv_close(bs);
1904 aio_context_release(aio_context);
1908 /* Check if any requests are in-flight (including throttled requests) */
1909 static bool bdrv_requests_pending(BlockDriverState *bs)
1911 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1912 return true;
1914 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1915 return true;
1917 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1918 return true;
1920 if (bs->file && bdrv_requests_pending(bs->file)) {
1921 return true;
1923 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1924 return true;
1926 return false;
1929 static bool bdrv_drain_one(BlockDriverState *bs)
1931 bool bs_busy;
1933 bdrv_flush_io_queue(bs);
1934 bdrv_start_throttled_reqs(bs);
1935 bs_busy = bdrv_requests_pending(bs);
1936 bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
1937 return bs_busy;
1941 * Wait for pending requests to complete on a single BlockDriverState subtree
1943 * See the warning in bdrv_drain_all(). This function can only be called if
1944 * you are sure nothing can generate I/O because you have op blockers
1945 * installed.
1947 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
1948 * AioContext.
1950 void bdrv_drain(BlockDriverState *bs)
1952 while (bdrv_drain_one(bs)) {
1953 /* Keep iterating */
1958 * Wait for pending requests to complete across all BlockDriverStates
1960 * This function does not flush data to disk, use bdrv_flush_all() for that
1961 * after calling this function.
1963 * Note that completion of an asynchronous I/O operation can trigger any
1964 * number of other I/O operations on other devices---for example a coroutine
1965 * can be arbitrarily complex and a constant flow of I/O can come until the
1966 * coroutine is complete. Because of this, it is not possible to have a
1967 * function to drain a single device's I/O queue.
1969 void bdrv_drain_all(void)
1971 /* Always run first iteration so any pending completion BHs run */
1972 bool busy = true;
1973 BlockDriverState *bs;
1975 while (busy) {
1976 busy = false;
1978 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1979 AioContext *aio_context = bdrv_get_aio_context(bs);
1981 aio_context_acquire(aio_context);
1982 busy |= bdrv_drain_one(bs);
1983 aio_context_release(aio_context);
1988 /* make a BlockDriverState anonymous by removing it from the bdrv_states and
1989 * graph_bdrv_states lists.
1990 * Also, NUL-terminate the node_name to prevent a double remove */
1991 void bdrv_make_anon(BlockDriverState *bs)
1994 * Take care to remove bs from bdrv_states only when it's actually
1995 * in it. Note that bs->device_list.tqe_prev is initially null,
1996 * and gets set to non-null by QTAILQ_INSERT_TAIL(). Establish
1997 * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
1998 * resetting it to null on remove.
2000 if (bs->device_list.tqe_prev) {
2001 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
2002 bs->device_list.tqe_prev = NULL;
2004 if (bs->node_name[0] != '\0') {
2005 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
2007 bs->node_name[0] = '\0';
2010 static void bdrv_rebind(BlockDriverState *bs)
2012 if (bs->drv && bs->drv->bdrv_rebind) {
2013 bs->drv->bdrv_rebind(bs);
2017 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
2018 BlockDriverState *bs_src)
2020 /* move some fields that need to stay attached to the device */
2022 /* dev info */
2023 bs_dest->guest_block_size = bs_src->guest_block_size;
2024 bs_dest->copy_on_read = bs_src->copy_on_read;
2026 bs_dest->enable_write_cache = bs_src->enable_write_cache;
2028 /* i/o throttled req */
2029 memcpy(&bs_dest->throttle_state,
2030 &bs_src->throttle_state,
2031 sizeof(ThrottleState));
2032 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
2033 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
2034 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
2036 /* r/w error */
2037 bs_dest->on_read_error = bs_src->on_read_error;
2038 bs_dest->on_write_error = bs_src->on_write_error;
2040 /* i/o status */
2041 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
2042 bs_dest->iostatus = bs_src->iostatus;
2044 /* dirty bitmap */
2045 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
2047 /* reference count */
2048 bs_dest->refcnt = bs_src->refcnt;
2050 /* job */
2051 bs_dest->job = bs_src->job;
2053 /* keep the same entry in bdrv_states */
2054 bs_dest->device_list = bs_src->device_list;
2055 bs_dest->blk = bs_src->blk;
2057 memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2058 sizeof(bs_dest->op_blockers));
2062 * Swap bs contents for two image chains while they are live,
2063 * while keeping required fields on the BlockDriverState that is
2064 * actually attached to a device.
2066 * This will modify the BlockDriverState fields, and swap contents
2067 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2069 * bs_new must not be attached to a BlockBackend.
2071 * This function does not create any image files.
2073 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2075 BlockDriverState tmp;
2077 /* The code needs to swap the node_name but simply swapping node_list won't
2078 * work, so first remove the nodes from the graph list, do the swap, then
2079 * insert them back if needed.
2081 if (bs_new->node_name[0] != '\0') {
2082 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2084 if (bs_old->node_name[0] != '\0') {
2085 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2088 /* bs_new must be unattached and shouldn't have anything fancy enabled */
2089 assert(!bs_new->blk);
2090 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2091 assert(bs_new->job == NULL);
2092 assert(bs_new->io_limits_enabled == false);
2093 assert(!throttle_have_timer(&bs_new->throttle_state));
2095 tmp = *bs_new;
2096 *bs_new = *bs_old;
2097 *bs_old = tmp;
2099 /* there are some fields that should not be swapped, move them back */
2100 bdrv_move_feature_fields(&tmp, bs_old);
2101 bdrv_move_feature_fields(bs_old, bs_new);
2102 bdrv_move_feature_fields(bs_new, &tmp);
2104 /* bs_new must remain unattached */
2105 assert(!bs_new->blk);
2107 /* Check a few fields that should remain attached to the device */
2108 assert(bs_new->job == NULL);
2109 assert(bs_new->io_limits_enabled == false);
2110 assert(!throttle_have_timer(&bs_new->throttle_state));
2112 /* insert the nodes back into the graph node list if needed */
2113 if (bs_new->node_name[0] != '\0') {
2114 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2116 if (bs_old->node_name[0] != '\0') {
2117 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2120 bdrv_rebind(bs_new);
2121 bdrv_rebind(bs_old);
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bdrv_set_backing_hd(bs_top, bs_new);
}
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->job);
    assert(bdrv_op_blocker_is_empty(bs));
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}
/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv == NULL) {
        return -ENOMEDIUM;
    }
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}
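/*
 * Illustrative sketch (not part of the original file): a typical caller
 * runs a report-only pass first and inspects the counters in 'res' before
 * deciding whether to request repairs; BDRV_FIX_LEAKS/BDRV_FIX_ERRORS are
 * the BdrvCheckMode bits that enable fixing.
 *
 *     BdrvCheckResult res;
 *     int ret = bdrv_check(bs, &res, 0);          // 0 = report only
 *     if (ret == 0 && (res.corruptions || res.leaks)) {
 *         ret = bdrv_check(bs, &res, BDRV_FIX_LEAKS | BDRV_FIX_ERRORS);
 *     }
 */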
#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    char filename[PATH_MAX];

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
        bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible. If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;

    /* qemu_try_blockalign() for bs will choose an alignment that works for
     * bs->backing_hd as well, so no need to compare the alignment manually. */
    buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
    if (buf == NULL) {
        ret = -ENOMEM;
        goto ro_cleanup;
    }

    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    qemu_vfree(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
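/*
 * Illustrative sketch: committing the active layer of the chain
 * "base.img <- top.qcow2" copies every range allocated in top.qcow2 into
 * base.img; afterwards top.qcow2 is emptied if the driver implements
 * bdrv_make_empty.  Error handling abbreviated.
 *
 *     int ret = bdrv_commit(bs);          // bs opened on top.qcow2
 *     if (ret == -ENOTSUP) {
 *         // no backing file, nothing to commit
 *     } else if (ret == -EBUSY) {
 *         // an op blocker (e.g. a running block job) forbids the commit
 *     }
 */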
int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                aio_context_release(aio_context);
                return ret;
            }
        }
        aio_context_release(aio_context);
    }
    return 0;
}
/*
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/*
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
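/*
 * Worked example (illustrative): with align = 4096, a request with
 * offset = 5120 and bytes = 1024 yields
 *
 *     overlap_offset = 5120 & ~4095                        = 4096
 *     overlap_bytes  = ROUND_UP(5120 + 1024, 4096) - 4096  = 4096
 *
 * i.e. the serialising window is widened to the whole containing 4 KiB
 * block, so any unaligned request touching the same block conflicts.
 */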
/*
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->request_alignment;
    } else {
        return bdi.cluster_size;
    }
}
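/*
 * Worked example (illustrative) for bdrv_round_to_clusters(): with a
 * 64 KiB cluster size, c = 128 sectors.  A request at sector_num = 100,
 * nb_sectors = 50 crosses the cluster boundary at sector 128, so
 *
 *     *cluster_sector_num = QEMU_ALIGN_DOWN(100, 128)        = 0
 *     *cluster_nb_sectors = QEMU_ALIGN_UP(100 - 0 + 50, 128) = 256
 *
 * covering both touched clusters in full.
 */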
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}
/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}
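/*
 * Illustrative sketch (hypothetical variable names): repointing the overlay
 * of a dropped image at its new backing file, mirroring what
 * bdrv_drop_intermediate() below does:
 *
 *     ret = bdrv_change_backing_file(overlay, base->filename,
 *                                    base->drv->format_name);
 *     // on success, overlay's on-disk header now names 'base'
 */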
/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 *
 * Returns the bottommost base image if bs == NULL.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    while (active && bs != active->backing_hd) {
        active = active->backing_hd;
    }

    return active;
}

/* Given a BDS, searches for the base layer. */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    return bdrv_find_overlay(bs, NULL);
}

typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;
/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * If backing_file_str is non-NULL, it will be used when modifying top's
 * overlay image metadata.
 *
 * Error conditions:
 *  if active == top, that is considered an error
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base, const char *backing_file_str)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_new0(BlkIntermediateStates, 1);
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
    ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    bdrv_set_backing_hd(new_top_bs, base_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        bdrv_set_backing_hd(intermediate_state->bs, NULL);
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (size > INT_MAX) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (bs->growable) {
        return 0;
    }

    len = bdrv_getlength(bs);

    if (offset < 0) {
        return -EIO;
    }

    if ((offset > len) || (len - offset < size)) {
        return -EIO;
    }

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
        return -EIO;
    }

    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}
typedef struct RwCo {
    BlockDriverState *bs;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
                                      rwco->qiov->size, rwco->qiov,
                                      rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
                                       rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    /*
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }
    return rwco.ret;
}
/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}
/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
    int n;

    target_sectors = bdrv_nb_sectors(bs);
    if (target_sectors < 0) {
        return target_sectors;
    }

    for (;;) {
        nb_sectors = target_sectors - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
            nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}
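/*
 * Illustrative sketch: zeroing out a whole device while letting the driver
 * discard clusters instead of writing explicit zeroes where the format
 * supports it:
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 */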
int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };
    int ret;

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return bytes;
}

int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}
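/*
 * Illustrative sketch (hypothetical offsets and names): format drivers use
 * bdrv_pwrite_sync() for ordering-sensitive metadata updates, where a later
 * write must not reach the disk before an earlier one:
 *
 *     ret = bdrv_pwrite_sync(bs->file, table_offset, table, table_size);
 *     if (ret < 0) {
 *         return ret;    // table unchanged on disk, safe to bail out
 *     }
 *     // only now is it safe to write data that the table refers to
 */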
2913 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2914 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2916 /* Perform I/O through a temporary buffer so that users who scribble over
2917 * their read buffer while the operation is in progress do not end up
2918 * modifying the image file. This is critical for zero-copy guest I/O
2919 * where anything might happen inside guest memory.
2921 void *bounce_buffer;
2923 BlockDriver *drv = bs->drv;
2924 struct iovec iov;
2925 QEMUIOVector bounce_qiov;
2926 int64_t cluster_sector_num;
2927 int cluster_nb_sectors;
2928 size_t skip_bytes;
2929 int ret;
2931 /* Cover entire cluster so no additional backing file I/O is required when
2932 * allocating cluster in the image file.
2934 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2935 &cluster_sector_num, &cluster_nb_sectors);
2937 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2938 cluster_sector_num, cluster_nb_sectors);
2940 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2941 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2942 if (bounce_buffer == NULL) {
2943 ret = -ENOMEM;
2944 goto err;
2947 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2949 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2950 &bounce_qiov);
2951 if (ret < 0) {
2952 goto err;
2955 if (drv->bdrv_co_write_zeroes &&
2956 buffer_is_zero(bounce_buffer, iov.iov_len)) {
2957 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2958 cluster_nb_sectors, 0);
2959 } else {
2960 /* This does not change the data on the disk, it is not necessary
2961 * to flush even in cache=writethrough mode.
2963 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2964 &bounce_qiov);
2967 if (ret < 0) {
2968 /* It might be okay to ignore write errors for guest requests. If this
2969 * is a deliberate copy-on-read then we don't want to ignore the error.
2970 * Simply report it in all cases.
2972 goto err;
2975 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2976 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2977 nb_sectors * BDRV_SECTOR_SIZE);
2979 err:
2980 qemu_vfree(bounce_buffer);
2981 return ret;
2985 * Forwards an already correctly aligned request to the BlockDriver. This
2986 * handles copy on read and zeroing after EOF; any other features must be
2987 * implemented by the caller.
2989 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2990 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2991 int64_t align, QEMUIOVector *qiov, int flags)
2993 BlockDriver *drv = bs->drv;
2994 int ret;
2996 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2997 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2999 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3000 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3001 assert(!qiov || bytes == qiov->size);
3003 /* Handle Copy on Read and associated serialisation */
3004 if (flags & BDRV_REQ_COPY_ON_READ) {
3005 /* If we touch the same cluster it counts as an overlap. This
3006 * guarantees that allocating writes will be serialized and not race
3007 * with each other for the same cluster. For example, in copy-on-read
3008 * it ensures that the CoR read and write operations are atomic and
3009 * guest writes cannot interleave between them. */
3010 mark_request_serialising(req, bdrv_get_cluster_size(bs));
3013 wait_serialising_requests(req);
3015 if (flags & BDRV_REQ_COPY_ON_READ) {
3016 int pnum;
3018 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3019 if (ret < 0) {
3020 goto out;
3023 if (!ret || pnum != nb_sectors) {
3024 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3025 goto out;
3029 /* Forward the request to the BlockDriver */
3030 if (!(bs->zero_beyond_eof && bs->growable)) {
3031 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3032 } else {
3033 /* Read zeros after EOF of growable BDSes */
3034 int64_t total_sectors, max_nb_sectors;
3036 total_sectors = bdrv_nb_sectors(bs);
3037 if (total_sectors < 0) {
3038 ret = total_sectors;
3039 goto out;
3042 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3043 align >> BDRV_SECTOR_BITS);
3044 if (max_nb_sectors > 0) {
3045 QEMUIOVector local_qiov;
3046 size_t local_sectors;
3048 max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_BITS);
3049 local_sectors = MIN(max_nb_sectors, nb_sectors);
3051 qemu_iovec_init(&local_qiov, qiov->niov);
3052 qemu_iovec_concat(&local_qiov, qiov, 0,
3053 local_sectors * BDRV_SECTOR_SIZE);
3055 ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3056 &local_qiov);
3058 qemu_iovec_destroy(&local_qiov);
3059 } else {
3060 ret = 0;
3063 /* Reading beyond end of file is supposed to produce zeroes */
3064 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3065 uint64_t offset = MAX(0, total_sectors - sector_num);
3066 uint64_t bytes = (sector_num + nb_sectors - offset) *
3067 BDRV_SECTOR_SIZE;
3068 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3072 out:
3073 return ret;
3077 * Handle a read request in coroutine context
3079 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3080 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3081 BdrvRequestFlags flags)
3083 BlockDriver *drv = bs->drv;
3084 BdrvTrackedRequest req;
3086 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3087 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3088 uint8_t *head_buf = NULL;
3089 uint8_t *tail_buf = NULL;
3090 QEMUIOVector local_qiov;
3091 bool use_local_qiov = false;
3092 int ret;
3094 if (!drv) {
3095 return -ENOMEDIUM;
3097 if (bdrv_check_byte_request(bs, offset, bytes)) {
3098 return -EIO;
3101 if (bs->copy_on_read) {
3102 flags |= BDRV_REQ_COPY_ON_READ;
3105 /* throttling disk I/O */
3106 if (bs->io_limits_enabled) {
3107 bdrv_io_limits_intercept(bs, bytes, false);
3110 /* Align read if necessary by padding qiov */
3111 if (offset & (align - 1)) {
3112 head_buf = qemu_blockalign(bs, align);
3113 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3114 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3115 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3116 use_local_qiov = true;
3118 bytes += offset & (align - 1);
3119 offset = offset & ~(align - 1);
3122 if ((offset + bytes) & (align - 1)) {
3123 if (!use_local_qiov) {
3124 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3125 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3126 use_local_qiov = true;
3128 tail_buf = qemu_blockalign(bs, align);
3129 qemu_iovec_add(&local_qiov, tail_buf,
3130 align - ((offset + bytes) & (align - 1)));
3132 bytes = ROUND_UP(bytes, align);
3135 tracked_request_begin(&req, bs, offset, bytes, false);
3136 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3137 use_local_qiov ? &local_qiov : qiov,
3138 flags);
3139 tracked_request_end(&req);
3141 if (use_local_qiov) {
3142 qemu_iovec_destroy(&local_qiov);
3143 qemu_vfree(head_buf);
3144 qemu_vfree(tail_buf);
3147 return ret;
3150 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3151 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3152 BdrvRequestFlags flags)
3154 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3155 return -EINVAL;
3158 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3159 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3162 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3163 int nb_sectors, QEMUIOVector *qiov)
3165 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3167 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3170 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3171 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3173 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3175 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3176 BDRV_REQ_COPY_ON_READ);
/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_WRITE_ZEROES_DEFAULT 32768

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector. */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_write_zeroes) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

fail:
    qemu_vfree(iov.iov_base);
    return ret;
}
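/*
 * Worked example (illustrative): with bs->bl.write_zeroes_alignment = 8
 * sectors, a request for sectors [5, 25) is issued in three pieces: a
 * 3-sector head (5..7) up to the first aligned sector, a 16-sector aligned
 * bulk (8..23), and a 1-sector tail (24), so the driver sees the bulk of
 * the request aligned.
 */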
3260 * Forwards an already correctly aligned write request to the BlockDriver.
3262 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3263 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3264 QEMUIOVector *qiov, int flags)
3266 BlockDriver *drv = bs->drv;
3267 bool waited;
3268 int ret;
3270 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3271 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3273 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3274 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3275 assert(!qiov || bytes == qiov->size);
3277 waited = wait_serialising_requests(req);
3278 assert(!waited || !req->serialising);
3279 assert(req->overlap_offset <= offset);
3280 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3282 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3284 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3285 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3286 qemu_iovec_is_zero(qiov)) {
3287 flags |= BDRV_REQ_ZERO_WRITE;
3288 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3289 flags |= BDRV_REQ_MAY_UNMAP;
3293 if (ret < 0) {
3294 /* Do nothing, write notifier decided to fail this request */
3295 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3296 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3297 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3298 } else {
3299 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3300 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3302 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3304 if (ret == 0 && !bs->enable_write_cache) {
3305 ret = bdrv_co_flush(bs);
3308 bdrv_set_dirty(bs, sector_num, nb_sectors);
3310 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3312 if (bs->growable && ret >= 0) {
3313 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3316 return ret;
3320 * Handle a write request in coroutine context
3322 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3323 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3324 BdrvRequestFlags flags)
3326 BdrvTrackedRequest req;
3327 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3328 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3329 uint8_t *head_buf = NULL;
3330 uint8_t *tail_buf = NULL;
3331 QEMUIOVector local_qiov;
3332 bool use_local_qiov = false;
3333 int ret;
3335 if (!bs->drv) {
3336 return -ENOMEDIUM;
3338 if (bs->read_only) {
3339 return -EACCES;
3341 if (bdrv_check_byte_request(bs, offset, bytes)) {
3342 return -EIO;
3345 /* throttling disk I/O */
3346 if (bs->io_limits_enabled) {
3347 bdrv_io_limits_intercept(bs, bytes, true);
3351 * Align write if necessary by performing a read-modify-write cycle.
3352 * Pad qiov with the read parts and be sure to have a tracked request not
3353 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3355 tracked_request_begin(&req, bs, offset, bytes, true);
3357 if (offset & (align - 1)) {
3358 QEMUIOVector head_qiov;
3359 struct iovec head_iov;
3361 mark_request_serialising(&req, align);
3362 wait_serialising_requests(&req);
3364 head_buf = qemu_blockalign(bs, align);
3365 head_iov = (struct iovec) {
3366 .iov_base = head_buf,
3367 .iov_len = align,
3369 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3371 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3372 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3373 align, &head_qiov, 0);
3374 if (ret < 0) {
3375 goto fail;
3377 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3379 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3380 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3381 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3382 use_local_qiov = true;
3384 bytes += offset & (align - 1);
3385 offset = offset & ~(align - 1);
3388 if ((offset + bytes) & (align - 1)) {
3389 QEMUIOVector tail_qiov;
3390 struct iovec tail_iov;
3391 size_t tail_bytes;
3392 bool waited;
3394 mark_request_serialising(&req, align);
3395 waited = wait_serialising_requests(&req);
3396 assert(!waited || !use_local_qiov);
3398 tail_buf = qemu_blockalign(bs, align);
3399 tail_iov = (struct iovec) {
3400 .iov_base = tail_buf,
3401 .iov_len = align,
3403 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3405 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3406 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3407 align, &tail_qiov, 0);
3408 if (ret < 0) {
3409 goto fail;
3411 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3413 if (!use_local_qiov) {
3414 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3415 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3416 use_local_qiov = true;
3419 tail_bytes = (offset + bytes) & (align - 1);
3420 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3422 bytes = ROUND_UP(bytes, align);
3425 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3426 use_local_qiov ? &local_qiov : qiov,
3427 flags);
3429 fail:
3430 tracked_request_end(&req);
3432 if (use_local_qiov) {
3433 qemu_iovec_destroy(&local_qiov);
3435 qemu_vfree(head_buf);
3436 qemu_vfree(tail_buf);
3438 return ret;
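/*
 * Worked example (illustrative): a 512-byte write at offset 512 to a device
 * with request_alignment = 4096 takes both RMW paths above: the head read
 * fetches bytes [0, 4096), 512 head bytes plus the 512 payload bytes are
 * queued, then the tail read fetches the same block and contributes the
 * remaining 3072 bytes, and a single aligned write (offset 0, bytes 4096)
 * goes to the driver.
 */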
3441 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3442 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3443 BdrvRequestFlags flags)
3445 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3446 return -EINVAL;
3449 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3450 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3453 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3454 int nb_sectors, QEMUIOVector *qiov)
3456 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3458 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3461 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3462 int64_t sector_num, int nb_sectors,
3463 BdrvRequestFlags flags)
3465 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3467 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3468 flags &= ~BDRV_REQ_MAY_UNMAP;
3471 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3472 BDRV_REQ_ZERO_WRITE | flags);
/*
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_truncate) {
        return -ENOTSUP;
    }
    if (bs->read_only) {
        return -EACCES;
    }

    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        if (bs->blk) {
            blk_dev_resize_cb(bs->blk);
        }
    }
    return ret;
}
/*
 * Length of an allocated file in bytes.  Sparse files are counted by actual
 * allocated space.  Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}
3519 * Return number of sectors on success, -errno on error.
3521 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3523 BlockDriver *drv = bs->drv;
3525 if (!drv)
3526 return -ENOMEDIUM;
3528 if (drv->has_variable_length) {
3529 int ret = refresh_total_sectors(bs, bs->total_sectors);
3530 if (ret < 0) {
3531 return ret;
3534 return bs->total_sectors;
3538 * Return length in bytes on success, -errno on error.
3539 * The length is always a multiple of BDRV_SECTOR_SIZE.
3541 int64_t bdrv_getlength(BlockDriverState *bs)
3543 int64_t ret = bdrv_nb_sectors(bs);
3545 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3548 /* return 0 as number of sectors if no device present or error */
3549 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3551 int64_t nb_sectors = bdrv_nb_sectors(bs);
3553 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3556 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3557 BlockdevOnError on_write_error)
3559 bs->on_read_error = on_read_error;
3560 bs->on_write_error = on_write_error;
3563 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3565 return is_read ? bs->on_read_error : bs->on_write_error;
3568 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3570 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3572 switch (on_err) {
3573 case BLOCKDEV_ON_ERROR_ENOSPC:
3574 return (error == ENOSPC) ?
3575 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3576 case BLOCKDEV_ON_ERROR_STOP:
3577 return BLOCK_ERROR_ACTION_STOP;
3578 case BLOCKDEV_ON_ERROR_REPORT:
3579 return BLOCK_ERROR_ACTION_REPORT;
3580 case BLOCKDEV_ON_ERROR_IGNORE:
3581 return BLOCK_ERROR_ACTION_IGNORE;
3582 default:
3583 abort();
3587 static void send_qmp_error_event(BlockDriverState *bs,
3588 BlockErrorAction action,
3589 bool is_read, int error)
3591 IoOperationType optype;
3593 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3594 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3595 bdrv_iostatus_is_enabled(bs),
3596 error == ENOSPC, strerror(error),
3597 &error_abort);
3600 /* This is done by device models because, while the block layer knows
3601 * about the error, it does not know whether an operation comes from
3602 * the device or the block layer (from a job, for example).
3604 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3605 bool is_read, int error)
3607 assert(error >= 0);
3609 if (action == BLOCK_ERROR_ACTION_STOP) {
3610 /* First set the iostatus, so that "info block" returns an iostatus
3611 * that matches the events raised so far (an additional error iostatus
3612 * is fine, but not a lost one).
3614 bdrv_iostatus_set_err(bs, error);
3616 /* Then raise the request to stop the VM and the event.
3617 * qemu_system_vmstop_request_prepare has two effects. First,
3618 * it ensures that the STOP event always comes after the
3619 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3620 * can observe the STOP event and do a "cont" before the STOP
3621 * event is issued, the VM will not stop. In this case, vm_start()
3622 * also ensures that the STOP/RESUME pair of events is emitted.
3624 qemu_system_vmstop_request_prepare();
3625 send_qmp_error_event(bs, action, is_read, error);
3626 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3627 } else {
3628 send_qmp_error_event(bs, action, is_read, error);
3632 int bdrv_is_read_only(BlockDriverState *bs)
3634 return bs->read_only;
3637 int bdrv_is_sg(BlockDriverState *bs)
3639 return bs->sg;
3642 int bdrv_enable_write_cache(BlockDriverState *bs)
3644 return bs->enable_write_cache;
3647 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3649 bs->enable_write_cache = wce;
3651 /* so a reopen() will preserve wce */
3652 if (wce) {
3653 bs->open_flags |= BDRV_O_CACHE_WB;
3654 } else {
3655 bs->open_flags &= ~BDRV_O_CACHE_WB;
3659 int bdrv_is_encrypted(BlockDriverState *bs)
3661 if (bs->backing_hd && bs->backing_hd->encrypted)
3662 return 1;
3663 return bs->encrypted;
3666 int bdrv_key_required(BlockDriverState *bs)
3668 BlockDriverState *backing_hd = bs->backing_hd;
3670 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3671 return 1;
3672 return (bs->encrypted && !bs->valid_key);
3675 int bdrv_set_key(BlockDriverState *bs, const char *key)
3677 int ret;
3678 if (bs->backing_hd && bs->backing_hd->encrypted) {
3679 ret = bdrv_set_key(bs->backing_hd, key);
3680 if (ret < 0)
3681 return ret;
3682 if (!bs->encrypted)
3683 return 0;
3685 if (!bs->encrypted) {
3686 return -EINVAL;
3687 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3688 return -ENOMEDIUM;
3690 ret = bs->drv->bdrv_set_key(bs, key);
3691 if (ret < 0) {
3692 bs->valid_key = 0;
3693 } else if (!bs->valid_key) {
3694 bs->valid_key = 1;
3695 if (bs->blk) {
3696 /* call the change callback now, we skipped it on open */
3697 blk_dev_change_media_cb(bs->blk, true);
3700 return ret;
3703 const char *bdrv_get_format_name(BlockDriverState *bs)
3705 return bs->drv ? bs->drv->format_name : NULL;
3708 static int qsort_strcmp(const void *a, const void *b)
3710 return strcmp(a, b);
3713 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3714 void *opaque)
3716 BlockDriver *drv;
3717 int count = 0;
3718 int i;
3719 const char **formats = NULL;
3721 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3722 if (drv->format_name) {
3723 bool found = false;
3724 int i = count;
3725 while (formats && i && !found) {
3726 found = !strcmp(formats[--i], drv->format_name);
3729 if (!found) {
3730 formats = g_renew(const char *, formats, count + 1);
3731 formats[count++] = drv->format_name;
3736 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3738 for (i = 0; i < count; i++) {
3739 it(opaque, formats[i]);
3742 g_free(formats);
/* This function finds a block backend by name and returns its bs */
/* TODO convert callers to blk_by_name(), then remove */
BlockDriverState *bdrv_find(const char *name)
{
    BlockBackend *blk = blk_by_name(name);

    return blk ? blk_bs(blk) : NULL;
}

/* This function finds a node by name in the bs graph */
BlockDriverState *bdrv_find_node(const char *node_name)
{
    BlockDriverState *bs;

    assert(node_name);

    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        if (!strcmp(node_name, bs->node_name)) {
            return bs;
        }
    }
    return NULL;
}
3769 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3770 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3772 BlockDeviceInfoList *list, *entry;
3773 BlockDriverState *bs;
3775 list = NULL;
3776 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3777 entry = g_malloc0(sizeof(*entry));
3778 entry->value = bdrv_block_device_info(bs);
3779 entry->next = list;
3780 list = entry;
3783 return list;
3786 BlockDriverState *bdrv_lookup_bs(const char *device,
3787 const char *node_name,
3788 Error **errp)
3790 BlockBackend *blk;
3791 BlockDriverState *bs;
3793 if (device) {
3794 blk = blk_by_name(device);
3796 if (blk) {
3797 return blk_bs(blk);
3801 if (node_name) {
3802 bs = bdrv_find_node(node_name);
3804 if (bs) {
3805 return bs;
3809 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3810 device ? device : "",
3811 node_name ? node_name : "");
3812 return NULL;
3815 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3816 * return false. If either argument is NULL, return false. */
3817 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3819 while (top && top != base) {
3820 top = top->backing_hd;
3823 return top != NULL;
3826 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3828 if (!bs) {
3829 return QTAILQ_FIRST(&graph_bdrv_states);
3831 return QTAILQ_NEXT(bs, node_list);
3834 BlockDriverState *bdrv_next(BlockDriverState *bs)
3836 if (!bs) {
3837 return QTAILQ_FIRST(&bdrv_states);
3839 return QTAILQ_NEXT(bs, device_list);
3842 const char *bdrv_get_node_name(const BlockDriverState *bs)
3844 return bs->node_name;
3847 /* TODO check what callers really want: bs->node_name or blk_name() */
3848 const char *bdrv_get_device_name(const BlockDriverState *bs)
3850 return bs->blk ? blk_name(bs->blk) : "";
3853 int bdrv_get_flags(BlockDriverState *bs)
3855 return bs->open_flags;
3858 int bdrv_flush_all(void)
3860 BlockDriverState *bs;
3861 int result = 0;
3863 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3864 AioContext *aio_context = bdrv_get_aio_context(bs);
3865 int ret;
3867 aio_context_acquire(aio_context);
3868 ret = bdrv_flush(bs);
3869 if (ret < 0 && !result) {
3870 result = ret;
3872 aio_context_release(aio_context);
3875 return result;
3878 int bdrv_has_zero_init_1(BlockDriverState *bs)
3880 return 1;
3883 int bdrv_has_zero_init(BlockDriverState *bs)
3885 assert(bs->drv);
3887 /* If BS is a copy on write image, it is initialized to
3888 the contents of the base image, which may not be zeroes. */
3889 if (bs->backing_hd) {
3890 return 0;
3892 if (bs->drv->bdrv_has_zero_init) {
3893 return bs->drv->bdrv_has_zero_init(bs);
3896 /* safe default */
3897 return 0;
3900 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3902 BlockDriverInfo bdi;
3904 if (bs->backing_hd) {
3905 return false;
3908 if (bdrv_get_info(bs, &bdi) == 0) {
3909 return bdi.unallocated_blocks_are_zero;
3912 return false;
3915 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3917 BlockDriverInfo bdi;
3919 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3920 return false;
3923 if (bdrv_get_info(bs, &bdi) == 0) {
3924 return bdi.can_write_zeroes_with_unmap;
3927 return false;
3930 typedef struct BdrvCoGetBlockStatusData {
3931 BlockDriverState *bs;
3932 BlockDriverState *base;
3933 int64_t sector_num;
3934 int nb_sectors;
3935 int *pnum;
3936 int64_t ret;
3937 bool done;
3938 } BdrvCoGetBlockStatusData;
/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
3955 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3956 int64_t sector_num,
3957 int nb_sectors, int *pnum)
3959 int64_t total_sectors;
3960 int64_t n;
3961 int64_t ret, ret2;
3963 total_sectors = bdrv_nb_sectors(bs);
3964 if (total_sectors < 0) {
3965 return total_sectors;
3968 if (sector_num >= total_sectors) {
3969 *pnum = 0;
3970 return 0;
3973 n = total_sectors - sector_num;
3974 if (n < nb_sectors) {
3975 nb_sectors = n;
3978 if (!bs->drv->bdrv_co_get_block_status) {
3979 *pnum = nb_sectors;
3980 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3981 if (bs->drv->protocol_name) {
3982 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3984 return ret;
3987 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3988 if (ret < 0) {
3989 *pnum = 0;
3990 return ret;
3993 if (ret & BDRV_BLOCK_RAW) {
3994 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3995 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3996 *pnum, pnum);
3999 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4000 ret |= BDRV_BLOCK_ALLOCATED;
4003 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4004 if (bdrv_unallocated_blocks_are_zero(bs)) {
4005 ret |= BDRV_BLOCK_ZERO;
4006 } else if (bs->backing_hd) {
4007 BlockDriverState *bs2 = bs->backing_hd;
4008 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4009 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4010 ret |= BDRV_BLOCK_ZERO;
4015 if (bs->file &&
4016 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4017 (ret & BDRV_BLOCK_OFFSET_VALID)) {
4018 int file_pnum;
4020 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4021 *pnum, &file_pnum);
4022 if (ret2 >= 0) {
4023 /* Ignore errors. This is just providing extra information, it
4024 * is useful but not necessary.
4026 if (!file_pnum) {
4027 /* !file_pnum indicates an offset at or beyond the EOF; it is
4028 * perfectly valid for the format block driver to point to such
4029 * offsets, so catch it and mark everything as zero */
4030 ret |= BDRV_BLOCK_ZERO;
4031 } else {
4032 /* Limit request to the range reported by the protocol driver */
4033 *pnum = file_pnum;
4034 ret |= (ret2 & BDRV_BLOCK_ZERO);
4039 return ret;
4042 /* Coroutine wrapper for bdrv_get_block_status() */
4043 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4045 BdrvCoGetBlockStatusData *data = opaque;
4046 BlockDriverState *bs = data->bs;
4048 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4049 data->pnum);
4050 data->done = true;
4054 * Synchronous wrapper around bdrv_co_get_block_status().
4056 * See bdrv_co_get_block_status() for details.
4058 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4059 int nb_sectors, int *pnum)
4061 Coroutine *co;
4062 BdrvCoGetBlockStatusData data = {
4063 .bs = bs,
4064 .sector_num = sector_num,
4065 .nb_sectors = nb_sectors,
4066 .pnum = pnum,
4067 .done = false,
4070 if (qemu_in_coroutine()) {
4071 /* Fast-path if already in coroutine context */
4072 bdrv_get_block_status_co_entry(&data);
4073 } else {
4074 AioContext *aio_context = bdrv_get_aio_context(bs);
4076 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4077 qemu_coroutine_enter(co, &data);
4078 while (!data.done) {
4079 aio_poll(aio_context, true);
4082 return data.ret;
int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }
    return !!(ret & BDRV_BLOCK_ALLOCATED);
}
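/*
 * Illustrative sketch: distinguishing "allocated in this layer" from
 * "reads as zero" for a range, using the flag bits documented above:
 *
 *     int pnum;
 *     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &pnum);
 *     if (ret >= 0 && (ret & BDRV_BLOCK_ZERO)) {
 *         // the first pnum sectors read as zeroes
 *     }
 *     if (ret >= 0 && (ret & BDRV_BLOCK_ALLOCATED)) {
 *         // data comes from this layer, not from a backing file
 *     }
 */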
4096 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4098 * Return true if the given sector is allocated in any image between
4099 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4100 * sector is allocated in any image of the chain. Return false otherwise.
4102 * 'pnum' is set to the number of sectors (including and immediately following
4103 * the specified sector) that are known to be in the same
4104 * allocated/unallocated state.
4107 int bdrv_is_allocated_above(BlockDriverState *top,
4108 BlockDriverState *base,
4109 int64_t sector_num,
4110 int nb_sectors, int *pnum)
4112 BlockDriverState *intermediate;
4113 int ret, n = nb_sectors;
4115 intermediate = top;
4116 while (intermediate && intermediate != base) {
4117 int pnum_inter;
4118 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4119 &pnum_inter);
4120 if (ret < 0) {
4121 return ret;
4122 } else if (ret) {
4123 *pnum = pnum_inter;
4124 return 1;
4128 * [sector_num, nb_sectors] is unallocated on top but intermediate
4129 * might have
4131 * [sector_num+x, nr_sectors] allocated.
4133 if (n > pnum_inter &&
4134 (intermediate == top ||
4135 sector_num + pnum_inter < intermediate->total_sectors)) {
4136 n = pnum_inter;
4139 intermediate = intermediate->backing_hd;
4142 *pnum = n;
4143 return 0;
4146 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4148 if (bs->backing_hd && bs->backing_hd->encrypted)
4149 return bs->backing_file;
4150 else if (bs->encrypted)
4151 return bs->filename;
4152 else
4153 return NULL;
4156 void bdrv_get_backing_filename(BlockDriverState *bs,
4157 char *filename, int filename_size)
4159 pstrcpy(filename, filename_size, bs->backing_file);
4162 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4163 const uint8_t *buf, int nb_sectors)
4165 BlockDriver *drv = bs->drv;
4166 if (!drv)
4167 return -ENOMEDIUM;
4168 if (!drv->bdrv_write_compressed)
4169 return -ENOTSUP;
4170 if (bdrv_check_request(bs, sector_num, nb_sectors))
4171 return -EIO;
4173 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4175 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4178 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4180 BlockDriver *drv = bs->drv;
4181 if (!drv)
4182 return -ENOMEDIUM;
4183 if (!drv->bdrv_get_info)
4184 return -ENOTSUP;
4185 memset(bdi, 0, sizeof(*bdi));
4186 return drv->bdrv_get_info(bs, bdi);
4189 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4191 BlockDriver *drv = bs->drv;
4192 if (drv && drv->bdrv_get_specific_info) {
4193 return drv->bdrv_get_specific_info(bs);
4195 return NULL;
4198 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4199 int64_t pos, int size)
4201 QEMUIOVector qiov;
4202 struct iovec iov = {
4203 .iov_base = (void *) buf,
4204 .iov_len = size,
4207 qemu_iovec_init_external(&qiov, &iov, 1);
4208 return bdrv_writev_vmstate(bs, &qiov, pos);
4211 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4213 BlockDriver *drv = bs->drv;
4215 if (!drv) {
4216 return -ENOMEDIUM;
4217 } else if (drv->bdrv_save_vmstate) {
4218 return drv->bdrv_save_vmstate(bs, qiov, pos);
4219 } else if (bs->file) {
4220 return bdrv_writev_vmstate(bs->file, qiov, pos);
4223 return -ENOTSUP;
4226 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4227 int64_t pos, int size)
4229 BlockDriver *drv = bs->drv;
4230 if (!drv)
4231 return -ENOMEDIUM;
4232 if (drv->bdrv_load_vmstate)
4233 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4234 if (bs->file)
4235 return bdrv_load_vmstate(bs->file, buf, pos, size);
4236 return -ENOTSUP;
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
                                          const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the
             * current image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing
             * against is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}
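/* Return the number of backing images below bs in the chain: 0 if bs has no
 * medium or no backing file, otherwise 1 plus the depth of its backing_hd. */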
int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

/**************************************************************/
/* async I/Os */

BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                           QEMUIOVector *qiov, int nb_sectors,
                           BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                            QEMUIOVector *qiov, int nb_sectors,
                            BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}
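/* Completion bookkeeping for bdrv_aio_multiwrite(): one MultiwriteCB tracks
 * all submitted requests, and the per-request user callbacks are only invoked
 * once every merged request has completed (see multiwrite_cb() below). */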
typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
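/* For example (illustrative numbers): two writes covering sectors [0, 8) and
 * [4, 12) sort adjacent, overlap, and are merged into a single request for
 * [0, 12): the first 4 sectors come from the first request's qiov, the rest
 * from the second one's, and the merged qiov is recorded in free_qiov so
 * multiwrite_user_cb() can free it after completion. */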
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine
    // them (only exactly sequential or overlapping requests are merged).
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                            reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
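/* A minimal caller sketch (hypothetical; "qiov1", "qiov2", "my_cb" and the
 * error handling are illustrative only):
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = qiov1, .cb = my_cb },
 *         { .sector = 8, .nb_sectors = 8, .qiov = qiov2, .cb = my_cb },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // check reqs[i].error to see which callbacks will still fire
 *     }
 */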
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}
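/* Cancel an AIO request synchronously: take a reference so the ACB stays
 * valid, request asynchronous cancellation, then poll the request's
 * AioContext until all other references are gone, i.e. the request has
 * completed or was cancelled. */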
void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request complete normally.
 * In either case the completion callback must be called. */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockAIOCBSync {
    BlockAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockAIOCBSync;

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBSync),
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockAIOCBSync *acb = opaque;

    if (!acb->is_write && acb->ret >= 0) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_unref(acb);
}
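/* Emulate AIO on top of a driver's synchronous bdrv_read/bdrv_write: the
 * request is executed immediately via a bounce buffer, and completion is
 * merely deferred to a bottom half so the callback still runs asynchronously
 * from the caller's point of view. */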
static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                      int64_t sector_num,
                                      QEMUIOVector *qiov,
                                      int nb_sectors,
                                      BlockCompletionFunc *cb,
                                      void *opaque,
                                      int is_write)
{
    BlockAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_try_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (acb->bounce == NULL) {
        acb->ret = -ENOMEM;
    } else if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
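/* The converse emulation: implement AIO in terms of the coroutine request
 * path. bdrv_co_do_rw() runs the request in a newly created coroutine and
 * reports completion back through a bottom half (bdrv_co_em_bh). */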
typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;
    BlockRequest req;
    bool is_write;
    bool *done;
    QEMUBH *bh;
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBCoroutine),
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    qemu_bh_delete(acb->bh);
    qemu_aio_unref(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
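/* Allocate a new AIOCB of the size given by aiocb_info, with one initial
 * reference held by the caller; it is released with qemu_aio_unref() once
 * the refcount drops to zero. */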
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    acb->refcnt = 1;
    return acb;
}

void qemu_aio_ref(void *p)
{
    BlockAIOCB *acb = p;
    acb->refcnt++;
}

void qemu_aio_unref(void *p)
{
    BlockAIOCB *acb = p;
    assert(acb->refcnt > 0);
    if (--acb->refcnt == 0) {
        g_slice_free1(acb->aiocb_info->aiocb_size, acb);
    }
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
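/* Re-read image metadata after something else (e.g. an incoming migration)
 * may have modified the image behind our back. Only acts on BDSes that were
 * opened with BDRV_O_INCOMING, and clears that flag. */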
void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
{
    Error *local_err = NULL;
    int ret;

    if (!bs->drv) {
        return;
    }

    if (!(bs->open_flags & BDRV_O_INCOMING)) {
        return;
    }
    bs->open_flags &= ~BDRV_O_INCOMING;

    if (bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs, &local_err);
    } else if (bs->file) {
        bdrv_invalidate_cache(bs->file, &local_err);
    }
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        return;
    }
}

void bdrv_invalidate_cache_all(Error **errp)
{
    BlockDriverState *bs;
    Error *local_err = NULL;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_invalidate_cache(bs, &local_err);
        aio_context_release(aio_context);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    }
}
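/* Synchronous wrapper around bdrv_co_flush(): runs the coroutine entry
 * directly when already in coroutine context, otherwise spawns a coroutine
 * and polls the AioContext until it completes. */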
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard;

    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /* Pass num, not nb_sectors: only the aligned and size-limited
             * chunk computed above belongs to this iteration. */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}
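/* Synchronous wrapper around bdrv_co_discard(), analogous to bdrv_flush(). */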
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;
    const char *device_name;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    device_name = bdrv_get_device_name(bs);
    if (device_name[0] != '\0') {
        qapi_event_send_device_tray_moved(device_name,
                                          eject_flag, &error_abort);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl)
        return drv->bdrv_ioctl(bs, req, buf);
    return -ENOTSUP;
}

BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl)
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    return NULL;
}

void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is aligned to the block device's
 * optimal memory alignment.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_opt_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}
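/* Create a dirty bitmap covering the whole device. granularity is in bytes,
 * must be a power of two and at least BDRV_SECTOR_SIZE (the asserts below
 * enforce this); the bitmap itself tracks dirtiness per sector. */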
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
                                          Error **errp)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = bdrv_nb_sectors(bs);
    if (bitmap_size < 0) {
        error_setg_errno(errp, -bitmap_size, "could not get length of device");
        errno = -bitmap_size;
        return NULL;
    }
    bitmap = g_new0(BdrvDirtyBitmap, 1);
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}

void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if (bm == bitmap) {
            QLIST_REMOVE(bitmap, list);
            hbitmap_free(bitmap->bitmap);
            g_free(bitmap);
            return;
        }
    }
}

BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BdrvDirtyBitmap *bm;
    BlockDirtyInfoList *list = NULL;
    BlockDirtyInfoList **plist = &list;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
        BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
        entry->value = info;
        *plist = entry;
        plist = &entry->next;
    }

    return list;
}

int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
{
    if (bitmap) {
        return hbitmap_get(bitmap->bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}

/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}

/* Release a previously grabbed reference to bs.
 * If after releasing, reference count is zero, the BlockDriverState is
 * deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    if (!bs) {
        return;
    }
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}
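/* Operation blockers: each blocked operation type carries a list of
 * BdrvOpBlocker entries whose Error explains why the operation is currently
 * forbidden on this BDS. Note that the reason Error is stored by reference,
 * not copied, so it must stay alive until the blocker is removed. */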
struct BdrvOpBlocker {
    Error *reason;
    QLIST_ENTRY(BdrvOpBlocker) list;
};

bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
    if (!QLIST_EMPTY(&bs->op_blockers[op])) {
        blocker = QLIST_FIRST(&bs->op_blockers[op]);
        if (errp) {
            error_setg(errp, "Device '%s' is busy: %s",
                       bdrv_get_device_name(bs),
                       error_get_pretty(blocker->reason));
        }
        return true;
    }
    return false;
}

void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);

    blocker = g_new0(BdrvOpBlocker, 1);
    blocker->reason = reason;
    QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
}

void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
{
    BdrvOpBlocker *blocker, *next;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
    QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
        if (blocker->reason == reason) {
            QLIST_REMOVE(blocker, list);
            g_free(blocker);
        }
    }
}

void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_block(bs, i, reason);
    }
}

void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_unblock(bs, i, reason);
    }
}

bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
{
    int i;

    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        if (!QLIST_EMPTY(&bs->op_blockers[i])) {
            return false;
        }
    }
    return true;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
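/* Create a new image file. fmt selects the format driver, options is a
 * comma-separated "-o"-style option string, and base_filename/base_fmt
 * override the backing file options. If no size is given, it is taken from
 * the backing file. Errors are reported through errp. */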
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QemuOptsList *create_opts = NULL;
    QemuOpts *opts = NULL;
    const char *backing_fmt, *backing_file;
    int64_t size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_opts = qemu_opts_append(create_opts, drv->create_opts);
    create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);

    /* Create parameter list with default values */
    opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        if (qemu_opts_do_parse(opts, options, NULL) != 0) {
            error_setg(errp, "Invalid options for file format '%s'", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
    if (backing_file) {
        if (!strcmp(filename, backing_file)) {
            error_setg(errp, "Error: Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt) {
        backing_drv = bdrv_find_format(backing_fmt);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
    if (size == -1) {
        if (backing_file) {
            BlockDriverState *bs;
            int64_t size;
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = NULL;
            ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                goto out;
            }
            size = bdrv_getlength(bs);
            if (size < 0) {
                error_setg_errno(errp, -size, "Could not get size of '%s'",
                                 backing_file);
                bdrv_unref(bs);
                goto out;
            }

            qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        qemu_opts_print(opts);
        puts("");
    }

    ret = bdrv_create(drv, filename, opts, &local_err);

    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    qemu_opts_del(opts);
    qemu_opts_free(create_opts);
    if (local_err) {
        error_propagate(errp, local_err);
    }
}

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    return bs->aio_context;
}

void bdrv_detach_aio_context(BlockDriverState *bs)
{
    BdrvAioNotifier *baf;

    if (!bs->drv) {
        return;
    }

    QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
        baf->detach_aio_context(baf->opaque);
    }

    if (bs->io_limits_enabled) {
        throttle_detach_aio_context(&bs->throttle_state);
    }
    if (bs->drv->bdrv_detach_aio_context) {
        bs->drv->bdrv_detach_aio_context(bs);
    }
    if (bs->file) {
        bdrv_detach_aio_context(bs->file);
    }
    if (bs->backing_hd) {
        bdrv_detach_aio_context(bs->backing_hd);
    }

    bs->aio_context = NULL;
}

void bdrv_attach_aio_context(BlockDriverState *bs,
                             AioContext *new_context)
{
    BdrvAioNotifier *ban;

    if (!bs->drv) {
        return;
    }

    bs->aio_context = new_context;

    if (bs->backing_hd) {
        bdrv_attach_aio_context(bs->backing_hd, new_context);
    }
    if (bs->file) {
        bdrv_attach_aio_context(bs->file, new_context);
    }
    if (bs->drv->bdrv_attach_aio_context) {
        bs->drv->bdrv_attach_aio_context(bs, new_context);
    }
    if (bs->io_limits_enabled) {
        throttle_attach_aio_context(&bs->throttle_state, new_context);
    }

    QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
        ban->attached_aio_context(new_context, ban->opaque);
    }
}

void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
{
    bdrv_drain_all(); /* ensure there are no in-flight requests */

    bdrv_detach_aio_context(bs);

    /* This function executes in the old AioContext so acquire the new one in
     * case it runs in a different thread.
     */
    aio_context_acquire(new_context);
    bdrv_attach_aio_context(bs, new_context);
    aio_context_release(new_context);
}
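/* AioContext notifiers: detach_aio_context() is invoked for every registered
 * notifier just before the BDS leaves its current AioContext, and
 * attached_aio_context() right after it has joined the new one (see the
 * loops in bdrv_detach_aio_context()/bdrv_attach_aio_context() above). */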
void bdrv_add_aio_context_notifier(BlockDriverState *bs,
        void (*attached_aio_context)(AioContext *new_context, void *opaque),
        void (*detach_aio_context)(void *opaque), void *opaque)
{
    BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
    *ban = (BdrvAioNotifier){
        .attached_aio_context = attached_aio_context,
        .detach_aio_context   = detach_aio_context,
        .opaque               = opaque
    };

    QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
}

void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
                                      void (*attached_aio_context)(AioContext *,
                                                                   void *),
                                      void (*detach_aio_context)(void *),
                                      void *opaque)
{
    BdrvAioNotifier *ban, *ban_next;

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        if (ban->attached_aio_context == attached_aio_context &&
            ban->detach_aio_context   == detach_aio_context   &&
            ban->opaque               == opaque)
        {
            QLIST_REMOVE(ban, list);
            g_free(ban);

            return;
        }
    }

    abort();
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
                       BlockDriverAmendStatusCB *status_cb)
{
    if (!bs->drv->bdrv_amend_options) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, opts, status_cb);
}

/* This function will be called by the bdrv_recurse_is_first_non_filter method
 * of block filter and by bdrv_is_first_non_filter.
 * It is used to test if the given bs is the candidate or to recurse further
 * into the node graph.
 */
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    /* return false if basic checks fail */
    if (!bs || !bs->drv) {
        return false;
    }

    /* the code reached a non block filter driver -> check if the bs is
     * the same as the candidate. It's the recursion termination condition.
     */
    if (!bs->drv->is_filter) {
        return bs == candidate;
    }
    /* Down this path the driver is a block filter driver */

    /* If the block filter recursion method is defined use it to recurse down
     * the node graph.
     */
    if (bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    /* the driver is a block filter but doesn't allow recursion -> return false
     */
    return false;
}

/* This function checks if the candidate is the first non-filter bs down its
 * bs chain. Since we don't have pointers to parents it explores all bs chains
 * from the top. Some filters can choose not to pass down the recursion.
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        /* try to recurse in this top level bs */
        perm = bdrv_recurse_is_first_non_filter(bs, candidate);

        /* candidate is the first non filter */
        if (perm) {
            return true;
        }
    }

    return false;
}

BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
{
    BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
    AioContext *aio_context;

    if (!to_replace_bs) {
        error_setg(errp, "Node name '%s' not found", node_name);
        return NULL;
    }

    aio_context = bdrv_get_aio_context(to_replace_bs);
    aio_context_acquire(aio_context);

    if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
        to_replace_bs = NULL;
        goto out;
    }

    /* We don't want an arbitrary node of the BDS chain to be replaced, only
     * the topmost non-filter, in order to prevent data corruption.
     * Another benefit is that this test excludes backing files which are
     * blocked by the backing blockers.
     */
    if (!bdrv_is_first_non_filter(to_replace_bs)) {
        error_setg(errp, "Only top most non filter can be replaced");
        to_replace_bs = NULL;
        goto out;
    }

out:
    aio_context_release(aio_context);
    return to_replace_bs;
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file);
    }
}
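/* Copy the driver-specific options of this BDS level into d: everything from
 * bs->options except nested ("x.y") keys and the generic "node-name" option.
 * Returns true if at least one option was copied. */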
static bool append_open_options(QDict *d, BlockDriverState *bs)
{
    const QDictEntry *entry;
    bool found_any = false;

    for (entry = qdict_first(bs->options); entry;
         entry = qdict_next(bs->options, entry))
    {
        /* Only take options for this level and exclude all non-driver-specific
         * options */
        if (!strchr(qdict_entry_key(entry), '.') &&
            strcmp(qdict_entry_key(entry), "node-name"))
        {
            qobject_incref(qdict_entry_value(entry));
            qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
            found_any = true;
        }
    }

    return found_any;
}

/* Updates the following BDS fields:
 *  - exact_filename: A filename which may be used for opening a block device
 *                    which (mostly) equals the given BDS (even without any
 *                    other options; so reading and writing must return the same
 *                    results, but caching etc. may be different)
 *  - full_open_options: Options which, when given when opening a block device
 *                       (without a filename), result in a BDS (mostly)
 *                       equalling the given one
 *  - filename: If exact_filename is set, it is copied here. Otherwise,
 *              full_open_options is converted to a JSON object, prefixed with
 *              "json:" (for use through the JSON pseudo protocol) and put here.
 */
void bdrv_refresh_filename(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    QDict *opts;

    if (!drv) {
        return;
    }

    /* This BDS's file name will most probably depend on its file's name, so
     * refresh that first */
    if (bs->file) {
        bdrv_refresh_filename(bs->file);
    }

    if (drv->bdrv_refresh_filename) {
        /* Obsolete information is of no use here, so drop the old file name
         * information before refreshing it */
        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        drv->bdrv_refresh_filename(bs);
    } else if (bs->file) {
        /* Try to reconstruct valid information from the underlying file */
        bool has_open_options;

        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        opts = qdict_new();
        has_open_options = append_open_options(opts, bs);

        /* If no specific options have been given for this BDS, the filename of
         * the underlying file should suffice for this one as well */
        if (bs->file->exact_filename[0] && !has_open_options) {
            strcpy(bs->exact_filename, bs->file->exact_filename);
        }
        /* Reconstructing the full options QDict is simple for most format block
         * drivers, as long as the full options are known for the underlying
         * file BDS. The full options QDict of that file BDS should somehow
         * contain a representation of the filename, therefore the following
         * suffices without querying the (exact_)filename of this BDS. */
        if (bs->file->full_open_options) {
            qdict_put_obj(opts, "driver",
                          QOBJECT(qstring_from_str(drv->format_name)));
            QINCREF(bs->file->full_open_options);
            qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));

            bs->full_open_options = opts;
        } else {
            QDECREF(opts);
        }
    } else if (!bs->full_open_options && qdict_size(bs->options)) {
        /* There is no underlying file BDS (at least referenced by BDS.file),
         * so the full options QDict should be equal to the options given
         * specifically for this block device when it was opened (plus the
         * driver specification).
         * Because those options don't change, there is no need to update
         * full_open_options when it's already set. */

        opts = qdict_new();
        append_open_options(opts, bs);
        qdict_put_obj(opts, "driver",
                      QOBJECT(qstring_from_str(drv->format_name)));

        if (bs->exact_filename[0]) {
            /* This may not work for all block protocol drivers (some may
             * require this filename to be parsed), but we have to find some
             * default solution here, so just include it. If some block driver
             * does not support pure options without any filename at all or
             * needs some special format of the options QDict, it needs to
             * implement the driver-specific bdrv_refresh_filename() function.
             */
            qdict_put_obj(opts, "filename",
                          QOBJECT(qstring_from_str(bs->exact_filename)));
        }

        bs->full_open_options = opts;
    }

    if (bs->exact_filename[0]) {
        pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
    } else if (bs->full_open_options) {
        QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
        snprintf(bs->filename, sizeof(bs->filename), "json:%s",
                 qstring_get_str(json));
        QDECREF(json);
    }
}

/* This accessor function's purpose is to allow the device models to access the
 * BlockAcctStats structure embedded inside a BlockDriverState without being
 * aware of the BlockDriverState structure layout.
 * It will go away when the BlockAcctStats structure is moved inside
 * the device models.
 */
BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
{
    return &bs->stats;
}