2 * QEMU System Emulator block driver
4 * Copyright (c) 2003 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 #include "config-host.h"
25 #include "qemu-common.h"
27 #include "block/block_int.h"
28 #include "block/blockjob.h"
29 #include "qemu/module.h"
30 #include "qapi/qmp/qjson.h"
31 #include "sysemu/block-backend.h"
32 #include "sysemu/sysemu.h"
33 #include "qemu/notify.h"
34 #include "block/coroutine.h"
35 #include "block/qapi.h"
36 #include "qmp-commands.h"
37 #include "qemu/timer.h"
38 #include "qapi-event.h"
41 #include <sys/types.h>
43 #include <sys/ioctl.h>
44 #include <sys/queue.h>
54 struct BdrvDirtyBitmap
{
56 QLIST_ENTRY(BdrvDirtyBitmap
) list
;
59 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
61 static BlockAIOCB
*bdrv_aio_readv_em(BlockDriverState
*bs
,
62 int64_t sector_num
, QEMUIOVector
*qiov
, int nb_sectors
,
63 BlockCompletionFunc
*cb
, void *opaque
);
64 static BlockAIOCB
*bdrv_aio_writev_em(BlockDriverState
*bs
,
65 int64_t sector_num
, QEMUIOVector
*qiov
, int nb_sectors
,
66 BlockCompletionFunc
*cb
, void *opaque
);
67 static int coroutine_fn
bdrv_co_readv_em(BlockDriverState
*bs
,
68 int64_t sector_num
, int nb_sectors
,
70 static int coroutine_fn
bdrv_co_writev_em(BlockDriverState
*bs
,
71 int64_t sector_num
, int nb_sectors
,
73 static int coroutine_fn
bdrv_co_do_preadv(BlockDriverState
*bs
,
74 int64_t offset
, unsigned int bytes
, QEMUIOVector
*qiov
,
75 BdrvRequestFlags flags
);
76 static int coroutine_fn
bdrv_co_do_pwritev(BlockDriverState
*bs
,
77 int64_t offset
, unsigned int bytes
, QEMUIOVector
*qiov
,
78 BdrvRequestFlags flags
);
79 static BlockAIOCB
*bdrv_co_aio_rw_vector(BlockDriverState
*bs
,
83 BdrvRequestFlags flags
,
84 BlockCompletionFunc
*cb
,
87 static void coroutine_fn
bdrv_co_do_rw(void *opaque
);
88 static int coroutine_fn
bdrv_co_do_write_zeroes(BlockDriverState
*bs
,
89 int64_t sector_num
, int nb_sectors
, BdrvRequestFlags flags
);
91 static QTAILQ_HEAD(, BlockDriverState
) bdrv_states
=
92 QTAILQ_HEAD_INITIALIZER(bdrv_states
);
94 static QTAILQ_HEAD(, BlockDriverState
) graph_bdrv_states
=
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states
);
97 static QLIST_HEAD(, BlockDriver
) bdrv_drivers
=
98 QLIST_HEAD_INITIALIZER(bdrv_drivers
);
100 /* If non-zero, use only whitelisted block drivers */
101 static int use_bdrv_whitelist
;
/* Return non-zero if @filename starts with a Windows drive prefix:
 * a single ASCII letter followed by ':' (e.g. "c:", "D:\foo"). */
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}
111 int is_windows_drive(const char *filename
)
113 if (is_windows_drive_prefix(filename
) &&
116 if (strstart(filename
, "\\\\.\\", NULL
) ||
117 strstart(filename
, "//./", NULL
))
123 /* throttling disk I/O limits */
124 void bdrv_set_io_limits(BlockDriverState
*bs
,
129 throttle_config(&bs
->throttle_state
, cfg
);
131 for (i
= 0; i
< 2; i
++) {
132 qemu_co_enter_next(&bs
->throttled_reqs
[i
]);
/* this function drains all the throttled I/Os */
137 static bool bdrv_start_throttled_reqs(BlockDriverState
*bs
)
139 bool drained
= false;
140 bool enabled
= bs
->io_limits_enabled
;
143 bs
->io_limits_enabled
= false;
145 for (i
= 0; i
< 2; i
++) {
146 while (qemu_co_enter_next(&bs
->throttled_reqs
[i
])) {
151 bs
->io_limits_enabled
= enabled
;
156 void bdrv_io_limits_disable(BlockDriverState
*bs
)
158 bs
->io_limits_enabled
= false;
160 bdrv_start_throttled_reqs(bs
);
162 throttle_destroy(&bs
->throttle_state
);
165 static void bdrv_throttle_read_timer_cb(void *opaque
)
167 BlockDriverState
*bs
= opaque
;
168 qemu_co_enter_next(&bs
->throttled_reqs
[0]);
171 static void bdrv_throttle_write_timer_cb(void *opaque
)
173 BlockDriverState
*bs
= opaque
;
174 qemu_co_enter_next(&bs
->throttled_reqs
[1]);
177 /* should be called before bdrv_set_io_limits if a limit is set */
178 void bdrv_io_limits_enable(BlockDriverState
*bs
)
180 assert(!bs
->io_limits_enabled
);
181 throttle_init(&bs
->throttle_state
,
182 bdrv_get_aio_context(bs
),
184 bdrv_throttle_read_timer_cb
,
185 bdrv_throttle_write_timer_cb
,
187 bs
->io_limits_enabled
= true;
190 /* This function makes an IO wait if needed
192 * @nb_sectors: the number of sectors of the IO
193 * @is_write: is the IO a write
195 static void bdrv_io_limits_intercept(BlockDriverState
*bs
,
199 /* does this io must wait */
200 bool must_wait
= throttle_schedule_timer(&bs
->throttle_state
, is_write
);
202 /* if must wait or any request of this type throttled queue the IO */
204 !qemu_co_queue_empty(&bs
->throttled_reqs
[is_write
])) {
205 qemu_co_queue_wait(&bs
->throttled_reqs
[is_write
]);
208 /* the IO will be executed, do the accounting */
209 throttle_account(&bs
->throttle_state
, is_write
, bytes
);
212 /* if the next request must wait -> do nothing */
213 if (throttle_schedule_timer(&bs
->throttle_state
, is_write
)) {
217 /* else queue next request for execution */
218 qemu_co_queue_next(&bs
->throttled_reqs
[is_write
]);
221 size_t bdrv_opt_mem_align(BlockDriverState
*bs
)
223 if (!bs
|| !bs
->drv
) {
224 /* 4k should be on the safe side */
228 return bs
->bl
.opt_mem_alignment
;
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    /* Windows drive letters ("c:...") are not protocol prefixes */
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    /* a protocol prefix means a ':' appears before any path separator */
    return *p == ':';
}
/* Return non-zero if @path is absolute (host-OS specific rules). */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}
262 /* if filename is absolute, just copy it to dest. Otherwise, build a
263 path to it by considering it is relative to base_path. URL are
265 void path_combine(char *dest
, int dest_size
,
266 const char *base_path
,
267 const char *filename
)
274 if (path_is_absolute(filename
)) {
275 pstrcpy(dest
, dest_size
, filename
);
277 p
= strchr(base_path
, ':');
282 p1
= strrchr(base_path
, '/');
286 p2
= strrchr(base_path
, '\\');
298 if (len
> dest_size
- 1)
300 memcpy(dest
, base_path
, len
);
302 pstrcat(dest
, dest_size
, filename
);
306 void bdrv_get_full_backing_filename(BlockDriverState
*bs
, char *dest
, size_t sz
)
308 if (bs
->backing_file
[0] == '\0' || path_has_protocol(bs
->backing_file
)) {
309 pstrcpy(dest
, sz
, bs
->backing_file
);
311 path_combine(dest
, sz
, bs
->filename
, bs
->backing_file
);
315 void bdrv_register(BlockDriver
*bdrv
)
317 /* Block drivers without coroutine functions need emulation */
318 if (!bdrv
->bdrv_co_readv
) {
319 bdrv
->bdrv_co_readv
= bdrv_co_readv_em
;
320 bdrv
->bdrv_co_writev
= bdrv_co_writev_em
;
322 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
323 * the block driver lacks aio we need to emulate that too.
325 if (!bdrv
->bdrv_aio_readv
) {
326 /* add AIO emulation layer */
327 bdrv
->bdrv_aio_readv
= bdrv_aio_readv_em
;
328 bdrv
->bdrv_aio_writev
= bdrv_aio_writev_em
;
332 QLIST_INSERT_HEAD(&bdrv_drivers
, bdrv
, list
);
335 BlockDriverState
*bdrv_new_root(void)
337 BlockDriverState
*bs
= bdrv_new();
339 QTAILQ_INSERT_TAIL(&bdrv_states
, bs
, device_list
);
343 BlockDriverState
*bdrv_new(void)
345 BlockDriverState
*bs
;
348 bs
= g_new0(BlockDriverState
, 1);
349 QLIST_INIT(&bs
->dirty_bitmaps
);
350 for (i
= 0; i
< BLOCK_OP_TYPE_MAX
; i
++) {
351 QLIST_INIT(&bs
->op_blockers
[i
]);
353 bdrv_iostatus_disable(bs
);
354 notifier_list_init(&bs
->close_notifiers
);
355 notifier_with_return_list_init(&bs
->before_write_notifiers
);
356 qemu_co_queue_init(&bs
->throttled_reqs
[0]);
357 qemu_co_queue_init(&bs
->throttled_reqs
[1]);
359 bs
->aio_context
= qemu_get_aio_context();
364 void bdrv_add_close_notifier(BlockDriverState
*bs
, Notifier
*notify
)
366 notifier_list_add(&bs
->close_notifiers
, notify
);
369 BlockDriver
*bdrv_find_format(const char *format_name
)
372 QLIST_FOREACH(drv1
, &bdrv_drivers
, list
) {
373 if (!strcmp(drv1
->format_name
, format_name
)) {
380 static int bdrv_is_whitelisted(BlockDriver
*drv
, bool read_only
)
382 static const char *whitelist_rw
[] = {
383 CONFIG_BDRV_RW_WHITELIST
385 static const char *whitelist_ro
[] = {
386 CONFIG_BDRV_RO_WHITELIST
390 if (!whitelist_rw
[0] && !whitelist_ro
[0]) {
391 return 1; /* no whitelist, anything goes */
394 for (p
= whitelist_rw
; *p
; p
++) {
395 if (!strcmp(drv
->format_name
, *p
)) {
400 for (p
= whitelist_ro
; *p
; p
++) {
401 if (!strcmp(drv
->format_name
, *p
)) {
409 BlockDriver
*bdrv_find_whitelisted_format(const char *format_name
,
412 BlockDriver
*drv
= bdrv_find_format(format_name
);
413 return drv
&& bdrv_is_whitelisted(drv
, read_only
) ? drv
: NULL
;
416 typedef struct CreateCo
{
424 static void coroutine_fn
bdrv_create_co_entry(void *opaque
)
426 Error
*local_err
= NULL
;
429 CreateCo
*cco
= opaque
;
432 ret
= cco
->drv
->bdrv_create(cco
->filename
, cco
->opts
, &local_err
);
434 error_propagate(&cco
->err
, local_err
);
439 int bdrv_create(BlockDriver
*drv
, const char* filename
,
440 QemuOpts
*opts
, Error
**errp
)
447 .filename
= g_strdup(filename
),
453 if (!drv
->bdrv_create
) {
454 error_setg(errp
, "Driver '%s' does not support image creation", drv
->format_name
);
459 if (qemu_in_coroutine()) {
460 /* Fast-path if already in coroutine context */
461 bdrv_create_co_entry(&cco
);
463 co
= qemu_coroutine_create(bdrv_create_co_entry
);
464 qemu_coroutine_enter(co
, &cco
);
465 while (cco
.ret
== NOT_DONE
) {
466 aio_poll(qemu_get_aio_context(), true);
473 error_propagate(errp
, cco
.err
);
475 error_setg_errno(errp
, -ret
, "Could not create image");
480 g_free(cco
.filename
);
484 int bdrv_create_file(const char *filename
, QemuOpts
*opts
, Error
**errp
)
487 Error
*local_err
= NULL
;
490 drv
= bdrv_find_protocol(filename
, true);
492 error_setg(errp
, "Could not find protocol for file '%s'", filename
);
496 ret
= bdrv_create(drv
, filename
, opts
, &local_err
);
498 error_propagate(errp
, local_err
);
503 void bdrv_refresh_limits(BlockDriverState
*bs
, Error
**errp
)
505 BlockDriver
*drv
= bs
->drv
;
506 Error
*local_err
= NULL
;
508 memset(&bs
->bl
, 0, sizeof(bs
->bl
));
514 /* Take some limits from the children as a default */
516 bdrv_refresh_limits(bs
->file
, &local_err
);
518 error_propagate(errp
, local_err
);
521 bs
->bl
.opt_transfer_length
= bs
->file
->bl
.opt_transfer_length
;
522 bs
->bl
.max_transfer_length
= bs
->file
->bl
.max_transfer_length
;
523 bs
->bl
.opt_mem_alignment
= bs
->file
->bl
.opt_mem_alignment
;
525 bs
->bl
.opt_mem_alignment
= 512;
528 if (bs
->backing_hd
) {
529 bdrv_refresh_limits(bs
->backing_hd
, &local_err
);
531 error_propagate(errp
, local_err
);
534 bs
->bl
.opt_transfer_length
=
535 MAX(bs
->bl
.opt_transfer_length
,
536 bs
->backing_hd
->bl
.opt_transfer_length
);
537 bs
->bl
.max_transfer_length
=
538 MIN_NON_ZERO(bs
->bl
.max_transfer_length
,
539 bs
->backing_hd
->bl
.max_transfer_length
);
540 bs
->bl
.opt_mem_alignment
=
541 MAX(bs
->bl
.opt_mem_alignment
,
542 bs
->backing_hd
->bl
.opt_mem_alignment
);
545 /* Then let the driver override it */
546 if (drv
->bdrv_refresh_limits
) {
547 drv
->bdrv_refresh_limits(bs
, errp
);
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;

    tmpdir = getenv("TMPDIR");
    /* NOTE(review): the fallback directory line was lost in the mangled
     * source; reconstructed as "/var/tmp" — confirm. */
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
588 * Detect host devices. By convention, /dev/cdrom[N] is always
589 * recognized as a host CDROM.
591 static BlockDriver
*find_hdev_driver(const char *filename
)
593 int score_max
= 0, score
;
594 BlockDriver
*drv
= NULL
, *d
;
596 QLIST_FOREACH(d
, &bdrv_drivers
, list
) {
597 if (d
->bdrv_probe_device
) {
598 score
= d
->bdrv_probe_device(filename
);
599 if (score
> score_max
) {
609 BlockDriver
*bdrv_find_protocol(const char *filename
,
610 bool allow_protocol_prefix
)
617 /* TODO Drivers without bdrv_file_open must be specified explicitly */
620 * XXX(hch): we really should not let host device detection
621 * override an explicit protocol specification, but moving this
622 * later breaks access to device names with colons in them.
623 * Thanks to the brain-dead persistent naming schemes on udev-
624 * based Linux systems those actually are quite common.
626 drv1
= find_hdev_driver(filename
);
631 if (!path_has_protocol(filename
) || !allow_protocol_prefix
) {
632 return bdrv_find_format("file");
635 p
= strchr(filename
, ':');
638 if (len
> sizeof(protocol
) - 1)
639 len
= sizeof(protocol
) - 1;
640 memcpy(protocol
, filename
, len
);
641 protocol
[len
] = '\0';
642 QLIST_FOREACH(drv1
, &bdrv_drivers
, list
) {
643 if (drv1
->protocol_name
&&
644 !strcmp(drv1
->protocol_name
, protocol
)) {
652 * Guess image format by probing its contents.
653 * This is not a good idea when your image is raw (CVE-2008-2004), but
654 * we do it anyway for backward compatibility.
656 * @buf contains the image's first @buf_size bytes.
657 * @buf_size is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
658 * but can be smaller if the image file is smaller)
659 * @filename is its filename.
661 * For all block drivers, call the bdrv_probe() method to get its
663 * Return the first block driver with the highest probing score.
665 BlockDriver
*bdrv_probe_all(const uint8_t *buf
, int buf_size
,
666 const char *filename
)
668 int score_max
= 0, score
;
669 BlockDriver
*drv
= NULL
, *d
;
671 QLIST_FOREACH(d
, &bdrv_drivers
, list
) {
673 score
= d
->bdrv_probe(buf
, buf_size
, filename
);
674 if (score
> score_max
) {
684 static int find_image_format(BlockDriverState
*bs
, const char *filename
,
685 BlockDriver
**pdrv
, Error
**errp
)
688 uint8_t buf
[BLOCK_PROBE_BUF_SIZE
];
691 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
692 if (bs
->sg
|| !bdrv_is_inserted(bs
) || bdrv_getlength(bs
) == 0) {
693 drv
= bdrv_find_format("raw");
695 error_setg(errp
, "Could not find raw image format");
702 ret
= bdrv_pread(bs
, 0, buf
, sizeof(buf
));
704 error_setg_errno(errp
, -ret
, "Could not read image for determining its "
710 drv
= bdrv_probe_all(buf
, ret
, filename
);
712 error_setg(errp
, "Could not determine image format: No compatible "
721 * Set the current 'total_sectors' value
722 * Return 0 on success, -errno on error.
724 static int refresh_total_sectors(BlockDriverState
*bs
, int64_t hint
)
726 BlockDriver
*drv
= bs
->drv
;
728 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
732 /* query actual device if possible, otherwise just trust the hint */
733 if (drv
->bdrv_getlength
) {
734 int64_t length
= drv
->bdrv_getlength(bs
);
738 hint
= DIV_ROUND_UP(length
, BDRV_SECTOR_SIZE
);
741 bs
->total_sectors
= hint
;
746 * Set open flags for a given discard mode
748 * Return 0 on success, -1 if the discard mode was invalid.
750 int bdrv_parse_discard_flags(const char *mode
, int *flags
)
752 *flags
&= ~BDRV_O_UNMAP
;
754 if (!strcmp(mode
, "off") || !strcmp(mode
, "ignore")) {
756 } else if (!strcmp(mode
, "on") || !strcmp(mode
, "unmap")) {
757 *flags
|= BDRV_O_UNMAP
;
766 * Set open flags for a given cache mode
768 * Return 0 on success, -1 if the cache mode was invalid.
770 int bdrv_parse_cache_flags(const char *mode
, int *flags
)
772 *flags
&= ~BDRV_O_CACHE_MASK
;
774 if (!strcmp(mode
, "off") || !strcmp(mode
, "none")) {
775 *flags
|= BDRV_O_NOCACHE
| BDRV_O_CACHE_WB
;
776 } else if (!strcmp(mode
, "directsync")) {
777 *flags
|= BDRV_O_NOCACHE
;
778 } else if (!strcmp(mode
, "writeback")) {
779 *flags
|= BDRV_O_CACHE_WB
;
780 } else if (!strcmp(mode
, "unsafe")) {
781 *flags
|= BDRV_O_CACHE_WB
;
782 *flags
|= BDRV_O_NO_FLUSH
;
783 } else if (!strcmp(mode
, "writethrough")) {
784 /* this is the default */
793 * The copy-on-read flag is actually a reference count so multiple users may
794 * use the feature without worrying about clobbering its previous state.
795 * Copy-on-read stays enabled until all users have called to disable it.
797 void bdrv_enable_copy_on_read(BlockDriverState
*bs
)
802 void bdrv_disable_copy_on_read(BlockDriverState
*bs
)
804 assert(bs
->copy_on_read
> 0);
809 * Returns the flags that a temporary snapshot should get, based on the
810 * originally requested flags (the originally requested image will have flags
811 * like a backing file)
813 static int bdrv_temp_snapshot_flags(int flags
)
815 return (flags
& ~BDRV_O_SNAPSHOT
) | BDRV_O_TEMPORARY
;
819 * Returns the flags that bs->file should get, based on the given flags for
822 static int bdrv_inherited_flags(int flags
)
824 /* Enable protocol handling, disable format probing for bs->file */
825 flags
|= BDRV_O_PROTOCOL
;
827 /* Our block drivers take care to send flushes and respect unmap policy,
828 * so we can enable both unconditionally on lower layers. */
829 flags
|= BDRV_O_CACHE_WB
| BDRV_O_UNMAP
;
831 /* Clear flags that only apply to the top layer */
832 flags
&= ~(BDRV_O_SNAPSHOT
| BDRV_O_NO_BACKING
| BDRV_O_COPY_ON_READ
);
838 * Returns the flags that bs->backing_hd should get, based on the given flags
841 static int bdrv_backing_flags(int flags
)
843 /* backing files always opened read-only */
844 flags
&= ~(BDRV_O_RDWR
| BDRV_O_COPY_ON_READ
);
846 /* snapshot=on is handled on the top layer */
847 flags
&= ~(BDRV_O_SNAPSHOT
| BDRV_O_TEMPORARY
);
852 static int bdrv_open_flags(BlockDriverState
*bs
, int flags
)
854 int open_flags
= flags
| BDRV_O_CACHE_WB
;
857 * Clear flags that are internal to the block layer before opening the
860 open_flags
&= ~(BDRV_O_SNAPSHOT
| BDRV_O_NO_BACKING
| BDRV_O_PROTOCOL
);
863 * Snapshots should be writable.
865 if (flags
& BDRV_O_TEMPORARY
) {
866 open_flags
|= BDRV_O_RDWR
;
872 static void bdrv_assign_node_name(BlockDriverState
*bs
,
873 const char *node_name
,
880 /* Check for empty string or invalid characters */
881 if (!id_wellformed(node_name
)) {
882 error_setg(errp
, "Invalid node name");
886 /* takes care of avoiding namespaces collisions */
887 if (blk_by_name(node_name
)) {
888 error_setg(errp
, "node-name=%s is conflicting with a device id",
893 /* takes care of avoiding duplicates node names */
894 if (bdrv_find_node(node_name
)) {
895 error_setg(errp
, "Duplicate node name");
899 /* copy node name into the bs and insert it into the graph list */
900 pstrcpy(bs
->node_name
, sizeof(bs
->node_name
), node_name
);
901 QTAILQ_INSERT_TAIL(&graph_bdrv_states
, bs
, node_list
);
905 * Common part for opening disk images and files
907 * Removes all processed options from *options.
909 static int bdrv_open_common(BlockDriverState
*bs
, BlockDriverState
*file
,
910 QDict
*options
, int flags
, BlockDriver
*drv
, Error
**errp
)
913 const char *filename
;
914 const char *node_name
= NULL
;
915 Error
*local_err
= NULL
;
918 assert(bs
->file
== NULL
);
919 assert(options
!= NULL
&& bs
->options
!= options
);
922 filename
= file
->filename
;
924 filename
= qdict_get_try_str(options
, "filename");
927 if (drv
->bdrv_needs_filename
&& !filename
) {
928 error_setg(errp
, "The '%s' block driver requires a file name",
933 trace_bdrv_open_common(bs
, filename
?: "", flags
, drv
->format_name
);
935 node_name
= qdict_get_try_str(options
, "node-name");
936 bdrv_assign_node_name(bs
, node_name
, &local_err
);
938 error_propagate(errp
, local_err
);
941 qdict_del(options
, "node-name");
943 /* bdrv_open() with directly using a protocol as drv. This layer is already
944 * opened, so assign it to bs (while file becomes a closed BlockDriverState)
945 * and return immediately. */
946 if (file
!= NULL
&& drv
->bdrv_file_open
) {
951 bs
->open_flags
= flags
;
952 bs
->guest_block_size
= 512;
953 bs
->request_alignment
= 512;
954 bs
->zero_beyond_eof
= true;
955 open_flags
= bdrv_open_flags(bs
, flags
);
956 bs
->read_only
= !(open_flags
& BDRV_O_RDWR
);
957 bs
->growable
= !!(flags
& BDRV_O_PROTOCOL
);
959 if (use_bdrv_whitelist
&& !bdrv_is_whitelisted(drv
, bs
->read_only
)) {
961 !bs
->read_only
&& bdrv_is_whitelisted(drv
, true)
962 ? "Driver '%s' can only be used for read-only devices"
963 : "Driver '%s' is not whitelisted",
968 assert(bs
->copy_on_read
== 0); /* bdrv_new() and bdrv_close() make it so */
969 if (flags
& BDRV_O_COPY_ON_READ
) {
970 if (!bs
->read_only
) {
971 bdrv_enable_copy_on_read(bs
);
973 error_setg(errp
, "Can't use copy-on-read on read-only device");
978 if (filename
!= NULL
) {
979 pstrcpy(bs
->filename
, sizeof(bs
->filename
), filename
);
981 bs
->filename
[0] = '\0';
983 pstrcpy(bs
->exact_filename
, sizeof(bs
->exact_filename
), bs
->filename
);
986 bs
->opaque
= g_malloc0(drv
->instance_size
);
988 bs
->enable_write_cache
= !!(flags
& BDRV_O_CACHE_WB
);
990 /* Open the image, either directly or using a protocol */
991 if (drv
->bdrv_file_open
) {
992 assert(file
== NULL
);
993 assert(!drv
->bdrv_needs_filename
|| filename
!= NULL
);
994 ret
= drv
->bdrv_file_open(bs
, options
, open_flags
, &local_err
);
997 error_setg(errp
, "Can't use '%s' as a block driver for the "
998 "protocol level", drv
->format_name
);
1003 ret
= drv
->bdrv_open(bs
, options
, open_flags
, &local_err
);
1008 error_propagate(errp
, local_err
);
1009 } else if (bs
->filename
[0]) {
1010 error_setg_errno(errp
, -ret
, "Could not open '%s'", bs
->filename
);
1012 error_setg_errno(errp
, -ret
, "Could not open image");
1017 ret
= refresh_total_sectors(bs
, bs
->total_sectors
);
1019 error_setg_errno(errp
, -ret
, "Could not refresh total sector count");
1023 bdrv_refresh_limits(bs
, &local_err
);
1025 error_propagate(errp
, local_err
);
1030 assert(bdrv_opt_mem_align(bs
) != 0);
1031 assert((bs
->request_alignment
!= 0) || bs
->sg
);
1042 static QDict
*parse_json_filename(const char *filename
, Error
**errp
)
1044 QObject
*options_obj
;
1048 ret
= strstart(filename
, "json:", &filename
);
1051 options_obj
= qobject_from_json(filename
);
1053 error_setg(errp
, "Could not parse the JSON options");
1057 if (qobject_type(options_obj
) != QTYPE_QDICT
) {
1058 qobject_decref(options_obj
);
1059 error_setg(errp
, "Invalid JSON object given");
1063 options
= qobject_to_qdict(options_obj
);
1064 qdict_flatten(options
);
1070 * Fills in default options for opening images and converts the legacy
1071 * filename/flags pair to option QDict entries.
1073 static int bdrv_fill_options(QDict
**options
, const char **pfilename
, int flags
,
1074 BlockDriver
*drv
, Error
**errp
)
1076 const char *filename
= *pfilename
;
1077 const char *drvname
;
1078 bool protocol
= flags
& BDRV_O_PROTOCOL
;
1079 bool parse_filename
= false;
1080 Error
*local_err
= NULL
;
1082 /* Parse json: pseudo-protocol */
1083 if (filename
&& g_str_has_prefix(filename
, "json:")) {
1084 QDict
*json_options
= parse_json_filename(filename
, &local_err
);
1086 error_propagate(errp
, local_err
);
1090 /* Options given in the filename have lower priority than options
1091 * specified directly */
1092 qdict_join(*options
, json_options
, false);
1093 QDECREF(json_options
);
1094 *pfilename
= filename
= NULL
;
1097 /* Fetch the file name from the options QDict if necessary */
1098 if (protocol
&& filename
) {
1099 if (!qdict_haskey(*options
, "filename")) {
1100 qdict_put(*options
, "filename", qstring_from_str(filename
));
1101 parse_filename
= true;
1103 error_setg(errp
, "Can't specify 'file' and 'filename' options at "
1109 /* Find the right block driver */
1110 filename
= qdict_get_try_str(*options
, "filename");
1111 drvname
= qdict_get_try_str(*options
, "driver");
1115 error_setg(errp
, "Driver specified twice");
1118 drvname
= drv
->format_name
;
1119 qdict_put(*options
, "driver", qstring_from_str(drvname
));
1121 if (!drvname
&& protocol
) {
1123 drv
= bdrv_find_protocol(filename
, parse_filename
);
1125 error_setg(errp
, "Unknown protocol");
1129 drvname
= drv
->format_name
;
1130 qdict_put(*options
, "driver", qstring_from_str(drvname
));
1132 error_setg(errp
, "Must specify either driver or file");
1135 } else if (drvname
) {
1136 drv
= bdrv_find_format(drvname
);
1138 error_setg(errp
, "Unknown driver '%s'", drvname
);
1144 assert(drv
|| !protocol
);
1146 /* Driver-specific filename parsing */
1147 if (drv
&& drv
->bdrv_parse_filename
&& parse_filename
) {
1148 drv
->bdrv_parse_filename(filename
, *options
, &local_err
);
1150 error_propagate(errp
, local_err
);
1154 if (!drv
->bdrv_needs_filename
) {
1155 qdict_del(*options
, "filename");
1162 void bdrv_set_backing_hd(BlockDriverState
*bs
, BlockDriverState
*backing_hd
)
1165 if (bs
->backing_hd
) {
1166 assert(bs
->backing_blocker
);
1167 bdrv_op_unblock_all(bs
->backing_hd
, bs
->backing_blocker
);
1168 } else if (backing_hd
) {
1169 error_setg(&bs
->backing_blocker
,
1170 "device is used as backing hd of '%s'",
1171 bdrv_get_device_name(bs
));
1174 bs
->backing_hd
= backing_hd
;
1176 error_free(bs
->backing_blocker
);
1177 bs
->backing_blocker
= NULL
;
1180 bs
->open_flags
&= ~BDRV_O_NO_BACKING
;
1181 pstrcpy(bs
->backing_file
, sizeof(bs
->backing_file
), backing_hd
->filename
);
1182 pstrcpy(bs
->backing_format
, sizeof(bs
->backing_format
),
1183 backing_hd
->drv
? backing_hd
->drv
->format_name
: "");
1185 bdrv_op_block_all(bs
->backing_hd
, bs
->backing_blocker
);
1186 /* Otherwise we won't be able to commit due to check in bdrv_commit */
1187 bdrv_op_unblock(bs
->backing_hd
, BLOCK_OP_TYPE_COMMIT
,
1188 bs
->backing_blocker
);
1190 bdrv_refresh_limits(bs
, NULL
);
1194 * Opens the backing file for a BlockDriverState if not yet open
1196 * options is a QDict of options to pass to the block drivers, or NULL for an
1197 * empty set of options. The reference to the QDict is transferred to this
1198 * function (even on failure), so if the caller intends to reuse the dictionary,
1199 * it needs to use QINCREF() before calling bdrv_file_open.
1201 int bdrv_open_backing_file(BlockDriverState
*bs
, QDict
*options
, Error
**errp
)
1203 char *backing_filename
= g_malloc0(PATH_MAX
);
1205 BlockDriver
*back_drv
= NULL
;
1206 BlockDriverState
*backing_hd
;
1207 Error
*local_err
= NULL
;
1209 if (bs
->backing_hd
!= NULL
) {
1214 /* NULL means an empty set of options */
1215 if (options
== NULL
) {
1216 options
= qdict_new();
1219 bs
->open_flags
&= ~BDRV_O_NO_BACKING
;
1220 if (qdict_haskey(options
, "file.filename")) {
1221 backing_filename
[0] = '\0';
1222 } else if (bs
->backing_file
[0] == '\0' && qdict_size(options
) == 0) {
1226 bdrv_get_full_backing_filename(bs
, backing_filename
, PATH_MAX
);
1229 if (!bs
->drv
|| !bs
->drv
->supports_backing
) {
1231 error_setg(errp
, "Driver doesn't support backing files");
1236 backing_hd
= bdrv_new();
1238 if (bs
->backing_format
[0] != '\0') {
1239 back_drv
= bdrv_find_format(bs
->backing_format
);
1242 assert(bs
->backing_hd
== NULL
);
1243 ret
= bdrv_open(&backing_hd
,
1244 *backing_filename
? backing_filename
: NULL
, NULL
, options
,
1245 bdrv_backing_flags(bs
->open_flags
), back_drv
, &local_err
);
1247 bdrv_unref(backing_hd
);
1249 bs
->open_flags
|= BDRV_O_NO_BACKING
;
1250 error_setg(errp
, "Could not open backing file: %s",
1251 error_get_pretty(local_err
));
1252 error_free(local_err
);
1255 bdrv_set_backing_hd(bs
, backing_hd
);
1258 g_free(backing_filename
);
1263 * Opens a disk image whose options are given as BlockdevRef in another block
1266 * If allow_none is true, no image will be opened if filename is false and no
1267 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1269 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1270 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1271 * itself, all options starting with "${bdref_key}." are considered part of the
1274 * The BlockdevRef will be removed from the options QDict.
1276 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
1278 int bdrv_open_image(BlockDriverState
**pbs
, const char *filename
,
1279 QDict
*options
, const char *bdref_key
, int flags
,
1280 bool allow_none
, Error
**errp
)
1282 QDict
*image_options
;
1284 char *bdref_key_dot
;
1285 const char *reference
;
1288 assert(*pbs
== NULL
);
1290 bdref_key_dot
= g_strdup_printf("%s.", bdref_key
);
1291 qdict_extract_subqdict(options
, &image_options
, bdref_key_dot
);
1292 g_free(bdref_key_dot
);
1294 reference
= qdict_get_try_str(options
, bdref_key
);
1295 if (!filename
&& !reference
&& !qdict_size(image_options
)) {
1299 error_setg(errp
, "A block device must be specified for \"%s\"",
1303 QDECREF(image_options
);
1307 ret
= bdrv_open(pbs
, filename
, reference
, image_options
, flags
, NULL
, errp
);
1310 qdict_del(options
, bdref_key
);
1314 int bdrv_append_temp_snapshot(BlockDriverState
*bs
, int flags
, Error
**errp
)
1316 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1317 char *tmp_filename
= g_malloc0(PATH_MAX
+ 1);
1319 BlockDriver
*bdrv_qcow2
;
1320 QemuOpts
*opts
= NULL
;
1321 QDict
*snapshot_options
;
1322 BlockDriverState
*bs_snapshot
;
1326 /* if snapshot, we create a temporary backing file and open it
1327 instead of opening 'filename' directly */
1329 /* Get the required size from the image */
1330 total_size
= bdrv_getlength(bs
);
1331 if (total_size
< 0) {
1333 error_setg_errno(errp
, -total_size
, "Could not get image size");
1337 /* Create the temporary image */
1338 ret
= get_tmp_filename(tmp_filename
, PATH_MAX
+ 1);
1340 error_setg_errno(errp
, -ret
, "Could not get temporary filename");
1344 bdrv_qcow2
= bdrv_find_format("qcow2");
1345 opts
= qemu_opts_create(bdrv_qcow2
->create_opts
, NULL
, 0,
1347 qemu_opt_set_number(opts
, BLOCK_OPT_SIZE
, total_size
);
1348 ret
= bdrv_create(bdrv_qcow2
, tmp_filename
, opts
, &local_err
);
1349 qemu_opts_del(opts
);
1351 error_setg_errno(errp
, -ret
, "Could not create temporary overlay "
1352 "'%s': %s", tmp_filename
,
1353 error_get_pretty(local_err
));
1354 error_free(local_err
);
1358 /* Prepare a new options QDict for the temporary file */
1359 snapshot_options
= qdict_new();
1360 qdict_put(snapshot_options
, "file.driver",
1361 qstring_from_str("file"));
1362 qdict_put(snapshot_options
, "file.filename",
1363 qstring_from_str(tmp_filename
));
1365 bs_snapshot
= bdrv_new();
1367 ret
= bdrv_open(&bs_snapshot
, NULL
, NULL
, snapshot_options
,
1368 flags
, bdrv_qcow2
, &local_err
);
1370 error_propagate(errp
, local_err
);
1374 bdrv_append(bs_snapshot
, bs
);
1377 g_free(tmp_filename
);
1382 * Opens a disk image (raw, qcow2, vmdk, ...)
1384 * options is a QDict of options to pass to the block drivers, or NULL for an
1385 * empty set of options. The reference to the QDict belongs to the block layer
1386 * after the call (even on failure), so if the caller intends to reuse the
1387 * dictionary, it needs to use QINCREF() before calling bdrv_open.
1389 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1390 * If it is not NULL, the referenced BDS will be reused.
1392 * The reference parameter may be used to specify an existing block device which
1393 * should be opened. If specified, neither options nor a filename may be given,
1394 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
1396 int bdrv_open(BlockDriverState
**pbs
, const char *filename
,
1397 const char *reference
, QDict
*options
, int flags
,
1398 BlockDriver
*drv
, Error
**errp
)
1401 BlockDriverState
*file
= NULL
, *bs
;
1402 const char *drvname
;
1403 Error
*local_err
= NULL
;
1404 int snapshot_flags
= 0;
1409 bool options_non_empty
= options
? qdict_size(options
) : false;
1413 error_setg(errp
, "Cannot reuse an existing BDS when referencing "
1414 "another block device");
1418 if (filename
|| options_non_empty
) {
1419 error_setg(errp
, "Cannot reference an existing block device with "
1420 "additional options or a new filename");
1424 bs
= bdrv_lookup_bs(reference
, reference
, errp
);
1439 /* NULL means an empty set of options */
1440 if (options
== NULL
) {
1441 options
= qdict_new();
1444 ret
= bdrv_fill_options(&options
, &filename
, flags
, drv
, &local_err
);
1449 /* Find the right image format driver */
1451 drvname
= qdict_get_try_str(options
, "driver");
1453 drv
= bdrv_find_format(drvname
);
1454 qdict_del(options
, "driver");
1456 error_setg(errp
, "Unknown driver: '%s'", drvname
);
1462 assert(drvname
|| !(flags
& BDRV_O_PROTOCOL
));
1463 if (drv
&& !drv
->bdrv_file_open
) {
1464 /* If the user explicitly wants a format driver here, we'll need to add
1465 * another layer for the protocol in bs->file */
1466 flags
&= ~BDRV_O_PROTOCOL
;
1469 bs
->options
= options
;
1470 options
= qdict_clone_shallow(options
);
1472 /* Open image file without format layer */
1473 if ((flags
& BDRV_O_PROTOCOL
) == 0) {
1474 if (flags
& BDRV_O_RDWR
) {
1475 flags
|= BDRV_O_ALLOW_RDWR
;
1477 if (flags
& BDRV_O_SNAPSHOT
) {
1478 snapshot_flags
= bdrv_temp_snapshot_flags(flags
);
1479 flags
= bdrv_backing_flags(flags
);
1482 assert(file
== NULL
);
1483 ret
= bdrv_open_image(&file
, filename
, options
, "file",
1484 bdrv_inherited_flags(flags
),
1491 /* Image format probing */
1494 ret
= find_image_format(file
, filename
, &drv
, &local_err
);
1499 error_setg(errp
, "Must specify either driver or file");
1504 /* Open the image */
1505 ret
= bdrv_open_common(bs
, file
, options
, flags
, drv
, &local_err
);
1510 if (file
&& (bs
->file
!= file
)) {
1515 /* If there is a backing file, use it */
1516 if ((flags
& BDRV_O_NO_BACKING
) == 0) {
1517 QDict
*backing_options
;
1519 qdict_extract_subqdict(options
, &backing_options
, "backing.");
1520 ret
= bdrv_open_backing_file(bs
, backing_options
, &local_err
);
1522 goto close_and_fail
;
1526 bdrv_refresh_filename(bs
);
1528 /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1529 * temporary snapshot afterwards. */
1530 if (snapshot_flags
) {
1531 ret
= bdrv_append_temp_snapshot(bs
, snapshot_flags
, &local_err
);
1533 goto close_and_fail
;
1537 /* Check if any unknown options were used */
1538 if (options
&& (qdict_size(options
) != 0)) {
1539 const QDictEntry
*entry
= qdict_first(options
);
1540 if (flags
& BDRV_O_PROTOCOL
) {
1541 error_setg(errp
, "Block protocol '%s' doesn't support the option "
1542 "'%s'", drv
->format_name
, entry
->key
);
1544 error_setg(errp
, "Block format '%s' used by device '%s' doesn't "
1545 "support the option '%s'", drv
->format_name
,
1546 bdrv_get_device_name(bs
), entry
->key
);
1550 goto close_and_fail
;
1553 if (!bdrv_key_required(bs
)) {
1555 blk_dev_change_media_cb(bs
->blk
, true);
1557 } else if (!runstate_check(RUN_STATE_PRELAUNCH
)
1558 && !runstate_check(RUN_STATE_INMIGRATE
)
1559 && !runstate_check(RUN_STATE_PAUSED
)) { /* HACK */
1561 "Guest must be stopped for opening of encrypted image");
1563 goto close_and_fail
;
1574 QDECREF(bs
->options
);
1578 /* If *pbs is NULL, a new BDS has been created in this function and
1579 needs to be freed now. Otherwise, it does not need to be closed,
1580 since it has not really been opened yet. */
1584 error_propagate(errp
, local_err
);
1589 /* See fail path, but now the BDS has to be always closed */
1597 error_propagate(errp
, local_err
);
1602 typedef struct BlockReopenQueueEntry
{
1604 BDRVReopenState state
;
1605 QSIMPLEQ_ENTRY(BlockReopenQueueEntry
) entry
;
1606 } BlockReopenQueueEntry
;
1609 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1610 * reopen of multiple devices.
1612 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1613 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1614 * be created and initialized. This newly created BlockReopenQueue should be
1615 * passed back in for subsequent calls that are intended to be of the same
1618 * bs is the BlockDriverState to add to the reopen queue.
1620 * flags contains the open flags for the associated bs
1622 * returns a pointer to bs_queue, which is either the newly allocated
1623 * bs_queue, or the existing bs_queue being used.
1626 BlockReopenQueue
*bdrv_reopen_queue(BlockReopenQueue
*bs_queue
,
1627 BlockDriverState
*bs
, int flags
)
1631 BlockReopenQueueEntry
*bs_entry
;
1632 if (bs_queue
== NULL
) {
1633 bs_queue
= g_new0(BlockReopenQueue
, 1);
1634 QSIMPLEQ_INIT(bs_queue
);
1637 /* bdrv_open() masks this flag out */
1638 flags
&= ~BDRV_O_PROTOCOL
;
1641 bdrv_reopen_queue(bs_queue
, bs
->file
, bdrv_inherited_flags(flags
));
1644 bs_entry
= g_new0(BlockReopenQueueEntry
, 1);
1645 QSIMPLEQ_INSERT_TAIL(bs_queue
, bs_entry
, entry
);
1647 bs_entry
->state
.bs
= bs
;
1648 bs_entry
->state
.flags
= flags
;
1654 * Reopen multiple BlockDriverStates atomically & transactionally.
1656 * The queue passed in (bs_queue) must have been built up previous
1657 * via bdrv_reopen_queue().
1659 * Reopens all BDS specified in the queue, with the appropriate
1660 * flags. All devices are prepared for reopen, and failure of any
1661 * device will cause all device changes to be abandonded, and intermediate
1664 * If all devices prepare successfully, then the changes are committed
1668 int bdrv_reopen_multiple(BlockReopenQueue
*bs_queue
, Error
**errp
)
1671 BlockReopenQueueEntry
*bs_entry
, *next
;
1672 Error
*local_err
= NULL
;
1674 assert(bs_queue
!= NULL
);
1678 QSIMPLEQ_FOREACH(bs_entry
, bs_queue
, entry
) {
1679 if (bdrv_reopen_prepare(&bs_entry
->state
, bs_queue
, &local_err
)) {
1680 error_propagate(errp
, local_err
);
1683 bs_entry
->prepared
= true;
1686 /* If we reach this point, we have success and just need to apply the
1689 QSIMPLEQ_FOREACH(bs_entry
, bs_queue
, entry
) {
1690 bdrv_reopen_commit(&bs_entry
->state
);
1696 QSIMPLEQ_FOREACH_SAFE(bs_entry
, bs_queue
, entry
, next
) {
1697 if (ret
&& bs_entry
->prepared
) {
1698 bdrv_reopen_abort(&bs_entry
->state
);
1707 /* Reopen a single BlockDriverState with the specified flags. */
1708 int bdrv_reopen(BlockDriverState
*bs
, int bdrv_flags
, Error
**errp
)
1711 Error
*local_err
= NULL
;
1712 BlockReopenQueue
*queue
= bdrv_reopen_queue(NULL
, bs
, bdrv_flags
);
1714 ret
= bdrv_reopen_multiple(queue
, &local_err
);
1715 if (local_err
!= NULL
) {
1716 error_propagate(errp
, local_err
);
1723 * Prepares a BlockDriverState for reopen. All changes are staged in the
1724 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1725 * the block driver layer .bdrv_reopen_prepare()
1727 * bs is the BlockDriverState to reopen
1728 * flags are the new open flags
1729 * queue is the reopen queue
1731 * Returns 0 on success, non-zero on error. On error errp will be set
1734 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1735 * It is the responsibility of the caller to then call the abort() or
1736 * commit() for any other BDS that have been left in a prepare() state
1739 int bdrv_reopen_prepare(BDRVReopenState
*reopen_state
, BlockReopenQueue
*queue
,
1743 Error
*local_err
= NULL
;
1746 assert(reopen_state
!= NULL
);
1747 assert(reopen_state
->bs
->drv
!= NULL
);
1748 drv
= reopen_state
->bs
->drv
;
1750 /* if we are to stay read-only, do not allow permission change
1752 if (!(reopen_state
->bs
->open_flags
& BDRV_O_ALLOW_RDWR
) &&
1753 reopen_state
->flags
& BDRV_O_RDWR
) {
1754 error_set(errp
, QERR_DEVICE_IS_READ_ONLY
,
1755 bdrv_get_device_name(reopen_state
->bs
));
1760 ret
= bdrv_flush(reopen_state
->bs
);
1762 error_set(errp
, ERROR_CLASS_GENERIC_ERROR
, "Error (%s) flushing drive",
1767 if (drv
->bdrv_reopen_prepare
) {
1768 ret
= drv
->bdrv_reopen_prepare(reopen_state
, queue
, &local_err
);
1770 if (local_err
!= NULL
) {
1771 error_propagate(errp
, local_err
);
1773 error_setg(errp
, "failed while preparing to reopen image '%s'",
1774 reopen_state
->bs
->filename
);
1779 /* It is currently mandatory to have a bdrv_reopen_prepare()
1780 * handler for each supported drv. */
1781 error_set(errp
, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED
,
1782 drv
->format_name
, bdrv_get_device_name(reopen_state
->bs
),
1783 "reopening of file");
1795 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1796 * makes them final by swapping the staging BlockDriverState contents into
1797 * the active BlockDriverState contents.
1799 void bdrv_reopen_commit(BDRVReopenState
*reopen_state
)
1803 assert(reopen_state
!= NULL
);
1804 drv
= reopen_state
->bs
->drv
;
1805 assert(drv
!= NULL
);
1807 /* If there are any driver level actions to take */
1808 if (drv
->bdrv_reopen_commit
) {
1809 drv
->bdrv_reopen_commit(reopen_state
);
1812 /* set BDS specific flags now */
1813 reopen_state
->bs
->open_flags
= reopen_state
->flags
;
1814 reopen_state
->bs
->enable_write_cache
= !!(reopen_state
->flags
&
1816 reopen_state
->bs
->read_only
= !(reopen_state
->flags
& BDRV_O_RDWR
);
1818 bdrv_refresh_limits(reopen_state
->bs
, NULL
);
1822 * Abort the reopen, and delete and free the staged changes in
1825 void bdrv_reopen_abort(BDRVReopenState
*reopen_state
)
1829 assert(reopen_state
!= NULL
);
1830 drv
= reopen_state
->bs
->drv
;
1831 assert(drv
!= NULL
);
1833 if (drv
->bdrv_reopen_abort
) {
1834 drv
->bdrv_reopen_abort(reopen_state
);
1839 void bdrv_close(BlockDriverState
*bs
)
1841 BdrvAioNotifier
*ban
, *ban_next
;
1844 block_job_cancel_sync(bs
->job
);
1846 bdrv_drain_all(); /* complete I/O */
1848 bdrv_drain_all(); /* in case flush left pending I/O */
1849 notifier_list_notify(&bs
->close_notifiers
, bs
);
1852 if (bs
->backing_hd
) {
1853 BlockDriverState
*backing_hd
= bs
->backing_hd
;
1854 bdrv_set_backing_hd(bs
, NULL
);
1855 bdrv_unref(backing_hd
);
1857 bs
->drv
->bdrv_close(bs
);
1861 bs
->copy_on_read
= 0;
1862 bs
->backing_file
[0] = '\0';
1863 bs
->backing_format
[0] = '\0';
1864 bs
->total_sectors
= 0;
1869 bs
->zero_beyond_eof
= false;
1870 QDECREF(bs
->options
);
1872 QDECREF(bs
->full_open_options
);
1873 bs
->full_open_options
= NULL
;
1875 if (bs
->file
!= NULL
) {
1876 bdrv_unref(bs
->file
);
1882 blk_dev_change_media_cb(bs
->blk
, false);
1885 /*throttling disk I/O limits*/
1886 if (bs
->io_limits_enabled
) {
1887 bdrv_io_limits_disable(bs
);
1890 QLIST_FOREACH_SAFE(ban
, &bs
->aio_notifiers
, list
, ban_next
) {
1893 QLIST_INIT(&bs
->aio_notifiers
);
1896 void bdrv_close_all(void)
1898 BlockDriverState
*bs
;
1900 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
1901 AioContext
*aio_context
= bdrv_get_aio_context(bs
);
1903 aio_context_acquire(aio_context
);
1905 aio_context_release(aio_context
);
1909 /* Check if any requests are in-flight (including throttled requests) */
1910 static bool bdrv_requests_pending(BlockDriverState
*bs
)
1912 if (!QLIST_EMPTY(&bs
->tracked_requests
)) {
1915 if (!qemu_co_queue_empty(&bs
->throttled_reqs
[0])) {
1918 if (!qemu_co_queue_empty(&bs
->throttled_reqs
[1])) {
1921 if (bs
->file
&& bdrv_requests_pending(bs
->file
)) {
1924 if (bs
->backing_hd
&& bdrv_requests_pending(bs
->backing_hd
)) {
1930 static bool bdrv_drain_one(BlockDriverState
*bs
)
1934 bdrv_flush_io_queue(bs
);
1935 bdrv_start_throttled_reqs(bs
);
1936 bs_busy
= bdrv_requests_pending(bs
);
1937 bs_busy
|= aio_poll(bdrv_get_aio_context(bs
), bs_busy
);
1942 * Wait for pending requests to complete on a single BlockDriverState subtree
1944 * See the warning in bdrv_drain_all(). This function can only be called if
1945 * you are sure nothing can generate I/O because you have op blockers
1948 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
1951 void bdrv_drain(BlockDriverState
*bs
)
1953 while (bdrv_drain_one(bs
)) {
1954 /* Keep iterating */
1959 * Wait for pending requests to complete across all BlockDriverStates
1961 * This function does not flush data to disk, use bdrv_flush_all() for that
1962 * after calling this function.
1964 * Note that completion of an asynchronous I/O operation can trigger any
1965 * number of other I/O operations on other devices---for example a coroutine
1966 * can be arbitrarily complex and a constant flow of I/O can come until the
1967 * coroutine is complete. Because of this, it is not possible to have a
1968 * function to drain a single device's I/O queue.
1970 void bdrv_drain_all(void)
1972 /* Always run first iteration so any pending completion BHs run */
1974 BlockDriverState
*bs
;
1979 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
1980 AioContext
*aio_context
= bdrv_get_aio_context(bs
);
1982 aio_context_acquire(aio_context
);
1983 busy
|= bdrv_drain_one(bs
);
1984 aio_context_release(aio_context
);
1989 /* make a BlockDriverState anonymous by removing from bdrv_state and
1990 * graph_bdrv_state list.
1991 Also, NULL terminate the device_name to prevent double remove */
1992 void bdrv_make_anon(BlockDriverState
*bs
)
1995 * Take care to remove bs from bdrv_states only when it's actually
1996 * in it. Note that bs->device_list.tqe_prev is initially null,
1997 * and gets set to non-null by QTAILQ_INSERT_TAIL(). Establish
1998 * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
1999 * resetting it to null on remove.
2001 if (bs
->device_list
.tqe_prev
) {
2002 QTAILQ_REMOVE(&bdrv_states
, bs
, device_list
);
2003 bs
->device_list
.tqe_prev
= NULL
;
2005 if (bs
->node_name
[0] != '\0') {
2006 QTAILQ_REMOVE(&graph_bdrv_states
, bs
, node_list
);
2008 bs
->node_name
[0] = '\0';
2011 static void bdrv_rebind(BlockDriverState
*bs
)
2013 if (bs
->drv
&& bs
->drv
->bdrv_rebind
) {
2014 bs
->drv
->bdrv_rebind(bs
);
2018 static void bdrv_move_feature_fields(BlockDriverState
*bs_dest
,
2019 BlockDriverState
*bs_src
)
2021 /* move some fields that need to stay attached to the device */
2024 bs_dest
->guest_block_size
= bs_src
->guest_block_size
;
2025 bs_dest
->copy_on_read
= bs_src
->copy_on_read
;
2027 bs_dest
->enable_write_cache
= bs_src
->enable_write_cache
;
2029 /* i/o throttled req */
2030 memcpy(&bs_dest
->throttle_state
,
2031 &bs_src
->throttle_state
,
2032 sizeof(ThrottleState
));
2033 bs_dest
->throttled_reqs
[0] = bs_src
->throttled_reqs
[0];
2034 bs_dest
->throttled_reqs
[1] = bs_src
->throttled_reqs
[1];
2035 bs_dest
->io_limits_enabled
= bs_src
->io_limits_enabled
;
2038 bs_dest
->on_read_error
= bs_src
->on_read_error
;
2039 bs_dest
->on_write_error
= bs_src
->on_write_error
;
2042 bs_dest
->iostatus_enabled
= bs_src
->iostatus_enabled
;
2043 bs_dest
->iostatus
= bs_src
->iostatus
;
2046 bs_dest
->dirty_bitmaps
= bs_src
->dirty_bitmaps
;
2048 /* reference count */
2049 bs_dest
->refcnt
= bs_src
->refcnt
;
2052 bs_dest
->job
= bs_src
->job
;
2054 /* keep the same entry in bdrv_states */
2055 bs_dest
->device_list
= bs_src
->device_list
;
2056 bs_dest
->blk
= bs_src
->blk
;
2058 memcpy(bs_dest
->op_blockers
, bs_src
->op_blockers
,
2059 sizeof(bs_dest
->op_blockers
));
2063 * Swap bs contents for two image chains while they are live,
2064 * while keeping required fields on the BlockDriverState that is
2065 * actually attached to a device.
2067 * This will modify the BlockDriverState fields, and swap contents
2068 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2070 * bs_new must not be attached to a BlockBackend.
2072 * This function does not create any image files.
2074 void bdrv_swap(BlockDriverState
*bs_new
, BlockDriverState
*bs_old
)
2076 BlockDriverState tmp
;
2078 /* The code needs to swap the node_name but simply swapping node_list won't
2079 * work so first remove the nodes from the graph list, do the swap then
2080 * insert them back if needed.
2082 if (bs_new
->node_name
[0] != '\0') {
2083 QTAILQ_REMOVE(&graph_bdrv_states
, bs_new
, node_list
);
2085 if (bs_old
->node_name
[0] != '\0') {
2086 QTAILQ_REMOVE(&graph_bdrv_states
, bs_old
, node_list
);
2089 /* bs_new must be unattached and shouldn't have anything fancy enabled */
2090 assert(!bs_new
->blk
);
2091 assert(QLIST_EMPTY(&bs_new
->dirty_bitmaps
));
2092 assert(bs_new
->job
== NULL
);
2093 assert(bs_new
->io_limits_enabled
== false);
2094 assert(!throttle_have_timer(&bs_new
->throttle_state
));
2100 /* there are some fields that should not be swapped, move them back */
2101 bdrv_move_feature_fields(&tmp
, bs_old
);
2102 bdrv_move_feature_fields(bs_old
, bs_new
);
2103 bdrv_move_feature_fields(bs_new
, &tmp
);
2105 /* bs_new must remain unattached */
2106 assert(!bs_new
->blk
);
2108 /* Check a few fields that should remain attached to the device */
2109 assert(bs_new
->job
== NULL
);
2110 assert(bs_new
->io_limits_enabled
== false);
2111 assert(!throttle_have_timer(&bs_new
->throttle_state
));
2113 /* insert the nodes back into the graph node list if needed */
2114 if (bs_new
->node_name
[0] != '\0') {
2115 QTAILQ_INSERT_TAIL(&graph_bdrv_states
, bs_new
, node_list
);
2117 if (bs_old
->node_name
[0] != '\0') {
2118 QTAILQ_INSERT_TAIL(&graph_bdrv_states
, bs_old
, node_list
);
2121 bdrv_rebind(bs_new
);
2122 bdrv_rebind(bs_old
);
2126 * Add new bs contents at the top of an image chain while the chain is
2127 * live, while keeping required fields on the top layer.
2129 * This will modify the BlockDriverState fields, and swap contents
2130 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2132 * bs_new must not be attached to a BlockBackend.
2134 * This function does not create any image files.
2136 void bdrv_append(BlockDriverState
*bs_new
, BlockDriverState
*bs_top
)
2138 bdrv_swap(bs_new
, bs_top
);
2140 /* The contents of 'tmp' will become bs_top, as we are
2141 * swapping bs_new and bs_top contents. */
2142 bdrv_set_backing_hd(bs_top
, bs_new
);
2145 static void bdrv_delete(BlockDriverState
*bs
)
2148 assert(bdrv_op_blocker_is_empty(bs
));
2149 assert(!bs
->refcnt
);
2150 assert(QLIST_EMPTY(&bs
->dirty_bitmaps
));
2154 /* remove from list, if necessary */
2161 * Run consistency checks on an image
2163 * Returns 0 if the check could be completed (it doesn't mean that the image is
2164 * free of errors) or -errno when an internal error occurred. The results of the
2165 * check are stored in res.
2167 int bdrv_check(BlockDriverState
*bs
, BdrvCheckResult
*res
, BdrvCheckMode fix
)
2169 if (bs
->drv
== NULL
) {
2172 if (bs
->drv
->bdrv_check
== NULL
) {
2176 memset(res
, 0, sizeof(*res
));
2177 return bs
->drv
->bdrv_check(bs
, res
, fix
);
2180 #define COMMIT_BUF_SECTORS 2048
2182 /* commit COW file into the raw image */
2183 int bdrv_commit(BlockDriverState
*bs
)
2185 BlockDriver
*drv
= bs
->drv
;
2186 int64_t sector
, total_sectors
, length
, backing_length
;
2187 int n
, ro
, open_flags
;
2189 uint8_t *buf
= NULL
;
2190 char filename
[PATH_MAX
];
2195 if (!bs
->backing_hd
) {
2199 if (bdrv_op_is_blocked(bs
, BLOCK_OP_TYPE_COMMIT
, NULL
) ||
2200 bdrv_op_is_blocked(bs
->backing_hd
, BLOCK_OP_TYPE_COMMIT
, NULL
)) {
2204 ro
= bs
->backing_hd
->read_only
;
2205 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2206 pstrcpy(filename
, sizeof(filename
), bs
->backing_hd
->filename
);
2207 open_flags
= bs
->backing_hd
->open_flags
;
2210 if (bdrv_reopen(bs
->backing_hd
, open_flags
| BDRV_O_RDWR
, NULL
)) {
2215 length
= bdrv_getlength(bs
);
2221 backing_length
= bdrv_getlength(bs
->backing_hd
);
2222 if (backing_length
< 0) {
2223 ret
= backing_length
;
2227 /* If our top snapshot is larger than the backing file image,
2228 * grow the backing file image if possible. If not possible,
2229 * we must return an error */
2230 if (length
> backing_length
) {
2231 ret
= bdrv_truncate(bs
->backing_hd
, length
);
2237 total_sectors
= length
>> BDRV_SECTOR_BITS
;
2239 /* qemu_try_blockalign() for bs will choose an alignment that works for
2240 * bs->backing_hd as well, so no need to compare the alignment manually. */
2241 buf
= qemu_try_blockalign(bs
, COMMIT_BUF_SECTORS
* BDRV_SECTOR_SIZE
);
2247 for (sector
= 0; sector
< total_sectors
; sector
+= n
) {
2248 ret
= bdrv_is_allocated(bs
, sector
, COMMIT_BUF_SECTORS
, &n
);
2253 ret
= bdrv_read(bs
, sector
, buf
, n
);
2258 ret
= bdrv_write(bs
->backing_hd
, sector
, buf
, n
);
2265 if (drv
->bdrv_make_empty
) {
2266 ret
= drv
->bdrv_make_empty(bs
);
2274 * Make sure all data we wrote to the backing device is actually
2277 if (bs
->backing_hd
) {
2278 bdrv_flush(bs
->backing_hd
);
2286 /* ignoring error return here */
2287 bdrv_reopen(bs
->backing_hd
, open_flags
& ~BDRV_O_RDWR
, NULL
);
2293 int bdrv_commit_all(void)
2295 BlockDriverState
*bs
;
2297 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
2298 AioContext
*aio_context
= bdrv_get_aio_context(bs
);
2300 aio_context_acquire(aio_context
);
2301 if (bs
->drv
&& bs
->backing_hd
) {
2302 int ret
= bdrv_commit(bs
);
2304 aio_context_release(aio_context
);
2308 aio_context_release(aio_context
);
2314 * Remove an active request from the tracked requests list
2316 * This function should be called when a tracked request is completing.
2318 static void tracked_request_end(BdrvTrackedRequest
*req
)
2320 if (req
->serialising
) {
2321 req
->bs
->serialising_in_flight
--;
2324 QLIST_REMOVE(req
, list
);
2325 qemu_co_queue_restart_all(&req
->wait_queue
);
2329 * Add an active request to the tracked requests list
2331 static void tracked_request_begin(BdrvTrackedRequest
*req
,
2332 BlockDriverState
*bs
,
2334 unsigned int bytes
, bool is_write
)
2336 *req
= (BdrvTrackedRequest
){
2340 .is_write
= is_write
,
2341 .co
= qemu_coroutine_self(),
2342 .serialising
= false,
2343 .overlap_offset
= offset
,
2344 .overlap_bytes
= bytes
,
2347 qemu_co_queue_init(&req
->wait_queue
);
2349 QLIST_INSERT_HEAD(&bs
->tracked_requests
, req
, list
);
2352 static void mark_request_serialising(BdrvTrackedRequest
*req
, uint64_t align
)
2354 int64_t overlap_offset
= req
->offset
& ~(align
- 1);
2355 unsigned int overlap_bytes
= ROUND_UP(req
->offset
+ req
->bytes
, align
)
2358 if (!req
->serialising
) {
2359 req
->bs
->serialising_in_flight
++;
2360 req
->serialising
= true;
2363 req
->overlap_offset
= MIN(req
->overlap_offset
, overlap_offset
);
2364 req
->overlap_bytes
= MAX(req
->overlap_bytes
, overlap_bytes
);
2368 * Round a region to cluster boundaries
2370 void bdrv_round_to_clusters(BlockDriverState
*bs
,
2371 int64_t sector_num
, int nb_sectors
,
2372 int64_t *cluster_sector_num
,
2373 int *cluster_nb_sectors
)
2375 BlockDriverInfo bdi
;
2377 if (bdrv_get_info(bs
, &bdi
) < 0 || bdi
.cluster_size
== 0) {
2378 *cluster_sector_num
= sector_num
;
2379 *cluster_nb_sectors
= nb_sectors
;
2381 int64_t c
= bdi
.cluster_size
/ BDRV_SECTOR_SIZE
;
2382 *cluster_sector_num
= QEMU_ALIGN_DOWN(sector_num
, c
);
2383 *cluster_nb_sectors
= QEMU_ALIGN_UP(sector_num
- *cluster_sector_num
+
2388 static int bdrv_get_cluster_size(BlockDriverState
*bs
)
2390 BlockDriverInfo bdi
;
2393 ret
= bdrv_get_info(bs
, &bdi
);
2394 if (ret
< 0 || bdi
.cluster_size
== 0) {
2395 return bs
->request_alignment
;
2397 return bdi
.cluster_size
;
2401 static bool tracked_request_overlaps(BdrvTrackedRequest
*req
,
2402 int64_t offset
, unsigned int bytes
)
2405 if (offset
>= req
->overlap_offset
+ req
->overlap_bytes
) {
2409 if (req
->overlap_offset
>= offset
+ bytes
) {
2415 static bool coroutine_fn
wait_serialising_requests(BdrvTrackedRequest
*self
)
2417 BlockDriverState
*bs
= self
->bs
;
2418 BdrvTrackedRequest
*req
;
2420 bool waited
= false;
2422 if (!bs
->serialising_in_flight
) {
2428 QLIST_FOREACH(req
, &bs
->tracked_requests
, list
) {
2429 if (req
== self
|| (!req
->serialising
&& !self
->serialising
)) {
2432 if (tracked_request_overlaps(req
, self
->overlap_offset
,
2433 self
->overlap_bytes
))
2435 /* Hitting this means there was a reentrant request, for
2436 * example, a block driver issuing nested requests. This must
2437 * never happen since it means deadlock.
2439 assert(qemu_coroutine_self() != req
->co
);
2441 /* If the request is already (indirectly) waiting for us, or
2442 * will wait for us as soon as it wakes up, then just go on
2443 * (instead of producing a deadlock in the former case). */
2444 if (!req
->waiting_for
) {
2445 self
->waiting_for
= req
;
2446 qemu_co_queue_wait(&req
->wait_queue
);
2447 self
->waiting_for
= NULL
;
2462 * -EINVAL - backing format specified, but no file
2463 * -ENOSPC - can't update the backing file because no space is left in the
2465 * -ENOTSUP - format driver doesn't support changing the backing file
2467 int bdrv_change_backing_file(BlockDriverState
*bs
,
2468 const char *backing_file
, const char *backing_fmt
)
2470 BlockDriver
*drv
= bs
->drv
;
2473 /* Backing file format doesn't make sense without a backing file */
2474 if (backing_fmt
&& !backing_file
) {
2478 if (drv
->bdrv_change_backing_file
!= NULL
) {
2479 ret
= drv
->bdrv_change_backing_file(bs
, backing_file
, backing_fmt
);
2485 pstrcpy(bs
->backing_file
, sizeof(bs
->backing_file
), backing_file
?: "");
2486 pstrcpy(bs
->backing_format
, sizeof(bs
->backing_format
), backing_fmt
?: "");
2492 * Finds the image layer in the chain that has 'bs' as its backing file.
2494 * active is the current topmost image.
2496 * Returns NULL if bs is not found in active's image chain,
2497 * or if active == bs.
2499 * Returns the bottommost base image if bs == NULL.
2501 BlockDriverState
*bdrv_find_overlay(BlockDriverState
*active
,
2502 BlockDriverState
*bs
)
2504 while (active
&& bs
!= active
->backing_hd
) {
2505 active
= active
->backing_hd
;
2511 /* Given a BDS, searches for the base layer. */
2512 BlockDriverState
*bdrv_find_base(BlockDriverState
*bs
)
2514 return bdrv_find_overlay(bs
, NULL
);
2517 typedef struct BlkIntermediateStates
{
2518 BlockDriverState
*bs
;
2519 QSIMPLEQ_ENTRY(BlkIntermediateStates
) entry
;
2520 } BlkIntermediateStates
;
2524 * Drops images above 'base' up to and including 'top', and sets the image
2525 * above 'top' to have base as its backing file.
2527 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2528 * information in 'bs' can be properly updated.
2530 * E.g., this will convert the following chain:
2531 * bottom <- base <- intermediate <- top <- active
2535 * bottom <- base <- active
2537 * It is allowed for bottom==base, in which case it converts:
2539 * base <- intermediate <- top <- active
2545 * If backing_file_str is non-NULL, it will be used when modifying top's
2546 * overlay image metadata.
2549 * if active == top, that is considered an error
2552 int bdrv_drop_intermediate(BlockDriverState
*active
, BlockDriverState
*top
,
2553 BlockDriverState
*base
, const char *backing_file_str
)
2555 BlockDriverState
*intermediate
;
2556 BlockDriverState
*base_bs
= NULL
;
2557 BlockDriverState
*new_top_bs
= NULL
;
2558 BlkIntermediateStates
*intermediate_state
, *next
;
2561 QSIMPLEQ_HEAD(states_to_delete
, BlkIntermediateStates
) states_to_delete
;
2562 QSIMPLEQ_INIT(&states_to_delete
);
2564 if (!top
->drv
|| !base
->drv
) {
2568 new_top_bs
= bdrv_find_overlay(active
, top
);
2570 if (new_top_bs
== NULL
) {
2571 /* we could not find the image above 'top', this is an error */
2575 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2576 * to do, no intermediate images */
2577 if (new_top_bs
->backing_hd
== base
) {
2584 /* now we will go down through the list, and add each BDS we find
2585 * into our deletion queue, until we hit the 'base'
2587 while (intermediate
) {
2588 intermediate_state
= g_new0(BlkIntermediateStates
, 1);
2589 intermediate_state
->bs
= intermediate
;
2590 QSIMPLEQ_INSERT_TAIL(&states_to_delete
, intermediate_state
, entry
);
2592 if (intermediate
->backing_hd
== base
) {
2593 base_bs
= intermediate
->backing_hd
;
2596 intermediate
= intermediate
->backing_hd
;
2598 if (base_bs
== NULL
) {
2599 /* something went wrong, we did not end at the base. safely
2600 * unravel everything, and exit with error */
2604 /* success - we can delete the intermediate states, and link top->base */
2605 backing_file_str
= backing_file_str
? backing_file_str
: base_bs
->filename
;
2606 ret
= bdrv_change_backing_file(new_top_bs
, backing_file_str
,
2607 base_bs
->drv
? base_bs
->drv
->format_name
: "");
2611 bdrv_set_backing_hd(new_top_bs
, base_bs
);
2613 QSIMPLEQ_FOREACH_SAFE(intermediate_state
, &states_to_delete
, entry
, next
) {
2614 /* so that bdrv_close() does not recursively close the chain */
2615 bdrv_set_backing_hd(intermediate_state
->bs
, NULL
);
2616 bdrv_unref(intermediate_state
->bs
);
2621 QSIMPLEQ_FOREACH_SAFE(intermediate_state
, &states_to_delete
, entry
, next
) {
2622 g_free(intermediate_state
);
2628 static int bdrv_check_byte_request(BlockDriverState
*bs
, int64_t offset
,
2633 if (size
> INT_MAX
) {
2637 if (!bdrv_is_inserted(bs
))
2643 len
= bdrv_getlength(bs
);
2648 if ((offset
> len
) || (len
- offset
< size
))
2654 static int bdrv_check_request(BlockDriverState
*bs
, int64_t sector_num
,
2657 if (nb_sectors
< 0 || nb_sectors
> INT_MAX
/ BDRV_SECTOR_SIZE
) {
2661 return bdrv_check_byte_request(bs
, sector_num
* BDRV_SECTOR_SIZE
,
2662 nb_sectors
* BDRV_SECTOR_SIZE
);
2665 typedef struct RwCo
{
2666 BlockDriverState
*bs
;
2671 BdrvRequestFlags flags
;
2674 static void coroutine_fn
bdrv_rw_co_entry(void *opaque
)
2676 RwCo
*rwco
= opaque
;
2678 if (!rwco
->is_write
) {
2679 rwco
->ret
= bdrv_co_do_preadv(rwco
->bs
, rwco
->offset
,
2680 rwco
->qiov
->size
, rwco
->qiov
,
2683 rwco
->ret
= bdrv_co_do_pwritev(rwco
->bs
, rwco
->offset
,
2684 rwco
->qiov
->size
, rwco
->qiov
,
2690 * Process a vectored synchronous request using coroutines
2692 static int bdrv_prwv_co(BlockDriverState
*bs
, int64_t offset
,
2693 QEMUIOVector
*qiov
, bool is_write
,
2694 BdrvRequestFlags flags
)
2701 .is_write
= is_write
,
2707 * In sync call context, when the vcpu is blocked, this throttling timer
2708 * will not fire; so the I/O throttling function has to be disabled here
2709 * if it has been enabled.
2711 if (bs
->io_limits_enabled
) {
2712 fprintf(stderr
, "Disabling I/O throttling on '%s' due "
2713 "to synchronous I/O.\n", bdrv_get_device_name(bs
));
2714 bdrv_io_limits_disable(bs
);
2717 if (qemu_in_coroutine()) {
2718 /* Fast-path if already in coroutine context */
2719 bdrv_rw_co_entry(&rwco
);
2721 AioContext
*aio_context
= bdrv_get_aio_context(bs
);
2723 co
= qemu_coroutine_create(bdrv_rw_co_entry
);
2724 qemu_coroutine_enter(co
, &rwco
);
2725 while (rwco
.ret
== NOT_DONE
) {
2726 aio_poll(aio_context
, true);
2733 * Process a synchronous request using coroutines
2735 static int bdrv_rw_co(BlockDriverState
*bs
, int64_t sector_num
, uint8_t *buf
,
2736 int nb_sectors
, bool is_write
, BdrvRequestFlags flags
)
2739 struct iovec iov
= {
2740 .iov_base
= (void *)buf
,
2741 .iov_len
= nb_sectors
* BDRV_SECTOR_SIZE
,
2744 if (nb_sectors
< 0 || nb_sectors
> INT_MAX
/ BDRV_SECTOR_SIZE
) {
2748 qemu_iovec_init_external(&qiov
, &iov
, 1);
2749 return bdrv_prwv_co(bs
, sector_num
<< BDRV_SECTOR_BITS
,
2750 &qiov
, is_write
, flags
);
2753 /* return < 0 if error. See bdrv_write() for the return codes */
2754 int bdrv_read(BlockDriverState
*bs
, int64_t sector_num
,
2755 uint8_t *buf
, int nb_sectors
)
2757 return bdrv_rw_co(bs
, sector_num
, buf
, nb_sectors
, false, 0);
2760 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2761 int bdrv_read_unthrottled(BlockDriverState
*bs
, int64_t sector_num
,
2762 uint8_t *buf
, int nb_sectors
)
2767 enabled
= bs
->io_limits_enabled
;
2768 bs
->io_limits_enabled
= false;
2769 ret
= bdrv_read(bs
, sector_num
, buf
, nb_sectors
);
2770 bs
->io_limits_enabled
= enabled
;
2774 /* Return < 0 if error. Important errors are:
2775 -EIO generic I/O error (may happen for all errors)
2776 -ENOMEDIUM No media inserted.
2777 -EINVAL Invalid sector number or nb_sectors
2778 -EACCES Trying to write a read-only device
2780 int bdrv_write(BlockDriverState
*bs
, int64_t sector_num
,
2781 const uint8_t *buf
, int nb_sectors
)
2783 return bdrv_rw_co(bs
, sector_num
, (uint8_t *)buf
, nb_sectors
, true, 0);
2786 int bdrv_write_zeroes(BlockDriverState
*bs
, int64_t sector_num
,
2787 int nb_sectors
, BdrvRequestFlags flags
)
2789 return bdrv_rw_co(bs
, sector_num
, NULL
, nb_sectors
, true,
2790 BDRV_REQ_ZERO_WRITE
| flags
);
2794 * Completely zero out a block device with the help of bdrv_write_zeroes.
2795 * The operation is sped up by checking the block status and only writing
2796 * zeroes to the device if they currently do not return zeroes. Optional
2797 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2799 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2801 int bdrv_make_zero(BlockDriverState
*bs
, BdrvRequestFlags flags
)
2803 int64_t target_sectors
, ret
, nb_sectors
, sector_num
= 0;
2806 target_sectors
= bdrv_nb_sectors(bs
);
2807 if (target_sectors
< 0) {
2808 return target_sectors
;
2812 nb_sectors
= target_sectors
- sector_num
;
2813 if (nb_sectors
<= 0) {
2816 if (nb_sectors
> INT_MAX
/ BDRV_SECTOR_SIZE
) {
2817 nb_sectors
= INT_MAX
/ BDRV_SECTOR_SIZE
;
2819 ret
= bdrv_get_block_status(bs
, sector_num
, nb_sectors
, &n
);
2821 error_report("error getting block status at sector %" PRId64
": %s",
2822 sector_num
, strerror(-ret
));
2825 if (ret
& BDRV_BLOCK_ZERO
) {
2829 ret
= bdrv_write_zeroes(bs
, sector_num
, n
, flags
);
2831 error_report("error writing zeroes at sector %" PRId64
": %s",
2832 sector_num
, strerror(-ret
));
2839 int bdrv_pread(BlockDriverState
*bs
, int64_t offset
, void *buf
, int bytes
)
2842 struct iovec iov
= {
2843 .iov_base
= (void *)buf
,
2852 qemu_iovec_init_external(&qiov
, &iov
, 1);
2853 ret
= bdrv_prwv_co(bs
, offset
, &qiov
, false, 0);
2861 int bdrv_pwritev(BlockDriverState
*bs
, int64_t offset
, QEMUIOVector
*qiov
)
2865 ret
= bdrv_prwv_co(bs
, offset
, qiov
, true, 0);
2873 int bdrv_pwrite(BlockDriverState
*bs
, int64_t offset
,
2874 const void *buf
, int bytes
)
2877 struct iovec iov
= {
2878 .iov_base
= (void *) buf
,
2886 qemu_iovec_init_external(&qiov
, &iov
, 1);
2887 return bdrv_pwritev(bs
, offset
, &qiov
);
2891 * Writes to the file and ensures that no writes are reordered across this
2892 * request (acts as a barrier)
2894 * Returns 0 on success, -errno in error cases.
2896 int bdrv_pwrite_sync(BlockDriverState
*bs
, int64_t offset
,
2897 const void *buf
, int count
)
2901 ret
= bdrv_pwrite(bs
, offset
, buf
, count
);
2906 /* No flush needed for cache modes that already do it */
2907 if (bs
->enable_write_cache
) {
2914 static int coroutine_fn
bdrv_co_do_copy_on_readv(BlockDriverState
*bs
,
2915 int64_t sector_num
, int nb_sectors
, QEMUIOVector
*qiov
)
2917 /* Perform I/O through a temporary buffer so that users who scribble over
2918 * their read buffer while the operation is in progress do not end up
2919 * modifying the image file. This is critical for zero-copy guest I/O
2920 * where anything might happen inside guest memory.
2922 void *bounce_buffer
;
2924 BlockDriver
*drv
= bs
->drv
;
2926 QEMUIOVector bounce_qiov
;
2927 int64_t cluster_sector_num
;
2928 int cluster_nb_sectors
;
2932 /* Cover entire cluster so no additional backing file I/O is required when
2933 * allocating cluster in the image file.
2935 bdrv_round_to_clusters(bs
, sector_num
, nb_sectors
,
2936 &cluster_sector_num
, &cluster_nb_sectors
);
2938 trace_bdrv_co_do_copy_on_readv(bs
, sector_num
, nb_sectors
,
2939 cluster_sector_num
, cluster_nb_sectors
);
2941 iov
.iov_len
= cluster_nb_sectors
* BDRV_SECTOR_SIZE
;
2942 iov
.iov_base
= bounce_buffer
= qemu_try_blockalign(bs
, iov
.iov_len
);
2943 if (bounce_buffer
== NULL
) {
2948 qemu_iovec_init_external(&bounce_qiov
, &iov
, 1);
2950 ret
= drv
->bdrv_co_readv(bs
, cluster_sector_num
, cluster_nb_sectors
,
2956 if (drv
->bdrv_co_write_zeroes
&&
2957 buffer_is_zero(bounce_buffer
, iov
.iov_len
)) {
2958 ret
= bdrv_co_do_write_zeroes(bs
, cluster_sector_num
,
2959 cluster_nb_sectors
, 0);
2961 /* This does not change the data on the disk, it is not necessary
2962 * to flush even in cache=writethrough mode.
2964 ret
= drv
->bdrv_co_writev(bs
, cluster_sector_num
, cluster_nb_sectors
,
2969 /* It might be okay to ignore write errors for guest requests. If this
2970 * is a deliberate copy-on-read then we don't want to ignore the error.
2971 * Simply report it in all cases.
2976 skip_bytes
= (sector_num
- cluster_sector_num
) * BDRV_SECTOR_SIZE
;
2977 qemu_iovec_from_buf(qiov
, 0, bounce_buffer
+ skip_bytes
,
2978 nb_sectors
* BDRV_SECTOR_SIZE
);
2981 qemu_vfree(bounce_buffer
);
2986 * Forwards an already correctly aligned request to the BlockDriver. This
2987 * handles copy on read and zeroing after EOF; any other features must be
2988 * implemented by the caller.
2990 static int coroutine_fn
bdrv_aligned_preadv(BlockDriverState
*bs
,
2991 BdrvTrackedRequest
*req
, int64_t offset
, unsigned int bytes
,
2992 int64_t align
, QEMUIOVector
*qiov
, int flags
)
2994 BlockDriver
*drv
= bs
->drv
;
2997 int64_t sector_num
= offset
>> BDRV_SECTOR_BITS
;
2998 unsigned int nb_sectors
= bytes
>> BDRV_SECTOR_BITS
;
3000 assert((offset
& (BDRV_SECTOR_SIZE
- 1)) == 0);
3001 assert((bytes
& (BDRV_SECTOR_SIZE
- 1)) == 0);
3002 assert(!qiov
|| bytes
== qiov
->size
);
3004 /* Handle Copy on Read and associated serialisation */
3005 if (flags
& BDRV_REQ_COPY_ON_READ
) {
3006 /* If we touch the same cluster it counts as an overlap. This
3007 * guarantees that allocating writes will be serialized and not race
3008 * with each other for the same cluster. For example, in copy-on-read
3009 * it ensures that the CoR read and write operations are atomic and
3010 * guest writes cannot interleave between them. */
3011 mark_request_serialising(req
, bdrv_get_cluster_size(bs
));
3014 wait_serialising_requests(req
);
3016 if (flags
& BDRV_REQ_COPY_ON_READ
) {
3019 ret
= bdrv_is_allocated(bs
, sector_num
, nb_sectors
, &pnum
);
3024 if (!ret
|| pnum
!= nb_sectors
) {
3025 ret
= bdrv_co_do_copy_on_readv(bs
, sector_num
, nb_sectors
, qiov
);
3030 /* Forward the request to the BlockDriver */
3031 if (!(bs
->zero_beyond_eof
&& bs
->growable
)) {
3032 ret
= drv
->bdrv_co_readv(bs
, sector_num
, nb_sectors
, qiov
);
3034 /* Read zeros after EOF of growable BDSes */
3035 int64_t total_sectors
, max_nb_sectors
;
3037 total_sectors
= bdrv_nb_sectors(bs
);
3038 if (total_sectors
< 0) {
3039 ret
= total_sectors
;
3043 max_nb_sectors
= ROUND_UP(MAX(0, total_sectors
- sector_num
),
3044 align
>> BDRV_SECTOR_BITS
);
3045 if (max_nb_sectors
> 0) {
3046 QEMUIOVector local_qiov
;
3047 size_t local_sectors
;
3049 max_nb_sectors
= MIN(max_nb_sectors
, SIZE_MAX
/ BDRV_SECTOR_BITS
);
3050 local_sectors
= MIN(max_nb_sectors
, nb_sectors
);
3052 qemu_iovec_init(&local_qiov
, qiov
->niov
);
3053 qemu_iovec_concat(&local_qiov
, qiov
, 0,
3054 local_sectors
* BDRV_SECTOR_SIZE
);
3056 ret
= drv
->bdrv_co_readv(bs
, sector_num
, local_sectors
,
3059 qemu_iovec_destroy(&local_qiov
);
3064 /* Reading beyond end of file is supposed to produce zeroes */
3065 if (ret
== 0 && total_sectors
< sector_num
+ nb_sectors
) {
3066 uint64_t offset
= MAX(0, total_sectors
- sector_num
);
3067 uint64_t bytes
= (sector_num
+ nb_sectors
- offset
) *
3069 qemu_iovec_memset(qiov
, offset
* BDRV_SECTOR_SIZE
, 0, bytes
);
3078 * Handle a read request in coroutine context
3080 static int coroutine_fn
bdrv_co_do_preadv(BlockDriverState
*bs
,
3081 int64_t offset
, unsigned int bytes
, QEMUIOVector
*qiov
,
3082 BdrvRequestFlags flags
)
3084 BlockDriver
*drv
= bs
->drv
;
3085 BdrvTrackedRequest req
;
3087 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3088 uint64_t align
= MAX(BDRV_SECTOR_SIZE
, bs
->request_alignment
);
3089 uint8_t *head_buf
= NULL
;
3090 uint8_t *tail_buf
= NULL
;
3091 QEMUIOVector local_qiov
;
3092 bool use_local_qiov
= false;
3098 if (bdrv_check_byte_request(bs
, offset
, bytes
)) {
3102 if (bs
->copy_on_read
) {
3103 flags
|= BDRV_REQ_COPY_ON_READ
;
3106 /* throttling disk I/O */
3107 if (bs
->io_limits_enabled
) {
3108 bdrv_io_limits_intercept(bs
, bytes
, false);
3111 /* Align read if necessary by padding qiov */
3112 if (offset
& (align
- 1)) {
3113 head_buf
= qemu_blockalign(bs
, align
);
3114 qemu_iovec_init(&local_qiov
, qiov
->niov
+ 2);
3115 qemu_iovec_add(&local_qiov
, head_buf
, offset
& (align
- 1));
3116 qemu_iovec_concat(&local_qiov
, qiov
, 0, qiov
->size
);
3117 use_local_qiov
= true;
3119 bytes
+= offset
& (align
- 1);
3120 offset
= offset
& ~(align
- 1);
3123 if ((offset
+ bytes
) & (align
- 1)) {
3124 if (!use_local_qiov
) {
3125 qemu_iovec_init(&local_qiov
, qiov
->niov
+ 1);
3126 qemu_iovec_concat(&local_qiov
, qiov
, 0, qiov
->size
);
3127 use_local_qiov
= true;
3129 tail_buf
= qemu_blockalign(bs
, align
);
3130 qemu_iovec_add(&local_qiov
, tail_buf
,
3131 align
- ((offset
+ bytes
) & (align
- 1)));
3133 bytes
= ROUND_UP(bytes
, align
);
3136 tracked_request_begin(&req
, bs
, offset
, bytes
, false);
3137 ret
= bdrv_aligned_preadv(bs
, &req
, offset
, bytes
, align
,
3138 use_local_qiov
? &local_qiov
: qiov
,
3140 tracked_request_end(&req
);
3142 if (use_local_qiov
) {
3143 qemu_iovec_destroy(&local_qiov
);
3144 qemu_vfree(head_buf
);
3145 qemu_vfree(tail_buf
);
3151 static int coroutine_fn
bdrv_co_do_readv(BlockDriverState
*bs
,
3152 int64_t sector_num
, int nb_sectors
, QEMUIOVector
*qiov
,
3153 BdrvRequestFlags flags
)
3155 if (nb_sectors
< 0 || nb_sectors
> (UINT_MAX
>> BDRV_SECTOR_BITS
)) {
3159 return bdrv_co_do_preadv(bs
, sector_num
<< BDRV_SECTOR_BITS
,
3160 nb_sectors
<< BDRV_SECTOR_BITS
, qiov
, flags
);
3163 int coroutine_fn
bdrv_co_readv(BlockDriverState
*bs
, int64_t sector_num
,
3164 int nb_sectors
, QEMUIOVector
*qiov
)
3166 trace_bdrv_co_readv(bs
, sector_num
, nb_sectors
);
3168 return bdrv_co_do_readv(bs
, sector_num
, nb_sectors
, qiov
, 0);
3171 int coroutine_fn
bdrv_co_copy_on_readv(BlockDriverState
*bs
,
3172 int64_t sector_num
, int nb_sectors
, QEMUIOVector
*qiov
)
3174 trace_bdrv_co_copy_on_readv(bs
, sector_num
, nb_sectors
);
3176 return bdrv_co_do_readv(bs
, sector_num
, nb_sectors
, qiov
,
3177 BDRV_REQ_COPY_ON_READ
);
3180 /* if no limit is specified in the BlockLimits use a default
3181 * of 32768 512-byte sectors (16 MiB) per request.
3183 #define MAX_WRITE_ZEROES_DEFAULT 32768
3185 static int coroutine_fn
bdrv_co_do_write_zeroes(BlockDriverState
*bs
,
3186 int64_t sector_num
, int nb_sectors
, BdrvRequestFlags flags
)
3188 BlockDriver
*drv
= bs
->drv
;
3190 struct iovec iov
= {0};
3193 int max_write_zeroes
= bs
->bl
.max_write_zeroes
?
3194 bs
->bl
.max_write_zeroes
: MAX_WRITE_ZEROES_DEFAULT
;
3196 while (nb_sectors
> 0 && !ret
) {
3197 int num
= nb_sectors
;
3199 /* Align request. Block drivers can expect the "bulk" of the request
3202 if (bs
->bl
.write_zeroes_alignment
3203 && num
> bs
->bl
.write_zeroes_alignment
) {
3204 if (sector_num
% bs
->bl
.write_zeroes_alignment
!= 0) {
3205 /* Make a small request up to the first aligned sector. */
3206 num
= bs
->bl
.write_zeroes_alignment
;
3207 num
-= sector_num
% bs
->bl
.write_zeroes_alignment
;
3208 } else if ((sector_num
+ num
) % bs
->bl
.write_zeroes_alignment
!= 0) {
3209 /* Shorten the request to the last aligned sector. num cannot
3210 * underflow because num > bs->bl.write_zeroes_alignment.
3212 num
-= (sector_num
+ num
) % bs
->bl
.write_zeroes_alignment
;
3216 /* limit request size */
3217 if (num
> max_write_zeroes
) {
3218 num
= max_write_zeroes
;
3222 /* First try the efficient write zeroes operation */
3223 if (drv
->bdrv_co_write_zeroes
) {
3224 ret
= drv
->bdrv_co_write_zeroes(bs
, sector_num
, num
, flags
);
3227 if (ret
== -ENOTSUP
) {
3228 /* Fall back to bounce buffer if write zeroes is unsupported */
3229 iov
.iov_len
= num
* BDRV_SECTOR_SIZE
;
3230 if (iov
.iov_base
== NULL
) {
3231 iov
.iov_base
= qemu_try_blockalign(bs
, num
* BDRV_SECTOR_SIZE
);
3232 if (iov
.iov_base
== NULL
) {
3236 memset(iov
.iov_base
, 0, num
* BDRV_SECTOR_SIZE
);
3238 qemu_iovec_init_external(&qiov
, &iov
, 1);
3240 ret
= drv
->bdrv_co_writev(bs
, sector_num
, num
, &qiov
);
3242 /* Keep bounce buffer around if it is big enough for all
3243 * all future requests.
3245 if (num
< max_write_zeroes
) {
3246 qemu_vfree(iov
.iov_base
);
3247 iov
.iov_base
= NULL
;
3256 qemu_vfree(iov
.iov_base
);
3261 * Forwards an already correctly aligned write request to the BlockDriver.
3263 static int coroutine_fn
bdrv_aligned_pwritev(BlockDriverState
*bs
,
3264 BdrvTrackedRequest
*req
, int64_t offset
, unsigned int bytes
,
3265 QEMUIOVector
*qiov
, int flags
)
3267 BlockDriver
*drv
= bs
->drv
;
3271 int64_t sector_num
= offset
>> BDRV_SECTOR_BITS
;
3272 unsigned int nb_sectors
= bytes
>> BDRV_SECTOR_BITS
;
3274 assert((offset
& (BDRV_SECTOR_SIZE
- 1)) == 0);
3275 assert((bytes
& (BDRV_SECTOR_SIZE
- 1)) == 0);
3276 assert(!qiov
|| bytes
== qiov
->size
);
3278 waited
= wait_serialising_requests(req
);
3279 assert(!waited
|| !req
->serialising
);
3280 assert(req
->overlap_offset
<= offset
);
3281 assert(offset
+ bytes
<= req
->overlap_offset
+ req
->overlap_bytes
);
3283 ret
= notifier_with_return_list_notify(&bs
->before_write_notifiers
, req
);
3285 if (!ret
&& bs
->detect_zeroes
!= BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF
&&
3286 !(flags
& BDRV_REQ_ZERO_WRITE
) && drv
->bdrv_co_write_zeroes
&&
3287 qemu_iovec_is_zero(qiov
)) {
3288 flags
|= BDRV_REQ_ZERO_WRITE
;
3289 if (bs
->detect_zeroes
== BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP
) {
3290 flags
|= BDRV_REQ_MAY_UNMAP
;
3295 /* Do nothing, write notifier decided to fail this request */
3296 } else if (flags
& BDRV_REQ_ZERO_WRITE
) {
3297 BLKDBG_EVENT(bs
, BLKDBG_PWRITEV_ZERO
);
3298 ret
= bdrv_co_do_write_zeroes(bs
, sector_num
, nb_sectors
, flags
);
3300 BLKDBG_EVENT(bs
, BLKDBG_PWRITEV
);
3301 ret
= drv
->bdrv_co_writev(bs
, sector_num
, nb_sectors
, qiov
);
3303 BLKDBG_EVENT(bs
, BLKDBG_PWRITEV_DONE
);
3305 if (ret
== 0 && !bs
->enable_write_cache
) {
3306 ret
= bdrv_co_flush(bs
);
3309 bdrv_set_dirty(bs
, sector_num
, nb_sectors
);
3311 block_acct_highest_sector(&bs
->stats
, sector_num
, nb_sectors
);
3313 if (bs
->growable
&& ret
>= 0) {
3314 bs
->total_sectors
= MAX(bs
->total_sectors
, sector_num
+ nb_sectors
);
3321 * Handle a write request in coroutine context
3323 static int coroutine_fn
bdrv_co_do_pwritev(BlockDriverState
*bs
,
3324 int64_t offset
, unsigned int bytes
, QEMUIOVector
*qiov
,
3325 BdrvRequestFlags flags
)
3327 BdrvTrackedRequest req
;
3328 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3329 uint64_t align
= MAX(BDRV_SECTOR_SIZE
, bs
->request_alignment
);
3330 uint8_t *head_buf
= NULL
;
3331 uint8_t *tail_buf
= NULL
;
3332 QEMUIOVector local_qiov
;
3333 bool use_local_qiov
= false;
3339 if (bs
->read_only
) {
3342 if (bdrv_check_byte_request(bs
, offset
, bytes
)) {
3346 /* throttling disk I/O */
3347 if (bs
->io_limits_enabled
) {
3348 bdrv_io_limits_intercept(bs
, bytes
, true);
3352 * Align write if necessary by performing a read-modify-write cycle.
3353 * Pad qiov with the read parts and be sure to have a tracked request not
3354 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3356 tracked_request_begin(&req
, bs
, offset
, bytes
, true);
3358 if (offset
& (align
- 1)) {
3359 QEMUIOVector head_qiov
;
3360 struct iovec head_iov
;
3362 mark_request_serialising(&req
, align
);
3363 wait_serialising_requests(&req
);
3365 head_buf
= qemu_blockalign(bs
, align
);
3366 head_iov
= (struct iovec
) {
3367 .iov_base
= head_buf
,
3370 qemu_iovec_init_external(&head_qiov
, &head_iov
, 1);
3372 BLKDBG_EVENT(bs
, BLKDBG_PWRITEV_RMW_HEAD
);
3373 ret
= bdrv_aligned_preadv(bs
, &req
, offset
& ~(align
- 1), align
,
3374 align
, &head_qiov
, 0);
3378 BLKDBG_EVENT(bs
, BLKDBG_PWRITEV_RMW_AFTER_HEAD
);
3380 qemu_iovec_init(&local_qiov
, qiov
->niov
+ 2);
3381 qemu_iovec_add(&local_qiov
, head_buf
, offset
& (align
- 1));
3382 qemu_iovec_concat(&local_qiov
, qiov
, 0, qiov
->size
);
3383 use_local_qiov
= true;
3385 bytes
+= offset
& (align
- 1);
3386 offset
= offset
& ~(align
- 1);
3389 if ((offset
+ bytes
) & (align
- 1)) {
3390 QEMUIOVector tail_qiov
;
3391 struct iovec tail_iov
;
3395 mark_request_serialising(&req
, align
);
3396 waited
= wait_serialising_requests(&req
);
3397 assert(!waited
|| !use_local_qiov
);
3399 tail_buf
= qemu_blockalign(bs
, align
);
3400 tail_iov
= (struct iovec
) {
3401 .iov_base
= tail_buf
,
3404 qemu_iovec_init_external(&tail_qiov
, &tail_iov
, 1);
3406 BLKDBG_EVENT(bs
, BLKDBG_PWRITEV_RMW_TAIL
);
3407 ret
= bdrv_aligned_preadv(bs
, &req
, (offset
+ bytes
) & ~(align
- 1), align
,
3408 align
, &tail_qiov
, 0);
3412 BLKDBG_EVENT(bs
, BLKDBG_PWRITEV_RMW_AFTER_TAIL
);
3414 if (!use_local_qiov
) {
3415 qemu_iovec_init(&local_qiov
, qiov
->niov
+ 1);
3416 qemu_iovec_concat(&local_qiov
, qiov
, 0, qiov
->size
);
3417 use_local_qiov
= true;
3420 tail_bytes
= (offset
+ bytes
) & (align
- 1);
3421 qemu_iovec_add(&local_qiov
, tail_buf
+ tail_bytes
, align
- tail_bytes
);
3423 bytes
= ROUND_UP(bytes
, align
);
3426 ret
= bdrv_aligned_pwritev(bs
, &req
, offset
, bytes
,
3427 use_local_qiov
? &local_qiov
: qiov
,
3431 tracked_request_end(&req
);
3433 if (use_local_qiov
) {
3434 qemu_iovec_destroy(&local_qiov
);
3436 qemu_vfree(head_buf
);
3437 qemu_vfree(tail_buf
);
3442 static int coroutine_fn
bdrv_co_do_writev(BlockDriverState
*bs
,
3443 int64_t sector_num
, int nb_sectors
, QEMUIOVector
*qiov
,
3444 BdrvRequestFlags flags
)
3446 if (nb_sectors
< 0 || nb_sectors
> (INT_MAX
>> BDRV_SECTOR_BITS
)) {
3450 return bdrv_co_do_pwritev(bs
, sector_num
<< BDRV_SECTOR_BITS
,
3451 nb_sectors
<< BDRV_SECTOR_BITS
, qiov
, flags
);
3454 int coroutine_fn
bdrv_co_writev(BlockDriverState
*bs
, int64_t sector_num
,
3455 int nb_sectors
, QEMUIOVector
*qiov
)
3457 trace_bdrv_co_writev(bs
, sector_num
, nb_sectors
);
3459 return bdrv_co_do_writev(bs
, sector_num
, nb_sectors
, qiov
, 0);
3462 int coroutine_fn
bdrv_co_write_zeroes(BlockDriverState
*bs
,
3463 int64_t sector_num
, int nb_sectors
,
3464 BdrvRequestFlags flags
)
3466 trace_bdrv_co_write_zeroes(bs
, sector_num
, nb_sectors
, flags
);
3468 if (!(bs
->open_flags
& BDRV_O_UNMAP
)) {
3469 flags
&= ~BDRV_REQ_MAY_UNMAP
;
3472 return bdrv_co_do_writev(bs
, sector_num
, nb_sectors
, NULL
,
3473 BDRV_REQ_ZERO_WRITE
| flags
);
3477 * Truncate file to 'offset' bytes (needed only for file protocols)
3479 int bdrv_truncate(BlockDriverState
*bs
, int64_t offset
)
3481 BlockDriver
*drv
= bs
->drv
;
3485 if (!drv
->bdrv_truncate
)
3490 ret
= drv
->bdrv_truncate(bs
, offset
);
3492 ret
= refresh_total_sectors(bs
, offset
>> BDRV_SECTOR_BITS
);
3494 blk_dev_resize_cb(bs
->blk
);
3501 * Length of a allocated file in bytes. Sparse files are counted by actual
3502 * allocated space. Return < 0 if error or unknown.
3504 int64_t bdrv_get_allocated_file_size(BlockDriverState
*bs
)
3506 BlockDriver
*drv
= bs
->drv
;
3510 if (drv
->bdrv_get_allocated_file_size
) {
3511 return drv
->bdrv_get_allocated_file_size(bs
);
3514 return bdrv_get_allocated_file_size(bs
->file
);
3520 * Return number of sectors on success, -errno on error.
3522 int64_t bdrv_nb_sectors(BlockDriverState
*bs
)
3524 BlockDriver
*drv
= bs
->drv
;
3529 if (drv
->has_variable_length
) {
3530 int ret
= refresh_total_sectors(bs
, bs
->total_sectors
);
3535 return bs
->total_sectors
;
3539 * Return length in bytes on success, -errno on error.
3540 * The length is always a multiple of BDRV_SECTOR_SIZE.
3542 int64_t bdrv_getlength(BlockDriverState
*bs
)
3544 int64_t ret
= bdrv_nb_sectors(bs
);
3546 return ret
< 0 ? ret
: ret
* BDRV_SECTOR_SIZE
;
3549 /* return 0 as number of sectors if no device present or error */
3550 void bdrv_get_geometry(BlockDriverState
*bs
, uint64_t *nb_sectors_ptr
)
3552 int64_t nb_sectors
= bdrv_nb_sectors(bs
);
3554 *nb_sectors_ptr
= nb_sectors
< 0 ? 0 : nb_sectors
;
3557 void bdrv_set_on_error(BlockDriverState
*bs
, BlockdevOnError on_read_error
,
3558 BlockdevOnError on_write_error
)
3560 bs
->on_read_error
= on_read_error
;
3561 bs
->on_write_error
= on_write_error
;
3564 BlockdevOnError
bdrv_get_on_error(BlockDriverState
*bs
, bool is_read
)
3566 return is_read
? bs
->on_read_error
: bs
->on_write_error
;
3569 BlockErrorAction
bdrv_get_error_action(BlockDriverState
*bs
, bool is_read
, int error
)
3571 BlockdevOnError on_err
= is_read
? bs
->on_read_error
: bs
->on_write_error
;
3574 case BLOCKDEV_ON_ERROR_ENOSPC
:
3575 return (error
== ENOSPC
) ?
3576 BLOCK_ERROR_ACTION_STOP
: BLOCK_ERROR_ACTION_REPORT
;
3577 case BLOCKDEV_ON_ERROR_STOP
:
3578 return BLOCK_ERROR_ACTION_STOP
;
3579 case BLOCKDEV_ON_ERROR_REPORT
:
3580 return BLOCK_ERROR_ACTION_REPORT
;
3581 case BLOCKDEV_ON_ERROR_IGNORE
:
3582 return BLOCK_ERROR_ACTION_IGNORE
;
3588 static void send_qmp_error_event(BlockDriverState
*bs
,
3589 BlockErrorAction action
,
3590 bool is_read
, int error
)
3592 IoOperationType optype
;
3594 optype
= is_read
? IO_OPERATION_TYPE_READ
: IO_OPERATION_TYPE_WRITE
;
3595 qapi_event_send_block_io_error(bdrv_get_device_name(bs
), optype
, action
,
3596 bdrv_iostatus_is_enabled(bs
),
3597 error
== ENOSPC
, strerror(error
),
3601 /* This is done by device models because, while the block layer knows
3602 * about the error, it does not know whether an operation comes from
3603 * the device or the block layer (from a job, for example).
3605 void bdrv_error_action(BlockDriverState
*bs
, BlockErrorAction action
,
3606 bool is_read
, int error
)
3610 if (action
== BLOCK_ERROR_ACTION_STOP
) {
3611 /* First set the iostatus, so that "info block" returns an iostatus
3612 * that matches the events raised so far (an additional error iostatus
3613 * is fine, but not a lost one).
3615 bdrv_iostatus_set_err(bs
, error
);
3617 /* Then raise the request to stop the VM and the event.
3618 * qemu_system_vmstop_request_prepare has two effects. First,
3619 * it ensures that the STOP event always comes after the
3620 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3621 * can observe the STOP event and do a "cont" before the STOP
3622 * event is issued, the VM will not stop. In this case, vm_start()
3623 * also ensures that the STOP/RESUME pair of events is emitted.
3625 qemu_system_vmstop_request_prepare();
3626 send_qmp_error_event(bs
, action
, is_read
, error
);
3627 qemu_system_vmstop_request(RUN_STATE_IO_ERROR
);
3629 send_qmp_error_event(bs
, action
, is_read
, error
);
3633 int bdrv_is_read_only(BlockDriverState
*bs
)
3635 return bs
->read_only
;
3638 int bdrv_is_sg(BlockDriverState
*bs
)
3643 int bdrv_enable_write_cache(BlockDriverState
*bs
)
3645 return bs
->enable_write_cache
;
3648 void bdrv_set_enable_write_cache(BlockDriverState
*bs
, bool wce
)
3650 bs
->enable_write_cache
= wce
;
3652 /* so a reopen() will preserve wce */
3654 bs
->open_flags
|= BDRV_O_CACHE_WB
;
3656 bs
->open_flags
&= ~BDRV_O_CACHE_WB
;
3660 int bdrv_is_encrypted(BlockDriverState
*bs
)
3662 if (bs
->backing_hd
&& bs
->backing_hd
->encrypted
)
3664 return bs
->encrypted
;
3667 int bdrv_key_required(BlockDriverState
*bs
)
3669 BlockDriverState
*backing_hd
= bs
->backing_hd
;
3671 if (backing_hd
&& backing_hd
->encrypted
&& !backing_hd
->valid_key
)
3673 return (bs
->encrypted
&& !bs
->valid_key
);
3676 int bdrv_set_key(BlockDriverState
*bs
, const char *key
)
3679 if (bs
->backing_hd
&& bs
->backing_hd
->encrypted
) {
3680 ret
= bdrv_set_key(bs
->backing_hd
, key
);
3686 if (!bs
->encrypted
) {
3688 } else if (!bs
->drv
|| !bs
->drv
->bdrv_set_key
) {
3691 ret
= bs
->drv
->bdrv_set_key(bs
, key
);
3694 } else if (!bs
->valid_key
) {
3697 /* call the change callback now, we skipped it on open */
3698 blk_dev_change_media_cb(bs
->blk
, true);
3704 const char *bdrv_get_format_name(BlockDriverState
*bs
)
3706 return bs
->drv
? bs
->drv
->format_name
: NULL
;
3709 static int qsort_strcmp(const void *a
, const void *b
)
3711 return strcmp(a
, b
);
3714 void bdrv_iterate_format(void (*it
)(void *opaque
, const char *name
),
3720 const char **formats
= NULL
;
3722 QLIST_FOREACH(drv
, &bdrv_drivers
, list
) {
3723 if (drv
->format_name
) {
3726 while (formats
&& i
&& !found
) {
3727 found
= !strcmp(formats
[--i
], drv
->format_name
);
3731 formats
= g_renew(const char *, formats
, count
+ 1);
3732 formats
[count
++] = drv
->format_name
;
3737 qsort(formats
, count
, sizeof(formats
[0]), qsort_strcmp
);
3739 for (i
= 0; i
< count
; i
++) {
3740 it(opaque
, formats
[i
]);
3746 /* This function is to find block backend bs */
3747 /* TODO convert callers to blk_by_name(), then remove */
3748 BlockDriverState
*bdrv_find(const char *name
)
3750 BlockBackend
*blk
= blk_by_name(name
);
3752 return blk
? blk_bs(blk
) : NULL
;
3755 /* This function is to find a node in the bs graph */
3756 BlockDriverState
*bdrv_find_node(const char *node_name
)
3758 BlockDriverState
*bs
;
3762 QTAILQ_FOREACH(bs
, &graph_bdrv_states
, node_list
) {
3763 if (!strcmp(node_name
, bs
->node_name
)) {
3770 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3771 BlockDeviceInfoList
*bdrv_named_nodes_list(void)
3773 BlockDeviceInfoList
*list
, *entry
;
3774 BlockDriverState
*bs
;
3777 QTAILQ_FOREACH(bs
, &graph_bdrv_states
, node_list
) {
3778 entry
= g_malloc0(sizeof(*entry
));
3779 entry
->value
= bdrv_block_device_info(bs
);
3787 BlockDriverState
*bdrv_lookup_bs(const char *device
,
3788 const char *node_name
,
3792 BlockDriverState
*bs
;
3795 blk
= blk_by_name(device
);
3803 bs
= bdrv_find_node(node_name
);
3810 error_setg(errp
, "Cannot find device=%s nor node_name=%s",
3811 device
? device
: "",
3812 node_name
? node_name
: "");
3816 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3817 * return false. If either argument is NULL, return false. */
3818 bool bdrv_chain_contains(BlockDriverState
*top
, BlockDriverState
*base
)
3820 while (top
&& top
!= base
) {
3821 top
= top
->backing_hd
;
3827 BlockDriverState
*bdrv_next_node(BlockDriverState
*bs
)
3830 return QTAILQ_FIRST(&graph_bdrv_states
);
3832 return QTAILQ_NEXT(bs
, node_list
);
3835 BlockDriverState
*bdrv_next(BlockDriverState
*bs
)
3838 return QTAILQ_FIRST(&bdrv_states
);
3840 return QTAILQ_NEXT(bs
, device_list
);
3843 const char *bdrv_get_node_name(const BlockDriverState
*bs
)
3845 return bs
->node_name
;
3848 /* TODO check what callers really want: bs->node_name or blk_name() */
3849 const char *bdrv_get_device_name(const BlockDriverState
*bs
)
3851 return bs
->blk
? blk_name(bs
->blk
) : "";
3854 int bdrv_get_flags(BlockDriverState
*bs
)
3856 return bs
->open_flags
;
3859 int bdrv_flush_all(void)
3861 BlockDriverState
*bs
;
3864 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
3865 AioContext
*aio_context
= bdrv_get_aio_context(bs
);
3868 aio_context_acquire(aio_context
);
3869 ret
= bdrv_flush(bs
);
3870 if (ret
< 0 && !result
) {
3873 aio_context_release(aio_context
);
3879 int bdrv_has_zero_init_1(BlockDriverState
*bs
)
3884 int bdrv_has_zero_init(BlockDriverState
*bs
)
3888 /* If BS is a copy on write image, it is initialized to
3889 the contents of the base image, which may not be zeroes. */
3890 if (bs
->backing_hd
) {
3893 if (bs
->drv
->bdrv_has_zero_init
) {
3894 return bs
->drv
->bdrv_has_zero_init(bs
);
3901 bool bdrv_unallocated_blocks_are_zero(BlockDriverState
*bs
)
3903 BlockDriverInfo bdi
;
3905 if (bs
->backing_hd
) {
3909 if (bdrv_get_info(bs
, &bdi
) == 0) {
3910 return bdi
.unallocated_blocks_are_zero
;
3916 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState
*bs
)
3918 BlockDriverInfo bdi
;
3920 if (bs
->backing_hd
|| !(bs
->open_flags
& BDRV_O_UNMAP
)) {
3924 if (bdrv_get_info(bs
, &bdi
) == 0) {
3925 return bdi
.can_write_zeroes_with_unmap
;
3931 typedef struct BdrvCoGetBlockStatusData
{
3932 BlockDriverState
*bs
;
3933 BlockDriverState
*base
;
3939 } BdrvCoGetBlockStatusData
;
3942 * Returns the allocation status of the specified sectors.
3943 * Drivers not implementing the functionality are assumed to not support
3944 * backing files, hence all their sectors are reported as allocated.
3946 * If 'sector_num' is beyond the end of the disk image the return value is 0
3947 * and 'pnum' is set to 0.
3949 * 'pnum' is set to the number of sectors (including and immediately following
3950 * the specified sector) that are known to be in the same
3951 * allocated/unallocated state.
3953 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3954 * beyond the end of the disk image it will be clamped.
3956 static int64_t coroutine_fn
bdrv_co_get_block_status(BlockDriverState
*bs
,
3958 int nb_sectors
, int *pnum
)
3960 int64_t total_sectors
;
3964 total_sectors
= bdrv_nb_sectors(bs
);
3965 if (total_sectors
< 0) {
3966 return total_sectors
;
3969 if (sector_num
>= total_sectors
) {
3974 n
= total_sectors
- sector_num
;
3975 if (n
< nb_sectors
) {
3979 if (!bs
->drv
->bdrv_co_get_block_status
) {
3981 ret
= BDRV_BLOCK_DATA
| BDRV_BLOCK_ALLOCATED
;
3982 if (bs
->drv
->protocol_name
) {
3983 ret
|= BDRV_BLOCK_OFFSET_VALID
| (sector_num
* BDRV_SECTOR_SIZE
);
3988 ret
= bs
->drv
->bdrv_co_get_block_status(bs
, sector_num
, nb_sectors
, pnum
);
3994 if (ret
& BDRV_BLOCK_RAW
) {
3995 assert(ret
& BDRV_BLOCK_OFFSET_VALID
);
3996 return bdrv_get_block_status(bs
->file
, ret
>> BDRV_SECTOR_BITS
,
4000 if (ret
& (BDRV_BLOCK_DATA
| BDRV_BLOCK_ZERO
)) {
4001 ret
|= BDRV_BLOCK_ALLOCATED
;
4004 if (!(ret
& BDRV_BLOCK_DATA
) && !(ret
& BDRV_BLOCK_ZERO
)) {
4005 if (bdrv_unallocated_blocks_are_zero(bs
)) {
4006 ret
|= BDRV_BLOCK_ZERO
;
4007 } else if (bs
->backing_hd
) {
4008 BlockDriverState
*bs2
= bs
->backing_hd
;
4009 int64_t nb_sectors2
= bdrv_nb_sectors(bs2
);
4010 if (nb_sectors2
>= 0 && sector_num
>= nb_sectors2
) {
4011 ret
|= BDRV_BLOCK_ZERO
;
4017 (ret
& BDRV_BLOCK_DATA
) && !(ret
& BDRV_BLOCK_ZERO
) &&
4018 (ret
& BDRV_BLOCK_OFFSET_VALID
)) {
4021 ret2
= bdrv_co_get_block_status(bs
->file
, ret
>> BDRV_SECTOR_BITS
,
4024 /* Ignore errors. This is just providing extra information, it
4025 * is useful but not necessary.
4028 /* !file_pnum indicates an offset at or beyond the EOF; it is
4029 * perfectly valid for the format block driver to point to such
4030 * offsets, so catch it and mark everything as zero */
4031 ret
|= BDRV_BLOCK_ZERO
;
4033 /* Limit request to the range reported by the protocol driver */
4035 ret
|= (ret2
& BDRV_BLOCK_ZERO
);
4043 /* Coroutine wrapper for bdrv_get_block_status() */
4044 static void coroutine_fn
bdrv_get_block_status_co_entry(void *opaque
)
4046 BdrvCoGetBlockStatusData
*data
= opaque
;
4047 BlockDriverState
*bs
= data
->bs
;
4049 data
->ret
= bdrv_co_get_block_status(bs
, data
->sector_num
, data
->nb_sectors
,
4055 * Synchronous wrapper around bdrv_co_get_block_status().
4057 * See bdrv_co_get_block_status() for details.
4059 int64_t bdrv_get_block_status(BlockDriverState
*bs
, int64_t sector_num
,
4060 int nb_sectors
, int *pnum
)
4063 BdrvCoGetBlockStatusData data
= {
4065 .sector_num
= sector_num
,
4066 .nb_sectors
= nb_sectors
,
4071 if (qemu_in_coroutine()) {
4072 /* Fast-path if already in coroutine context */
4073 bdrv_get_block_status_co_entry(&data
);
4075 AioContext
*aio_context
= bdrv_get_aio_context(bs
);
4077 co
= qemu_coroutine_create(bdrv_get_block_status_co_entry
);
4078 qemu_coroutine_enter(co
, &data
);
4079 while (!data
.done
) {
4080 aio_poll(aio_context
, true);
4086 int coroutine_fn
bdrv_is_allocated(BlockDriverState
*bs
, int64_t sector_num
,
4087 int nb_sectors
, int *pnum
)
4089 int64_t ret
= bdrv_get_block_status(bs
, sector_num
, nb_sectors
, pnum
);
4093 return !!(ret
& BDRV_BLOCK_ALLOCATED
);
4097 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4099 * Return true if the given sector is allocated in any image between
4100 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4101 * sector is allocated in any image of the chain. Return false otherwise.
4103 * 'pnum' is set to the number of sectors (including and immediately following
4104 * the specified sector) that are known to be in the same
4105 * allocated/unallocated state.
4108 int bdrv_is_allocated_above(BlockDriverState
*top
,
4109 BlockDriverState
*base
,
4111 int nb_sectors
, int *pnum
)
4113 BlockDriverState
*intermediate
;
4114 int ret
, n
= nb_sectors
;
4117 while (intermediate
&& intermediate
!= base
) {
4119 ret
= bdrv_is_allocated(intermediate
, sector_num
, nb_sectors
,
4129 * [sector_num, nb_sectors] is unallocated on top but intermediate
4132 * [sector_num+x, nr_sectors] allocated.
4134 if (n
> pnum_inter
&&
4135 (intermediate
== top
||
4136 sector_num
+ pnum_inter
< intermediate
->total_sectors
)) {
4140 intermediate
= intermediate
->backing_hd
;
4147 const char *bdrv_get_encrypted_filename(BlockDriverState
*bs
)
4149 if (bs
->backing_hd
&& bs
->backing_hd
->encrypted
)
4150 return bs
->backing_file
;
4151 else if (bs
->encrypted
)
4152 return bs
->filename
;
4157 void bdrv_get_backing_filename(BlockDriverState
*bs
,
4158 char *filename
, int filename_size
)
4160 pstrcpy(filename
, filename_size
, bs
->backing_file
);
4163 int bdrv_write_compressed(BlockDriverState
*bs
, int64_t sector_num
,
4164 const uint8_t *buf
, int nb_sectors
)
4166 BlockDriver
*drv
= bs
->drv
;
4169 if (!drv
->bdrv_write_compressed
)
4171 if (bdrv_check_request(bs
, sector_num
, nb_sectors
))
4174 assert(QLIST_EMPTY(&bs
->dirty_bitmaps
));
4176 return drv
->bdrv_write_compressed(bs
, sector_num
, buf
, nb_sectors
);
4179 int bdrv_get_info(BlockDriverState
*bs
, BlockDriverInfo
*bdi
)
4181 BlockDriver
*drv
= bs
->drv
;
4184 if (!drv
->bdrv_get_info
)
4186 memset(bdi
, 0, sizeof(*bdi
));
4187 return drv
->bdrv_get_info(bs
, bdi
);
4190 ImageInfoSpecific
*bdrv_get_specific_info(BlockDriverState
*bs
)
4192 BlockDriver
*drv
= bs
->drv
;
4193 if (drv
&& drv
->bdrv_get_specific_info
) {
4194 return drv
->bdrv_get_specific_info(bs
);
4199 int bdrv_save_vmstate(BlockDriverState
*bs
, const uint8_t *buf
,
4200 int64_t pos
, int size
)
4203 struct iovec iov
= {
4204 .iov_base
= (void *) buf
,
4208 qemu_iovec_init_external(&qiov
, &iov
, 1);
4209 return bdrv_writev_vmstate(bs
, &qiov
, pos
);
4212 int bdrv_writev_vmstate(BlockDriverState
*bs
, QEMUIOVector
*qiov
, int64_t pos
)
4214 BlockDriver
*drv
= bs
->drv
;
4218 } else if (drv
->bdrv_save_vmstate
) {
4219 return drv
->bdrv_save_vmstate(bs
, qiov
, pos
);
4220 } else if (bs
->file
) {
4221 return bdrv_writev_vmstate(bs
->file
, qiov
, pos
);
4227 int bdrv_load_vmstate(BlockDriverState
*bs
, uint8_t *buf
,
4228 int64_t pos
, int size
)
4230 BlockDriver
*drv
= bs
->drv
;
4233 if (drv
->bdrv_load_vmstate
)
4234 return drv
->bdrv_load_vmstate(bs
, buf
, pos
, size
);
4236 return bdrv_load_vmstate(bs
->file
, buf
, pos
, size
);
4240 void bdrv_debug_event(BlockDriverState
*bs
, BlkDebugEvent event
)
4242 if (!bs
|| !bs
->drv
|| !bs
->drv
->bdrv_debug_event
) {
4246 bs
->drv
->bdrv_debug_event(bs
, event
);
4249 int bdrv_debug_breakpoint(BlockDriverState
*bs
, const char *event
,
4252 while (bs
&& bs
->drv
&& !bs
->drv
->bdrv_debug_breakpoint
) {
4256 if (bs
&& bs
->drv
&& bs
->drv
->bdrv_debug_breakpoint
) {
4257 return bs
->drv
->bdrv_debug_breakpoint(bs
, event
, tag
);
4263 int bdrv_debug_remove_breakpoint(BlockDriverState
*bs
, const char *tag
)
4265 while (bs
&& bs
->drv
&& !bs
->drv
->bdrv_debug_remove_breakpoint
) {
4269 if (bs
&& bs
->drv
&& bs
->drv
->bdrv_debug_remove_breakpoint
) {
4270 return bs
->drv
->bdrv_debug_remove_breakpoint(bs
, tag
);
4276 int bdrv_debug_resume(BlockDriverState
*bs
, const char *tag
)
4278 while (bs
&& (!bs
->drv
|| !bs
->drv
->bdrv_debug_resume
)) {
4282 if (bs
&& bs
->drv
&& bs
->drv
->bdrv_debug_resume
) {
4283 return bs
->drv
->bdrv_debug_resume(bs
, tag
);
4289 bool bdrv_debug_is_suspended(BlockDriverState
*bs
, const char *tag
)
4291 while (bs
&& bs
->drv
&& !bs
->drv
->bdrv_debug_is_suspended
) {
4295 if (bs
&& bs
->drv
&& bs
->drv
->bdrv_debug_is_suspended
) {
4296 return bs
->drv
->bdrv_debug_is_suspended(bs
, tag
);
4302 int bdrv_is_snapshot(BlockDriverState
*bs
)
4304 return !!(bs
->open_flags
& BDRV_O_SNAPSHOT
);
4307 /* backing_file can either be relative, or absolute, or a protocol. If it is
4308 * relative, it must be relative to the chain. So, passing in bs->filename
4309 * from a BDS as backing_file should not be done, as that may be relative to
4310 * the CWD rather than the chain. */
4311 BlockDriverState
*bdrv_find_backing_image(BlockDriverState
*bs
,
4312 const char *backing_file
)
4314 char *filename_full
= NULL
;
4315 char *backing_file_full
= NULL
;
4316 char *filename_tmp
= NULL
;
4317 int is_protocol
= 0;
4318 BlockDriverState
*curr_bs
= NULL
;
4319 BlockDriverState
*retval
= NULL
;
4321 if (!bs
|| !bs
->drv
|| !backing_file
) {
4325 filename_full
= g_malloc(PATH_MAX
);
4326 backing_file_full
= g_malloc(PATH_MAX
);
4327 filename_tmp
= g_malloc(PATH_MAX
);
4329 is_protocol
= path_has_protocol(backing_file
);
4331 for (curr_bs
= bs
; curr_bs
->backing_hd
; curr_bs
= curr_bs
->backing_hd
) {
4333 /* If either of the filename paths is actually a protocol, then
4334 * compare unmodified paths; otherwise make paths relative */
4335 if (is_protocol
|| path_has_protocol(curr_bs
->backing_file
)) {
4336 if (strcmp(backing_file
, curr_bs
->backing_file
) == 0) {
4337 retval
= curr_bs
->backing_hd
;
4341 /* If not an absolute filename path, make it relative to the current
4342 * image's filename path */
4343 path_combine(filename_tmp
, PATH_MAX
, curr_bs
->filename
,
4346 /* We are going to compare absolute pathnames */
4347 if (!realpath(filename_tmp
, filename_full
)) {
4351 /* We need to make sure the backing filename we are comparing against
4352 * is relative to the current image filename (or absolute) */
4353 path_combine(filename_tmp
, PATH_MAX
, curr_bs
->filename
,
4354 curr_bs
->backing_file
);
4356 if (!realpath(filename_tmp
, backing_file_full
)) {
4360 if (strcmp(backing_file_full
, filename_full
) == 0) {
4361 retval
= curr_bs
->backing_hd
;
4367 g_free(filename_full
);
4368 g_free(backing_file_full
);
4369 g_free(filename_tmp
);
4373 int bdrv_get_backing_file_depth(BlockDriverState
*bs
)
4379 if (!bs
->backing_hd
) {
4383 return 1 + bdrv_get_backing_file_depth(bs
->backing_hd
);
4386 /**************************************************************/
4389 BlockAIOCB
*bdrv_aio_readv(BlockDriverState
*bs
, int64_t sector_num
,
4390 QEMUIOVector
*qiov
, int nb_sectors
,
4391 BlockCompletionFunc
*cb
, void *opaque
)
4393 trace_bdrv_aio_readv(bs
, sector_num
, nb_sectors
, opaque
);
4395 return bdrv_co_aio_rw_vector(bs
, sector_num
, qiov
, nb_sectors
, 0,
4399 BlockAIOCB
*bdrv_aio_writev(BlockDriverState
*bs
, int64_t sector_num
,
4400 QEMUIOVector
*qiov
, int nb_sectors
,
4401 BlockCompletionFunc
*cb
, void *opaque
)
4403 trace_bdrv_aio_writev(bs
, sector_num
, nb_sectors
, opaque
);
4405 return bdrv_co_aio_rw_vector(bs
, sector_num
, qiov
, nb_sectors
, 0,
4409 BlockAIOCB
*bdrv_aio_write_zeroes(BlockDriverState
*bs
,
4410 int64_t sector_num
, int nb_sectors
, BdrvRequestFlags flags
,
4411 BlockCompletionFunc
*cb
, void *opaque
)
4413 trace_bdrv_aio_write_zeroes(bs
, sector_num
, nb_sectors
, flags
, opaque
);
4415 return bdrv_co_aio_rw_vector(bs
, sector_num
, NULL
, nb_sectors
,
4416 BDRV_REQ_ZERO_WRITE
| flags
,
4421 typedef struct MultiwriteCB
{
4426 BlockCompletionFunc
*cb
;
4428 QEMUIOVector
*free_qiov
;
4432 static void multiwrite_user_cb(MultiwriteCB
*mcb
)
4436 for (i
= 0; i
< mcb
->num_callbacks
; i
++) {
4437 mcb
->callbacks
[i
].cb(mcb
->callbacks
[i
].opaque
, mcb
->error
);
4438 if (mcb
->callbacks
[i
].free_qiov
) {
4439 qemu_iovec_destroy(mcb
->callbacks
[i
].free_qiov
);
4441 g_free(mcb
->callbacks
[i
].free_qiov
);
4445 static void multiwrite_cb(void *opaque
, int ret
)
4447 MultiwriteCB
*mcb
= opaque
;
4449 trace_multiwrite_cb(mcb
, ret
);
4451 if (ret
< 0 && !mcb
->error
) {
4455 mcb
->num_requests
--;
4456 if (mcb
->num_requests
== 0) {
4457 multiwrite_user_cb(mcb
);
4462 static int multiwrite_req_compare(const void *a
, const void *b
)
4464 const BlockRequest
*req1
= a
, *req2
= b
;
4467 * Note that we can't simply subtract req2->sector from req1->sector
4468 * here as that could overflow the return value.
4470 if (req1
->sector
> req2
->sector
) {
4472 } else if (req1
->sector
< req2
->sector
) {
4480 * Takes a bunch of requests and tries to merge them. Returns the number of
4481 * requests that remain after merging.
4483 static int multiwrite_merge(BlockDriverState
*bs
, BlockRequest
*reqs
,
4484 int num_reqs
, MultiwriteCB
*mcb
)
4488 // Sort requests by start sector
4489 qsort(reqs
, num_reqs
, sizeof(*reqs
), &multiwrite_req_compare
);
4491 // Check if adjacent requests touch the same clusters. If so, combine them,
4492 // filling up gaps with zero sectors.
4494 for (i
= 1; i
< num_reqs
; i
++) {
4496 int64_t oldreq_last
= reqs
[outidx
].sector
+ reqs
[outidx
].nb_sectors
;
4498 // Handle exactly sequential writes and overlapping writes.
4499 if (reqs
[i
].sector
<= oldreq_last
) {
4503 if (reqs
[outidx
].qiov
->niov
+ reqs
[i
].qiov
->niov
+ 1 > IOV_MAX
) {
4507 if (bs
->bl
.max_transfer_length
&& reqs
[outidx
].nb_sectors
+
4508 reqs
[i
].nb_sectors
> bs
->bl
.max_transfer_length
) {
4514 QEMUIOVector
*qiov
= g_malloc0(sizeof(*qiov
));
4515 qemu_iovec_init(qiov
,
4516 reqs
[outidx
].qiov
->niov
+ reqs
[i
].qiov
->niov
+ 1);
4518 // Add the first request to the merged one. If the requests are
4519 // overlapping, drop the last sectors of the first request.
4520 size
= (reqs
[i
].sector
- reqs
[outidx
].sector
) << 9;
4521 qemu_iovec_concat(qiov
, reqs
[outidx
].qiov
, 0, size
);
4523 // We shouldn't need to add any zeros between the two requests
4524 assert (reqs
[i
].sector
<= oldreq_last
);
4526 // Add the second request
4527 qemu_iovec_concat(qiov
, reqs
[i
].qiov
, 0, reqs
[i
].qiov
->size
);
4529 // Add tail of first request, if necessary
4530 if (qiov
->size
< reqs
[outidx
].qiov
->size
) {
4531 qemu_iovec_concat(qiov
, reqs
[outidx
].qiov
, qiov
->size
,
4532 reqs
[outidx
].qiov
->size
- qiov
->size
);
4535 reqs
[outidx
].nb_sectors
= qiov
->size
>> 9;
4536 reqs
[outidx
].qiov
= qiov
;
4538 mcb
->callbacks
[i
].free_qiov
= reqs
[outidx
].qiov
;
4541 reqs
[outidx
].sector
= reqs
[i
].sector
;
4542 reqs
[outidx
].nb_sectors
= reqs
[i
].nb_sectors
;
4543 reqs
[outidx
].qiov
= reqs
[i
].qiov
;
4551 * Submit multiple AIO write requests at once.
4553 * On success, the function returns 0 and all requests in the reqs array have
4554 * been submitted. In error case this function returns -1, and any of the
4555 * requests may or may not be submitted yet. In particular, this means that the
4556 * callback will be called for some of the requests, for others it won't. The
4557 * caller must check the error field of the BlockRequest to wait for the right
4558 * callbacks (if error != 0, no callback will be called).
4560 * The implementation may modify the contents of the reqs array, e.g. to merge
4561 * requests. However, the fields opaque and error are left unmodified as they
4562 * are used to signal failure for a single request to the caller.
4564 int bdrv_aio_multiwrite(BlockDriverState
*bs
, BlockRequest
*reqs
, int num_reqs
)
4569 /* don't submit writes if we don't have a medium */
4570 if (bs
->drv
== NULL
) {
4571 for (i
= 0; i
< num_reqs
; i
++) {
4572 reqs
[i
].error
= -ENOMEDIUM
;
4577 if (num_reqs
== 0) {
4581 // Create MultiwriteCB structure
4582 mcb
= g_malloc0(sizeof(*mcb
) + num_reqs
* sizeof(*mcb
->callbacks
));
4583 mcb
->num_requests
= 0;
4584 mcb
->num_callbacks
= num_reqs
;
4586 for (i
= 0; i
< num_reqs
; i
++) {
4587 mcb
->callbacks
[i
].cb
= reqs
[i
].cb
;
4588 mcb
->callbacks
[i
].opaque
= reqs
[i
].opaque
;
4591 // Check for mergable requests
4592 num_reqs
= multiwrite_merge(bs
, reqs
, num_reqs
, mcb
);
4594 trace_bdrv_aio_multiwrite(mcb
, mcb
->num_callbacks
, num_reqs
);
4596 /* Run the aio requests. */
4597 mcb
->num_requests
= num_reqs
;
4598 for (i
= 0; i
< num_reqs
; i
++) {
4599 bdrv_co_aio_rw_vector(bs
, reqs
[i
].sector
, reqs
[i
].qiov
,
4600 reqs
[i
].nb_sectors
, reqs
[i
].flags
,
4608 void bdrv_aio_cancel(BlockAIOCB
*acb
)
4611 bdrv_aio_cancel_async(acb
);
4612 while (acb
->refcnt
> 1) {
4613 if (acb
->aiocb_info
->get_aio_context
) {
4614 aio_poll(acb
->aiocb_info
->get_aio_context(acb
), true);
4615 } else if (acb
->bs
) {
4616 aio_poll(bdrv_get_aio_context(acb
->bs
), true);
4621 qemu_aio_unref(acb
);
4624 /* Async version of aio cancel. The caller is not blocked if the acb implements
4625 * cancel_async, otherwise we do nothing and let the request normally complete.
4626 * In either case the completion callback must be called. */
4627 void bdrv_aio_cancel_async(BlockAIOCB
*acb
)
4629 if (acb
->aiocb_info
->cancel_async
) {
4630 acb
->aiocb_info
->cancel_async(acb
);
4634 /**************************************************************/
4635 /* async block device emulation */
4637 typedef struct BlockAIOCBSync
{
4641 /* vector translation state */
4647 static const AIOCBInfo bdrv_em_aiocb_info
= {
4648 .aiocb_size
= sizeof(BlockAIOCBSync
),
4651 static void bdrv_aio_bh_cb(void *opaque
)
4653 BlockAIOCBSync
*acb
= opaque
;
4655 if (!acb
->is_write
&& acb
->ret
>= 0) {
4656 qemu_iovec_from_buf(acb
->qiov
, 0, acb
->bounce
, acb
->qiov
->size
);
4658 qemu_vfree(acb
->bounce
);
4659 acb
->common
.cb(acb
->common
.opaque
, acb
->ret
);
4660 qemu_bh_delete(acb
->bh
);
4662 qemu_aio_unref(acb
);
4665 static BlockAIOCB
*bdrv_aio_rw_vector(BlockDriverState
*bs
,
4669 BlockCompletionFunc
*cb
,
4674 BlockAIOCBSync
*acb
;
4676 acb
= qemu_aio_get(&bdrv_em_aiocb_info
, bs
, cb
, opaque
);
4677 acb
->is_write
= is_write
;
4679 acb
->bounce
= qemu_try_blockalign(bs
, qiov
->size
);
4680 acb
->bh
= aio_bh_new(bdrv_get_aio_context(bs
), bdrv_aio_bh_cb
, acb
);
4682 if (acb
->bounce
== NULL
) {
4684 } else if (is_write
) {
4685 qemu_iovec_to_buf(acb
->qiov
, 0, acb
->bounce
, qiov
->size
);
4686 acb
->ret
= bs
->drv
->bdrv_write(bs
, sector_num
, acb
->bounce
, nb_sectors
);
4688 acb
->ret
= bs
->drv
->bdrv_read(bs
, sector_num
, acb
->bounce
, nb_sectors
);
4691 qemu_bh_schedule(acb
->bh
);
4693 return &acb
->common
;
4696 static BlockAIOCB
*bdrv_aio_readv_em(BlockDriverState
*bs
,
4697 int64_t sector_num
, QEMUIOVector
*qiov
, int nb_sectors
,
4698 BlockCompletionFunc
*cb
, void *opaque
)
4700 return bdrv_aio_rw_vector(bs
, sector_num
, qiov
, nb_sectors
, cb
, opaque
, 0);
4703 static BlockAIOCB
*bdrv_aio_writev_em(BlockDriverState
*bs
,
4704 int64_t sector_num
, QEMUIOVector
*qiov
, int nb_sectors
,
4705 BlockCompletionFunc
*cb
, void *opaque
)
4707 return bdrv_aio_rw_vector(bs
, sector_num
, qiov
, nb_sectors
, cb
, opaque
, 1);
4711 typedef struct BlockAIOCBCoroutine
{
4717 } BlockAIOCBCoroutine
;
4719 static const AIOCBInfo bdrv_em_co_aiocb_info
= {
4720 .aiocb_size
= sizeof(BlockAIOCBCoroutine
),
4723 static void bdrv_co_em_bh(void *opaque
)
4725 BlockAIOCBCoroutine
*acb
= opaque
;
4727 acb
->common
.cb(acb
->common
.opaque
, acb
->req
.error
);
4729 qemu_bh_delete(acb
->bh
);
4730 qemu_aio_unref(acb
);
4733 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4734 static void coroutine_fn
bdrv_co_do_rw(void *opaque
)
4736 BlockAIOCBCoroutine
*acb
= opaque
;
4737 BlockDriverState
*bs
= acb
->common
.bs
;
4739 if (!acb
->is_write
) {
4740 acb
->req
.error
= bdrv_co_do_readv(bs
, acb
->req
.sector
,
4741 acb
->req
.nb_sectors
, acb
->req
.qiov
, acb
->req
.flags
);
4743 acb
->req
.error
= bdrv_co_do_writev(bs
, acb
->req
.sector
,
4744 acb
->req
.nb_sectors
, acb
->req
.qiov
, acb
->req
.flags
);
4747 acb
->bh
= aio_bh_new(bdrv_get_aio_context(bs
), bdrv_co_em_bh
, acb
);
4748 qemu_bh_schedule(acb
->bh
);
4751 static BlockAIOCB
*bdrv_co_aio_rw_vector(BlockDriverState
*bs
,
4755 BdrvRequestFlags flags
,
4756 BlockCompletionFunc
*cb
,
4761 BlockAIOCBCoroutine
*acb
;
4763 acb
= qemu_aio_get(&bdrv_em_co_aiocb_info
, bs
, cb
, opaque
);
4764 acb
->req
.sector
= sector_num
;
4765 acb
->req
.nb_sectors
= nb_sectors
;
4766 acb
->req
.qiov
= qiov
;
4767 acb
->req
.flags
= flags
;
4768 acb
->is_write
= is_write
;
4770 co
= qemu_coroutine_create(bdrv_co_do_rw
);
4771 qemu_coroutine_enter(co
, acb
);
4773 return &acb
->common
;
4776 static void coroutine_fn
bdrv_aio_flush_co_entry(void *opaque
)
4778 BlockAIOCBCoroutine
*acb
= opaque
;
4779 BlockDriverState
*bs
= acb
->common
.bs
;
4781 acb
->req
.error
= bdrv_co_flush(bs
);
4782 acb
->bh
= aio_bh_new(bdrv_get_aio_context(bs
), bdrv_co_em_bh
, acb
);
4783 qemu_bh_schedule(acb
->bh
);
4786 BlockAIOCB
*bdrv_aio_flush(BlockDriverState
*bs
,
4787 BlockCompletionFunc
*cb
, void *opaque
)
4789 trace_bdrv_aio_flush(bs
, opaque
);
4792 BlockAIOCBCoroutine
*acb
;
4794 acb
= qemu_aio_get(&bdrv_em_co_aiocb_info
, bs
, cb
, opaque
);
4796 co
= qemu_coroutine_create(bdrv_aio_flush_co_entry
);
4797 qemu_coroutine_enter(co
, acb
);
4799 return &acb
->common
;
4802 static void coroutine_fn
bdrv_aio_discard_co_entry(void *opaque
)
4804 BlockAIOCBCoroutine
*acb
= opaque
;
4805 BlockDriverState
*bs
= acb
->common
.bs
;
4807 acb
->req
.error
= bdrv_co_discard(bs
, acb
->req
.sector
, acb
->req
.nb_sectors
);
4808 acb
->bh
= aio_bh_new(bdrv_get_aio_context(bs
), bdrv_co_em_bh
, acb
);
4809 qemu_bh_schedule(acb
->bh
);
4812 BlockAIOCB
*bdrv_aio_discard(BlockDriverState
*bs
,
4813 int64_t sector_num
, int nb_sectors
,
4814 BlockCompletionFunc
*cb
, void *opaque
)
4817 BlockAIOCBCoroutine
*acb
;
4819 trace_bdrv_aio_discard(bs
, sector_num
, nb_sectors
, opaque
);
4821 acb
= qemu_aio_get(&bdrv_em_co_aiocb_info
, bs
, cb
, opaque
);
4822 acb
->req
.sector
= sector_num
;
4823 acb
->req
.nb_sectors
= nb_sectors
;
4824 co
= qemu_coroutine_create(bdrv_aio_discard_co_entry
);
4825 qemu_coroutine_enter(co
, acb
);
4827 return &acb
->common
;
4830 void bdrv_init(void)
4832 module_call_init(MODULE_INIT_BLOCK
);
4835 void bdrv_init_with_whitelist(void)
4837 use_bdrv_whitelist
= 1;
4841 void *qemu_aio_get(const AIOCBInfo
*aiocb_info
, BlockDriverState
*bs
,
4842 BlockCompletionFunc
*cb
, void *opaque
)
4846 acb
= g_slice_alloc(aiocb_info
->aiocb_size
);
4847 acb
->aiocb_info
= aiocb_info
;
4850 acb
->opaque
= opaque
;
4855 void qemu_aio_ref(void *p
)
4857 BlockAIOCB
*acb
= p
;
4861 void qemu_aio_unref(void *p
)
4863 BlockAIOCB
*acb
= p
;
4864 assert(acb
->refcnt
> 0);
4865 if (--acb
->refcnt
== 0) {
4866 g_slice_free1(acb
->aiocb_info
->aiocb_size
, acb
);
4870 /**************************************************************/
4871 /* Coroutine block device emulation */
4873 typedef struct CoroutineIOCompletion
{
4874 Coroutine
*coroutine
;
4876 } CoroutineIOCompletion
;
4878 static void bdrv_co_io_em_complete(void *opaque
, int ret
)
4880 CoroutineIOCompletion
*co
= opaque
;
4883 qemu_coroutine_enter(co
->coroutine
, NULL
);
4886 static int coroutine_fn
bdrv_co_io_em(BlockDriverState
*bs
, int64_t sector_num
,
4887 int nb_sectors
, QEMUIOVector
*iov
,
4890 CoroutineIOCompletion co
= {
4891 .coroutine
= qemu_coroutine_self(),
4896 acb
= bs
->drv
->bdrv_aio_writev(bs
, sector_num
, iov
, nb_sectors
,
4897 bdrv_co_io_em_complete
, &co
);
4899 acb
= bs
->drv
->bdrv_aio_readv(bs
, sector_num
, iov
, nb_sectors
,
4900 bdrv_co_io_em_complete
, &co
);
4903 trace_bdrv_co_io_em(bs
, sector_num
, nb_sectors
, is_write
, acb
);
4907 qemu_coroutine_yield();
4912 static int coroutine_fn
bdrv_co_readv_em(BlockDriverState
*bs
,
4913 int64_t sector_num
, int nb_sectors
,
4916 return bdrv_co_io_em(bs
, sector_num
, nb_sectors
, iov
, false);
4919 static int coroutine_fn
bdrv_co_writev_em(BlockDriverState
*bs
,
4920 int64_t sector_num
, int nb_sectors
,
4923 return bdrv_co_io_em(bs
, sector_num
, nb_sectors
, iov
, true);
4926 static void coroutine_fn
bdrv_flush_co_entry(void *opaque
)
4928 RwCo
*rwco
= opaque
;
4930 rwco
->ret
= bdrv_co_flush(rwco
->bs
);
4933 int coroutine_fn
bdrv_co_flush(BlockDriverState
*bs
)
4937 if (!bs
|| !bdrv_is_inserted(bs
) || bdrv_is_read_only(bs
)) {
4941 /* Write back cached data to the OS even with cache=unsafe */
4942 BLKDBG_EVENT(bs
->file
, BLKDBG_FLUSH_TO_OS
);
4943 if (bs
->drv
->bdrv_co_flush_to_os
) {
4944 ret
= bs
->drv
->bdrv_co_flush_to_os(bs
);
4950 /* But don't actually force it to the disk with cache=unsafe */
4951 if (bs
->open_flags
& BDRV_O_NO_FLUSH
) {
4955 BLKDBG_EVENT(bs
->file
, BLKDBG_FLUSH_TO_DISK
);
4956 if (bs
->drv
->bdrv_co_flush_to_disk
) {
4957 ret
= bs
->drv
->bdrv_co_flush_to_disk(bs
);
4958 } else if (bs
->drv
->bdrv_aio_flush
) {
4960 CoroutineIOCompletion co
= {
4961 .coroutine
= qemu_coroutine_self(),
4964 acb
= bs
->drv
->bdrv_aio_flush(bs
, bdrv_co_io_em_complete
, &co
);
4968 qemu_coroutine_yield();
4973 * Some block drivers always operate in either writethrough or unsafe
4974 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
4975 * know how the server works (because the behaviour is hardcoded or
4976 * depends on server-side configuration), so we can't ensure that
4977 * everything is safe on disk. Returning an error doesn't work because
4978 * that would break guests even if the server operates in writethrough
4981 * Let's hope the user knows what he's doing.
4989 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4990 * in the case of cache=unsafe, so there are no useless flushes.
4993 return bdrv_co_flush(bs
->file
);
4996 void bdrv_invalidate_cache(BlockDriverState
*bs
, Error
**errp
)
4998 Error
*local_err
= NULL
;
5005 if (!(bs
->open_flags
& BDRV_O_INCOMING
)) {
5008 bs
->open_flags
&= ~BDRV_O_INCOMING
;
5010 if (bs
->drv
->bdrv_invalidate_cache
) {
5011 bs
->drv
->bdrv_invalidate_cache(bs
, &local_err
);
5012 } else if (bs
->file
) {
5013 bdrv_invalidate_cache(bs
->file
, &local_err
);
5016 error_propagate(errp
, local_err
);
5020 ret
= refresh_total_sectors(bs
, bs
->total_sectors
);
5022 error_setg_errno(errp
, -ret
, "Could not refresh total sector count");
5027 void bdrv_invalidate_cache_all(Error
**errp
)
5029 BlockDriverState
*bs
;
5030 Error
*local_err
= NULL
;
5032 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
5033 AioContext
*aio_context
= bdrv_get_aio_context(bs
);
5035 aio_context_acquire(aio_context
);
5036 bdrv_invalidate_cache(bs
, &local_err
);
5037 aio_context_release(aio_context
);
5039 error_propagate(errp
, local_err
);
5045 int bdrv_flush(BlockDriverState
*bs
)
5053 if (qemu_in_coroutine()) {
5054 /* Fast-path if already in coroutine context */
5055 bdrv_flush_co_entry(&rwco
);
5057 AioContext
*aio_context
= bdrv_get_aio_context(bs
);
5059 co
= qemu_coroutine_create(bdrv_flush_co_entry
);
5060 qemu_coroutine_enter(co
, &rwco
);
5061 while (rwco
.ret
== NOT_DONE
) {
5062 aio_poll(aio_context
, true);
5069 typedef struct DiscardCo
{
5070 BlockDriverState
*bs
;
5075 static void coroutine_fn
bdrv_discard_co_entry(void *opaque
)
5077 DiscardCo
*rwco
= opaque
;
5079 rwco
->ret
= bdrv_co_discard(rwco
->bs
, rwco
->sector_num
, rwco
->nb_sectors
);
5082 /* if no limit is specified in the BlockLimits use a default
5083 * of 32768 512-byte sectors (16 MiB) per request.
5085 #define MAX_DISCARD_DEFAULT 32768
5087 int coroutine_fn
bdrv_co_discard(BlockDriverState
*bs
, int64_t sector_num
,
5094 } else if (bdrv_check_request(bs
, sector_num
, nb_sectors
)) {
5096 } else if (bs
->read_only
) {
5100 bdrv_reset_dirty(bs
, sector_num
, nb_sectors
);
5102 /* Do nothing if disabled. */
5103 if (!(bs
->open_flags
& BDRV_O_UNMAP
)) {
5107 if (!bs
->drv
->bdrv_co_discard
&& !bs
->drv
->bdrv_aio_discard
) {
5111 max_discard
= bs
->bl
.max_discard
? bs
->bl
.max_discard
: MAX_DISCARD_DEFAULT
;
5112 while (nb_sectors
> 0) {
5114 int num
= nb_sectors
;
5117 if (bs
->bl
.discard_alignment
&&
5118 num
>= bs
->bl
.discard_alignment
&&
5119 sector_num
% bs
->bl
.discard_alignment
) {
5120 if (num
> bs
->bl
.discard_alignment
) {
5121 num
= bs
->bl
.discard_alignment
;
5123 num
-= sector_num
% bs
->bl
.discard_alignment
;
5126 /* limit request size */
5127 if (num
> max_discard
) {
5131 if (bs
->drv
->bdrv_co_discard
) {
5132 ret
= bs
->drv
->bdrv_co_discard(bs
, sector_num
, num
);
5135 CoroutineIOCompletion co
= {
5136 .coroutine
= qemu_coroutine_self(),
5139 acb
= bs
->drv
->bdrv_aio_discard(bs
, sector_num
, nb_sectors
,
5140 bdrv_co_io_em_complete
, &co
);
5144 qemu_coroutine_yield();
5148 if (ret
&& ret
!= -ENOTSUP
) {
5158 int bdrv_discard(BlockDriverState
*bs
, int64_t sector_num
, int nb_sectors
)
5163 .sector_num
= sector_num
,
5164 .nb_sectors
= nb_sectors
,
5168 if (qemu_in_coroutine()) {
5169 /* Fast-path if already in coroutine context */
5170 bdrv_discard_co_entry(&rwco
);
5172 AioContext
*aio_context
= bdrv_get_aio_context(bs
);
5174 co
= qemu_coroutine_create(bdrv_discard_co_entry
);
5175 qemu_coroutine_enter(co
, &rwco
);
5176 while (rwco
.ret
== NOT_DONE
) {
5177 aio_poll(aio_context
, true);
5184 /**************************************************************/
5185 /* removable device support */
5188 * Return TRUE if the media is present
5190 int bdrv_is_inserted(BlockDriverState
*bs
)
5192 BlockDriver
*drv
= bs
->drv
;
5196 if (!drv
->bdrv_is_inserted
)
5198 return drv
->bdrv_is_inserted(bs
);
5202 * Return whether the media changed since the last call to this
5203 * function, or -ENOTSUP if we don't know. Most drivers don't know.
5205 int bdrv_media_changed(BlockDriverState
*bs
)
5207 BlockDriver
*drv
= bs
->drv
;
5209 if (drv
&& drv
->bdrv_media_changed
) {
5210 return drv
->bdrv_media_changed(bs
);
5216 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5218 void bdrv_eject(BlockDriverState
*bs
, bool eject_flag
)
5220 BlockDriver
*drv
= bs
->drv
;
5221 const char *device_name
;
5223 if (drv
&& drv
->bdrv_eject
) {
5224 drv
->bdrv_eject(bs
, eject_flag
);
5227 device_name
= bdrv_get_device_name(bs
);
5228 if (device_name
[0] != '\0') {
5229 qapi_event_send_device_tray_moved(device_name
,
5230 eject_flag
, &error_abort
);
5235 * Lock or unlock the media (if it is locked, the user won't be able
5236 * to eject it manually).
5238 void bdrv_lock_medium(BlockDriverState
*bs
, bool locked
)
5240 BlockDriver
*drv
= bs
->drv
;
5242 trace_bdrv_lock_medium(bs
, locked
);
5244 if (drv
&& drv
->bdrv_lock_medium
) {
5245 drv
->bdrv_lock_medium(bs
, locked
);
5249 /* needed for generic scsi interface */
5251 int bdrv_ioctl(BlockDriverState
*bs
, unsigned long int req
, void *buf
)
5253 BlockDriver
*drv
= bs
->drv
;
5255 if (drv
&& drv
->bdrv_ioctl
)
5256 return drv
->bdrv_ioctl(bs
, req
, buf
);
5260 BlockAIOCB
*bdrv_aio_ioctl(BlockDriverState
*bs
,
5261 unsigned long int req
, void *buf
,
5262 BlockCompletionFunc
*cb
, void *opaque
)
5264 BlockDriver
*drv
= bs
->drv
;
5266 if (drv
&& drv
->bdrv_aio_ioctl
)
5267 return drv
->bdrv_aio_ioctl(bs
, req
, buf
, cb
, opaque
);
5271 void bdrv_set_guest_block_size(BlockDriverState
*bs
, int align
)
5273 bs
->guest_block_size
= align
;
5276 void *qemu_blockalign(BlockDriverState
*bs
, size_t size
)
5278 return qemu_memalign(bdrv_opt_mem_align(bs
), size
);
5281 void *qemu_blockalign0(BlockDriverState
*bs
, size_t size
)
5283 return memset(qemu_blockalign(bs
, size
), 0, size
);
5286 void *qemu_try_blockalign(BlockDriverState
*bs
, size_t size
)
5288 size_t align
= bdrv_opt_mem_align(bs
);
5290 /* Ensure that NULL is never returned on success */
5296 return qemu_try_memalign(align
, size
);
5299 void *qemu_try_blockalign0(BlockDriverState
*bs
, size_t size
)
5301 void *mem
= qemu_try_blockalign(bs
, size
);
5304 memset(mem
, 0, size
);
5311 * Check if all memory in this vector is sector aligned.
5313 bool bdrv_qiov_is_aligned(BlockDriverState
*bs
, QEMUIOVector
*qiov
)
5316 size_t alignment
= bdrv_opt_mem_align(bs
);
5318 for (i
= 0; i
< qiov
->niov
; i
++) {
5319 if ((uintptr_t) qiov
->iov
[i
].iov_base
% alignment
) {
5322 if (qiov
->iov
[i
].iov_len
% alignment
) {
5330 BdrvDirtyBitmap
*bdrv_create_dirty_bitmap(BlockDriverState
*bs
, int granularity
,
5333 int64_t bitmap_size
;
5334 BdrvDirtyBitmap
*bitmap
;
5336 assert((granularity
& (granularity
- 1)) == 0);
5338 granularity
>>= BDRV_SECTOR_BITS
;
5339 assert(granularity
);
5340 bitmap_size
= bdrv_nb_sectors(bs
);
5341 if (bitmap_size
< 0) {
5342 error_setg_errno(errp
, -bitmap_size
, "could not get length of device");
5343 errno
= -bitmap_size
;
5346 bitmap
= g_new0(BdrvDirtyBitmap
, 1);
5347 bitmap
->bitmap
= hbitmap_alloc(bitmap_size
, ffs(granularity
) - 1);
5348 QLIST_INSERT_HEAD(&bs
->dirty_bitmaps
, bitmap
, list
);
5352 void bdrv_release_dirty_bitmap(BlockDriverState
*bs
, BdrvDirtyBitmap
*bitmap
)
5354 BdrvDirtyBitmap
*bm
, *next
;
5355 QLIST_FOREACH_SAFE(bm
, &bs
->dirty_bitmaps
, list
, next
) {
5357 QLIST_REMOVE(bitmap
, list
);
5358 hbitmap_free(bitmap
->bitmap
);
5365 BlockDirtyInfoList
*bdrv_query_dirty_bitmaps(BlockDriverState
*bs
)
5367 BdrvDirtyBitmap
*bm
;
5368 BlockDirtyInfoList
*list
= NULL
;
5369 BlockDirtyInfoList
**plist
= &list
;
5371 QLIST_FOREACH(bm
, &bs
->dirty_bitmaps
, list
) {
5372 BlockDirtyInfo
*info
= g_new0(BlockDirtyInfo
, 1);
5373 BlockDirtyInfoList
*entry
= g_new0(BlockDirtyInfoList
, 1);
5374 info
->count
= bdrv_get_dirty_count(bs
, bm
);
5376 ((int64_t) BDRV_SECTOR_SIZE
<< hbitmap_granularity(bm
->bitmap
));
5377 entry
->value
= info
;
5379 plist
= &entry
->next
;
5385 int bdrv_get_dirty(BlockDriverState
*bs
, BdrvDirtyBitmap
*bitmap
, int64_t sector
)
5388 return hbitmap_get(bitmap
->bitmap
, sector
);
5394 void bdrv_dirty_iter_init(BlockDriverState
*bs
,
5395 BdrvDirtyBitmap
*bitmap
, HBitmapIter
*hbi
)
5397 hbitmap_iter_init(hbi
, bitmap
->bitmap
, 0);
5400 void bdrv_set_dirty(BlockDriverState
*bs
, int64_t cur_sector
,
5403 BdrvDirtyBitmap
*bitmap
;
5404 QLIST_FOREACH(bitmap
, &bs
->dirty_bitmaps
, list
) {
5405 hbitmap_set(bitmap
->bitmap
, cur_sector
, nr_sectors
);
5409 void bdrv_reset_dirty(BlockDriverState
*bs
, int64_t cur_sector
, int nr_sectors
)
5411 BdrvDirtyBitmap
*bitmap
;
5412 QLIST_FOREACH(bitmap
, &bs
->dirty_bitmaps
, list
) {
5413 hbitmap_reset(bitmap
->bitmap
, cur_sector
, nr_sectors
);
5417 int64_t bdrv_get_dirty_count(BlockDriverState
*bs
, BdrvDirtyBitmap
*bitmap
)
5419 return hbitmap_count(bitmap
->bitmap
);
5422 /* Get a reference to bs */
5423 void bdrv_ref(BlockDriverState
*bs
)
5428 /* Release a previously grabbed reference to bs.
5429 * If after releasing, reference count is zero, the BlockDriverState is
5431 void bdrv_unref(BlockDriverState
*bs
)
5436 assert(bs
->refcnt
> 0);
5437 if (--bs
->refcnt
== 0) {
5442 struct BdrvOpBlocker
{
5444 QLIST_ENTRY(BdrvOpBlocker
) list
;
5447 bool bdrv_op_is_blocked(BlockDriverState
*bs
, BlockOpType op
, Error
**errp
)
5449 BdrvOpBlocker
*blocker
;
5450 assert((int) op
>= 0 && op
< BLOCK_OP_TYPE_MAX
);
5451 if (!QLIST_EMPTY(&bs
->op_blockers
[op
])) {
5452 blocker
= QLIST_FIRST(&bs
->op_blockers
[op
]);
5454 error_setg(errp
, "Device '%s' is busy: %s",
5455 bdrv_get_device_name(bs
),
5456 error_get_pretty(blocker
->reason
));
5463 void bdrv_op_block(BlockDriverState
*bs
, BlockOpType op
, Error
*reason
)
5465 BdrvOpBlocker
*blocker
;
5466 assert((int) op
>= 0 && op
< BLOCK_OP_TYPE_MAX
);
5468 blocker
= g_new0(BdrvOpBlocker
, 1);
5469 blocker
->reason
= reason
;
5470 QLIST_INSERT_HEAD(&bs
->op_blockers
[op
], blocker
, list
);
5473 void bdrv_op_unblock(BlockDriverState
*bs
, BlockOpType op
, Error
*reason
)
5475 BdrvOpBlocker
*blocker
, *next
;
5476 assert((int) op
>= 0 && op
< BLOCK_OP_TYPE_MAX
);
5477 QLIST_FOREACH_SAFE(blocker
, &bs
->op_blockers
[op
], list
, next
) {
5478 if (blocker
->reason
== reason
) {
5479 QLIST_REMOVE(blocker
, list
);
5485 void bdrv_op_block_all(BlockDriverState
*bs
, Error
*reason
)
5488 for (i
= 0; i
< BLOCK_OP_TYPE_MAX
; i
++) {
5489 bdrv_op_block(bs
, i
, reason
);
5493 void bdrv_op_unblock_all(BlockDriverState
*bs
, Error
*reason
)
5496 for (i
= 0; i
< BLOCK_OP_TYPE_MAX
; i
++) {
5497 bdrv_op_unblock(bs
, i
, reason
);
5501 bool bdrv_op_blocker_is_empty(BlockDriverState
*bs
)
5505 for (i
= 0; i
< BLOCK_OP_TYPE_MAX
; i
++) {
5506 if (!QLIST_EMPTY(&bs
->op_blockers
[i
])) {
5513 void bdrv_iostatus_enable(BlockDriverState
*bs
)
5515 bs
->iostatus_enabled
= true;
5516 bs
->iostatus
= BLOCK_DEVICE_IO_STATUS_OK
;
5519 /* The I/O status is only enabled if the drive explicitly
5520 * enables it _and_ the VM is configured to stop on errors */
5521 bool bdrv_iostatus_is_enabled(const BlockDriverState
*bs
)
5523 return (bs
->iostatus_enabled
&&
5524 (bs
->on_write_error
== BLOCKDEV_ON_ERROR_ENOSPC
||
5525 bs
->on_write_error
== BLOCKDEV_ON_ERROR_STOP
||
5526 bs
->on_read_error
== BLOCKDEV_ON_ERROR_STOP
));
5529 void bdrv_iostatus_disable(BlockDriverState
*bs
)
5531 bs
->iostatus_enabled
= false;
5534 void bdrv_iostatus_reset(BlockDriverState
*bs
)
5536 if (bdrv_iostatus_is_enabled(bs
)) {
5537 bs
->iostatus
= BLOCK_DEVICE_IO_STATUS_OK
;
5539 block_job_iostatus_reset(bs
->job
);
5544 void bdrv_iostatus_set_err(BlockDriverState
*bs
, int error
)
5546 assert(bdrv_iostatus_is_enabled(bs
));
5547 if (bs
->iostatus
== BLOCK_DEVICE_IO_STATUS_OK
) {
5548 bs
->iostatus
= error
== ENOSPC
? BLOCK_DEVICE_IO_STATUS_NOSPACE
:
5549 BLOCK_DEVICE_IO_STATUS_FAILED
;
5553 void bdrv_img_create(const char *filename
, const char *fmt
,
5554 const char *base_filename
, const char *base_fmt
,
5555 char *options
, uint64_t img_size
, int flags
,
5556 Error
**errp
, bool quiet
)
5558 QemuOptsList
*create_opts
= NULL
;
5559 QemuOpts
*opts
= NULL
;
5560 const char *backing_fmt
, *backing_file
;
5562 BlockDriver
*drv
, *proto_drv
;
5563 BlockDriver
*backing_drv
= NULL
;
5564 Error
*local_err
= NULL
;
5567 /* Find driver and parse its options */
5568 drv
= bdrv_find_format(fmt
);
5570 error_setg(errp
, "Unknown file format '%s'", fmt
);
5574 proto_drv
= bdrv_find_protocol(filename
, true);
5576 error_setg(errp
, "Unknown protocol '%s'", filename
);
5580 create_opts
= qemu_opts_append(create_opts
, drv
->create_opts
);
5581 create_opts
= qemu_opts_append(create_opts
, proto_drv
->create_opts
);
5583 /* Create parameter list with default values */
5584 opts
= qemu_opts_create(create_opts
, NULL
, 0, &error_abort
);
5585 qemu_opt_set_number(opts
, BLOCK_OPT_SIZE
, img_size
);
5587 /* Parse -o options */
5589 if (qemu_opts_do_parse(opts
, options
, NULL
) != 0) {
5590 error_setg(errp
, "Invalid options for file format '%s'", fmt
);
5595 if (base_filename
) {
5596 if (qemu_opt_set(opts
, BLOCK_OPT_BACKING_FILE
, base_filename
)) {
5597 error_setg(errp
, "Backing file not supported for file format '%s'",
5604 if (qemu_opt_set(opts
, BLOCK_OPT_BACKING_FMT
, base_fmt
)) {
5605 error_setg(errp
, "Backing file format not supported for file "
5606 "format '%s'", fmt
);
5611 backing_file
= qemu_opt_get(opts
, BLOCK_OPT_BACKING_FILE
);
5613 if (!strcmp(filename
, backing_file
)) {
5614 error_setg(errp
, "Error: Trying to create an image with the "
5615 "same filename as the backing file");
5620 backing_fmt
= qemu_opt_get(opts
, BLOCK_OPT_BACKING_FMT
);
5622 backing_drv
= bdrv_find_format(backing_fmt
);
5624 error_setg(errp
, "Unknown backing file format '%s'",
5630 // The size for the image must always be specified, with one exception:
5631 // If we are using a backing file, we can obtain the size from there
5632 size
= qemu_opt_get_size(opts
, BLOCK_OPT_SIZE
, 0);
5635 BlockDriverState
*bs
;
5639 /* backing files always opened read-only */
5641 flags
& ~(BDRV_O_RDWR
| BDRV_O_SNAPSHOT
| BDRV_O_NO_BACKING
);
5644 ret
= bdrv_open(&bs
, backing_file
, NULL
, NULL
, back_flags
,
5645 backing_drv
, &local_err
);
5649 size
= bdrv_getlength(bs
);
5651 error_setg_errno(errp
, -size
, "Could not get size of '%s'",
5657 qemu_opt_set_number(opts
, BLOCK_OPT_SIZE
, size
);
5661 error_setg(errp
, "Image creation needs a size parameter");
5667 printf("Formatting '%s', fmt=%s ", filename
, fmt
);
5668 qemu_opts_print(opts
);
5672 ret
= bdrv_create(drv
, filename
, opts
, &local_err
);
5674 if (ret
== -EFBIG
) {
5675 /* This is generally a better message than whatever the driver would
5676 * deliver (especially because of the cluster_size_hint), since that
5677 * is most probably not much different from "image too large". */
5678 const char *cluster_size_hint
= "";
5679 if (qemu_opt_get_size(opts
, BLOCK_OPT_CLUSTER_SIZE
, 0)) {
5680 cluster_size_hint
= " (try using a larger cluster size)";
5682 error_setg(errp
, "The image size is too large for file format '%s'"
5683 "%s", fmt
, cluster_size_hint
);
5684 error_free(local_err
);
5689 qemu_opts_del(opts
);
5690 qemu_opts_free(create_opts
);
5692 error_propagate(errp
, local_err
);
5696 AioContext
*bdrv_get_aio_context(BlockDriverState
*bs
)
5698 return bs
->aio_context
;
5701 void bdrv_detach_aio_context(BlockDriverState
*bs
)
5703 BdrvAioNotifier
*baf
;
5709 QLIST_FOREACH(baf
, &bs
->aio_notifiers
, list
) {
5710 baf
->detach_aio_context(baf
->opaque
);
5713 if (bs
->io_limits_enabled
) {
5714 throttle_detach_aio_context(&bs
->throttle_state
);
5716 if (bs
->drv
->bdrv_detach_aio_context
) {
5717 bs
->drv
->bdrv_detach_aio_context(bs
);
5720 bdrv_detach_aio_context(bs
->file
);
5722 if (bs
->backing_hd
) {
5723 bdrv_detach_aio_context(bs
->backing_hd
);
5726 bs
->aio_context
= NULL
;
5729 void bdrv_attach_aio_context(BlockDriverState
*bs
,
5730 AioContext
*new_context
)
5732 BdrvAioNotifier
*ban
;
5738 bs
->aio_context
= new_context
;
5740 if (bs
->backing_hd
) {
5741 bdrv_attach_aio_context(bs
->backing_hd
, new_context
);
5744 bdrv_attach_aio_context(bs
->file
, new_context
);
5746 if (bs
->drv
->bdrv_attach_aio_context
) {
5747 bs
->drv
->bdrv_attach_aio_context(bs
, new_context
);
5749 if (bs
->io_limits_enabled
) {
5750 throttle_attach_aio_context(&bs
->throttle_state
, new_context
);
5753 QLIST_FOREACH(ban
, &bs
->aio_notifiers
, list
) {
5754 ban
->attached_aio_context(new_context
, ban
->opaque
);
5758 void bdrv_set_aio_context(BlockDriverState
*bs
, AioContext
*new_context
)
5760 bdrv_drain_all(); /* ensure there are no in-flight requests */
5762 bdrv_detach_aio_context(bs
);
5764 /* This function executes in the old AioContext so acquire the new one in
5765 * case it runs in a different thread.
5767 aio_context_acquire(new_context
);
5768 bdrv_attach_aio_context(bs
, new_context
);
5769 aio_context_release(new_context
);
5772 void bdrv_add_aio_context_notifier(BlockDriverState
*bs
,
5773 void (*attached_aio_context
)(AioContext
*new_context
, void *opaque
),
5774 void (*detach_aio_context
)(void *opaque
), void *opaque
)
5776 BdrvAioNotifier
*ban
= g_new(BdrvAioNotifier
, 1);
5777 *ban
= (BdrvAioNotifier
){
5778 .attached_aio_context
= attached_aio_context
,
5779 .detach_aio_context
= detach_aio_context
,
5783 QLIST_INSERT_HEAD(&bs
->aio_notifiers
, ban
, list
);
5786 void bdrv_remove_aio_context_notifier(BlockDriverState
*bs
,
5787 void (*attached_aio_context
)(AioContext
*,
5789 void (*detach_aio_context
)(void *),
5792 BdrvAioNotifier
*ban
, *ban_next
;
5794 QLIST_FOREACH_SAFE(ban
, &bs
->aio_notifiers
, list
, ban_next
) {
5795 if (ban
->attached_aio_context
== attached_aio_context
&&
5796 ban
->detach_aio_context
== detach_aio_context
&&
5797 ban
->opaque
== opaque
)
5799 QLIST_REMOVE(ban
, list
);
5809 void bdrv_add_before_write_notifier(BlockDriverState
*bs
,
5810 NotifierWithReturn
*notifier
)
5812 notifier_with_return_list_add(&bs
->before_write_notifiers
, notifier
);
5815 int bdrv_amend_options(BlockDriverState
*bs
, QemuOpts
*opts
,
5816 BlockDriverAmendStatusCB
*status_cb
)
5818 if (!bs
->drv
->bdrv_amend_options
) {
5821 return bs
->drv
->bdrv_amend_options(bs
, opts
, status_cb
);
5824 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5825 * of block filter and by bdrv_is_first_non_filter.
5826 * It is used to test if the given bs is the candidate or recurse more in the
5829 bool bdrv_recurse_is_first_non_filter(BlockDriverState
*bs
,
5830 BlockDriverState
*candidate
)
5832 /* return false if basic checks fails */
5833 if (!bs
|| !bs
->drv
) {
5837 /* the code reached a non block filter driver -> check if the bs is
5838 * the same as the candidate. It's the recursion termination condition.
5840 if (!bs
->drv
->is_filter
) {
5841 return bs
== candidate
;
5843 /* Down this path the driver is a block filter driver */
5845 /* If the block filter recursion method is defined use it to recurse down
5848 if (bs
->drv
->bdrv_recurse_is_first_non_filter
) {
5849 return bs
->drv
->bdrv_recurse_is_first_non_filter(bs
, candidate
);
5852 /* the driver is a block filter but don't allow to recurse -> return false
5857 /* This function checks if the candidate is the first non filter bs down it's
5858 * bs chain. Since we don't have pointers to parents it explore all bs chains
5859 * from the top. Some filters can choose not to pass down the recursion.
5861 bool bdrv_is_first_non_filter(BlockDriverState
*candidate
)
5863 BlockDriverState
*bs
;
5865 /* walk down the bs forest recursively */
5866 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
5869 /* try to recurse in this top level bs */
5870 perm
= bdrv_recurse_is_first_non_filter(bs
, candidate
);
5872 /* candidate is the first non filter */
5881 BlockDriverState
*check_to_replace_node(const char *node_name
, Error
**errp
)
5883 BlockDriverState
*to_replace_bs
= bdrv_find_node(node_name
);
5884 AioContext
*aio_context
;
5886 if (!to_replace_bs
) {
5887 error_setg(errp
, "Node name '%s' not found", node_name
);
5891 aio_context
= bdrv_get_aio_context(to_replace_bs
);
5892 aio_context_acquire(aio_context
);
5894 if (bdrv_op_is_blocked(to_replace_bs
, BLOCK_OP_TYPE_REPLACE
, errp
)) {
5895 to_replace_bs
= NULL
;
5899 /* We don't want arbitrary node of the BDS chain to be replaced only the top
5900 * most non filter in order to prevent data corruption.
5901 * Another benefit is that this tests exclude backing files which are
5902 * blocked by the backing blockers.
5904 if (!bdrv_is_first_non_filter(to_replace_bs
)) {
5905 error_setg(errp
, "Only top most non filter can be replaced");
5906 to_replace_bs
= NULL
;
5911 aio_context_release(aio_context
);
5912 return to_replace_bs
;
5915 void bdrv_io_plug(BlockDriverState
*bs
)
5917 BlockDriver
*drv
= bs
->drv
;
5918 if (drv
&& drv
->bdrv_io_plug
) {
5919 drv
->bdrv_io_plug(bs
);
5920 } else if (bs
->file
) {
5921 bdrv_io_plug(bs
->file
);
5925 void bdrv_io_unplug(BlockDriverState
*bs
)
5927 BlockDriver
*drv
= bs
->drv
;
5928 if (drv
&& drv
->bdrv_io_unplug
) {
5929 drv
->bdrv_io_unplug(bs
);
5930 } else if (bs
->file
) {
5931 bdrv_io_unplug(bs
->file
);
5935 void bdrv_flush_io_queue(BlockDriverState
*bs
)
5937 BlockDriver
*drv
= bs
->drv
;
5938 if (drv
&& drv
->bdrv_flush_io_queue
) {
5939 drv
->bdrv_flush_io_queue(bs
);
5940 } else if (bs
->file
) {
5941 bdrv_flush_io_queue(bs
->file
);
5945 static bool append_open_options(QDict
*d
, BlockDriverState
*bs
)
5947 const QDictEntry
*entry
;
5948 bool found_any
= false;
5950 for (entry
= qdict_first(bs
->options
); entry
;
5951 entry
= qdict_next(bs
->options
, entry
))
5953 /* Only take options for this level and exclude all non-driver-specific
5955 if (!strchr(qdict_entry_key(entry
), '.') &&
5956 strcmp(qdict_entry_key(entry
), "node-name"))
5958 qobject_incref(qdict_entry_value(entry
));
5959 qdict_put_obj(d
, qdict_entry_key(entry
), qdict_entry_value(entry
));
5967 /* Updates the following BDS fields:
5968 * - exact_filename: A filename which may be used for opening a block device
5969 * which (mostly) equals the given BDS (even without any
5970 * other options; so reading and writing must return the same
5971 * results, but caching etc. may be different)
5972 * - full_open_options: Options which, when given when opening a block device
5973 * (without a filename), result in a BDS (mostly)
5974 * equalling the given one
5975 * - filename: If exact_filename is set, it is copied here. Otherwise,
5976 * full_open_options is converted to a JSON object, prefixed with
5977 * "json:" (for use through the JSON pseudo protocol) and put here.
5979 void bdrv_refresh_filename(BlockDriverState
*bs
)
5981 BlockDriver
*drv
= bs
->drv
;
5988 /* This BDS's file name will most probably depend on its file's name, so
5989 * refresh that first */
5991 bdrv_refresh_filename(bs
->file
);
5994 if (drv
->bdrv_refresh_filename
) {
5995 /* Obsolete information is of no use here, so drop the old file name
5996 * information before refreshing it */
5997 bs
->exact_filename
[0] = '\0';
5998 if (bs
->full_open_options
) {
5999 QDECREF(bs
->full_open_options
);
6000 bs
->full_open_options
= NULL
;
6003 drv
->bdrv_refresh_filename(bs
);
6004 } else if (bs
->file
) {
6005 /* Try to reconstruct valid information from the underlying file */
6006 bool has_open_options
;
6008 bs
->exact_filename
[0] = '\0';
6009 if (bs
->full_open_options
) {
6010 QDECREF(bs
->full_open_options
);
6011 bs
->full_open_options
= NULL
;
6015 has_open_options
= append_open_options(opts
, bs
);
6017 /* If no specific options have been given for this BDS, the filename of
6018 * the underlying file should suffice for this one as well */
6019 if (bs
->file
->exact_filename
[0] && !has_open_options
) {
6020 strcpy(bs
->exact_filename
, bs
->file
->exact_filename
);
6022 /* Reconstructing the full options QDict is simple for most format block
6023 * drivers, as long as the full options are known for the underlying
6024 * file BDS. The full options QDict of that file BDS should somehow
6025 * contain a representation of the filename, therefore the following
6026 * suffices without querying the (exact_)filename of this BDS. */
6027 if (bs
->file
->full_open_options
) {
6028 qdict_put_obj(opts
, "driver",
6029 QOBJECT(qstring_from_str(drv
->format_name
)));
6030 QINCREF(bs
->file
->full_open_options
);
6031 qdict_put_obj(opts
, "file", QOBJECT(bs
->file
->full_open_options
));
6033 bs
->full_open_options
= opts
;
6037 } else if (!bs
->full_open_options
&& qdict_size(bs
->options
)) {
6038 /* There is no underlying file BDS (at least referenced by BDS.file),
6039 * so the full options QDict should be equal to the options given
6040 * specifically for this block device when it was opened (plus the
6041 * driver specification).
6042 * Because those options don't change, there is no need to update
6043 * full_open_options when it's already set. */
6046 append_open_options(opts
, bs
);
6047 qdict_put_obj(opts
, "driver",
6048 QOBJECT(qstring_from_str(drv
->format_name
)));
6050 if (bs
->exact_filename
[0]) {
6051 /* This may not work for all block protocol drivers (some may
6052 * require this filename to be parsed), but we have to find some
6053 * default solution here, so just include it. If some block driver
6054 * does not support pure options without any filename at all or
6055 * needs some special format of the options QDict, it needs to
6056 * implement the driver-specific bdrv_refresh_filename() function.
6058 qdict_put_obj(opts
, "filename",
6059 QOBJECT(qstring_from_str(bs
->exact_filename
)));
6062 bs
->full_open_options
= opts
;
6065 if (bs
->exact_filename
[0]) {
6066 pstrcpy(bs
->filename
, sizeof(bs
->filename
), bs
->exact_filename
);
6067 } else if (bs
->full_open_options
) {
6068 QString
*json
= qobject_to_json(QOBJECT(bs
->full_open_options
));
6069 snprintf(bs
->filename
, sizeof(bs
->filename
), "json:%s",
6070 qstring_get_str(json
));
6075 /* This accessor function purpose is to allow the device models to access the
6076 * BlockAcctStats structure embedded inside a BlockDriverState without being
6077 * aware of the BlockDriverState structure layout.
6078 * It will go away when the BlockAcctStats structure will be moved inside
6079 * the device models.
6081 BlockAcctStats
*bdrv_get_stats(BlockDriverState
*bs
)