/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#endif
struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif
/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}
/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}
static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}
/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
/* This function makes an I/O wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* must this I/O wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue the I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
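
/*
 * Illustration (added note, not from the original source): with throttling
 * enabled, every read/write coroutine passes through
 * bdrv_io_limits_intercept() before reaching the driver.  A request either
 * proceeds immediately and is accounted, or parks itself on
 * throttled_reqs[is_write] until a throttle timer
 * (bdrv_throttle_read/write_timer_cb above) or a completing request wakes
 * it via qemu_co_enter_next()/qemu_co_queue_next().
 */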
size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
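
/*
 * Worked example (added note with illustrative values, not from the
 * original source): path_combine(dest, sz, "/images/base.qcow2",
 * "snap.qcow2") yields "/images/snap.qcow2", while an absolute filename
 * such as "/tmp/a.raw" is copied to dest unchanged.
 */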
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}
void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}
BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;

    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}
static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}
BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}
typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}
int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}
BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}
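
/*
 * Note (added, with illustrative numbers): DIV_ROUND_UP means a 1025-byte
 * image reports 3 sectors with BDRV_SECTOR_SIZE == 512, i.e. a trailing
 * partial sector still counts as a full one.
 */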
/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}
/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
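
/*
 * Summary of the mapping above (added note, derived from the code):
 *
 *   mode          BDRV_O_NOCACHE  BDRV_O_CACHE_WB  BDRV_O_NO_FLUSH
 *   off/none            x                x
 *   directsync          x
 *   writeback                            x
 *   unsafe                               x                x
 *   writethrough    (none set - this is the default)
 */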
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}
static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() with directly using a protocol as drv. This layer is already
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
     * and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                   ? "Driver '%s' can only be used for read-only devices"
                   : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}
/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}
void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{

    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}
/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}
/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is false and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
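
/*
 * Example of the flattened BlockdevRef convention (added note with
 * illustrative option values): with bdref_key "file", either of the
 * following selects the image:
 *
 *   { "file": "existing-node-name" }                       (reference)
 *   { "file.driver": "file", "file.filename": "test.img" } (inline QDict)
 *
 * qdict_extract_subqdict() above strips the "file." prefix before the
 * sub-options are handed to bdrv_open().
 */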
int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}
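
/*
 * After bdrv_append() above, the chain seen by the guest is (sketch):
 *
 *   original image  <-  temporary qcow2 overlay (tmp_filename)
 *
 * so all writes go to the throw-away overlay while the original image
 * stays untouched, which is what snapshot=on promises.
 */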
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
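
/*
 * Typical call (added sketch with an illustrative filename, not taken from
 * this file): open an image read-write, probing the format and letting the
 * block layer own the options QDict:
 *
 *     BlockDriverState *bs = NULL;
 *     Error *local_err = NULL;
 *     int ret = bdrv_open(&bs, "disk.qcow2", NULL, NULL,
 *                         BDRV_O_RDWR | BDRV_O_CACHE_WB, NULL, &local_err);
 */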
typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;
/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
 * already performed, or alternatively may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}
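
/*
 * Usage sketch (added note): callers accumulate devices and then commit
 * atomically, exactly as bdrv_reopen() below does for the single-device
 * case:
 *
 *     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, flags);
 *     ret = bdrv_reopen_multiple(queue, errp);   // frees the queue
 */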
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  bdrv_get_device_name(reopen_state->bs));
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, bdrv_get_device_name(reopen_state->bs),
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}
/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs, NULL);
}
/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}
void bdrv_close(BlockDriverState *bs)
{
    BdrvAioNotifier *ban, *ban_next;

    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            BlockDriverState *backing_hd = bs->backing_hd;
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;
        QDECREF(bs->full_open_options);
        bs->full_open_options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    if (bs->blk) {
        blk_dev_change_media_cb(bs->blk, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
    QLIST_INIT(&bs->aio_notifiers);
}
void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_close(bs);
        aio_context_release(aio_context);
    }
}
/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        busy = false;

        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            AioContext *aio_context = bdrv_get_aio_context(bs);
            bool bs_busy;

            aio_context_acquire(aio_context);
            bdrv_flush_io_queue(bs);
            bdrv_start_throttled_reqs(bs);
            bs_busy = bdrv_requests_pending(bs);
            bs_busy |= aio_poll(aio_context, bs_busy);
            aio_context_release(aio_context);

            busy |= bs_busy;
        }
    }
}
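
/*
 * Added note: callers in this file rely on this barrier semantic; for
 * example bdrv_reopen_multiple() drains before preparing its queue, and
 * bdrv_close() drains both before and after the final flush.
 */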
/* make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists.
 * Also, NUL-terminate the node_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    /*
     * Take care to remove bs from bdrv_states only when it's actually
     * in it.  Note that bs->device_list.tqe_prev is initially null,
     * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
     * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
     * resetting it to null on remove.
     */
    if (bs->device_list.tqe_prev) {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
        bs->device_list.tqe_prev = NULL;
    }
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}
static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */

    /* dev info */
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    bs_dest->device_list = bs_src->device_list;
    bs_dest->blk = bs_src->blk;

    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
           sizeof(bs_dest->op_blockers));
}
/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* The code needs to swap the node_name but simply swapping node_list won't
     * work so first remove the nodes from the graph list, do the swap then
     * insert them back if needed.
     */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
    }

    /* bs_new must be unattached and shouldn't have anything fancy enabled */
    assert(!bs_new->blk);
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new must remain unattached */
    assert(!bs_new->blk);

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* insert the nodes back into the graph node list if needed */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
    }

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bdrv_set_backing_hd(bs_top, bs_new);
}
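
/*
 * In other words (added sketch): if the chain was "base <- bs_top", after
 * bdrv_append(bs_new, bs_top) it is "base <- old top (now held by bs_new)
 * <- new overlay (still reachable through the bs_top pointer)", so callers
 * holding bs_top transparently see the new top layer.
 */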
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->job);
    assert(bdrv_op_blocker_is_empty(bs));
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}
/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv == NULL) {
        return -ENOMEDIUM;
    }
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}
#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
        bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;

    /* qemu_try_blockalign() for bs will choose an alignment that works for
     * bs->backing_hd as well, so no need to compare the alignment manually. */
    buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
    if (buf == NULL) {
        ret = -ENOMEM;
        goto ro_cleanup;
    }

    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    qemu_vfree(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                aio_context_release(aio_context);
                return ret;
            }
        }
        aio_context_release(aio_context);
    }
    return 0;
}
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}
/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}
static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
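
/*
 * Example (added note with illustrative numbers): with align = 512, a
 * request at offset 700 for 100 bytes gets overlap_offset = 512 and
 * overlap_bytes = 512, i.e. the serialised window is widened to the
 * aligned range [512, 1024).
 */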
/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->request_alignment;
    } else {
        return bdi.cluster_size;
    }
}
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}
/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format),
                backing_fmt ?: "");
    }
    return ret;
}
/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 *
 * Returns the bottommost base image if bs == NULL.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    while (active && bs != active->backing_hd) {
        active = active->backing_hd;
    }

    return active;
}

/* Given a BDS, searches for the base layer. */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    return bdrv_find_overlay(bs, NULL);
}
typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;

/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * If backing_file_str is non-NULL, it will be used when modifying top's
 * overlay image metadata.
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base, const char *backing_file_str)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_new0(BlkIntermediateStates, 1);
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
    ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    bdrv_set_backing_hd(new_top_bs, base_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        bdrv_set_backing_hd(intermediate_state->bs, NULL);
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (size > INT_MAX) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (bs->growable) {
        return 0;
    }

    len = bdrv_getlength(bs);

    if (offset < 0) {
        return -EIO;
    }

    if ((offset > len) || (len - offset < size)) {
        return -EIO;
    }

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
        return -EIO;
    }

    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
                                      rwco->qiov->size, rwco->qiov,
                                      rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
                                       rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }
    return rwco.ret;
}

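/*
 * Illustrative caller sketch (not part of the original source): the
 * synchronous wrappers below all reduce to this shape, e.g. a 512-byte
 * read at byte offset 0 into a caller-provided buffer:
 *
 *     QEMUIOVector qiov;
 *     struct iovec iov = { .iov_base = buf, .iov_len = 512 };
 *     qemu_iovec_init_external(&qiov, &iov, 1);
 *     ret = bdrv_prwv_co(bs, 0, &qiov, false, 0);
 */
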
/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}

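/*
 * Usage sketch (illustrative; buffer and offsets are hypothetical):
 * synchronous sector I/O through the wrappers above.
 *
 *     uint8_t sector[BDRV_SECTOR_SIZE];
 *     if (bdrv_read(bs, 0, sector, 1) < 0) {    // read sector 0
 *         // handle error
 *     }
 *     sector[0] ^= 0xff;
 *     if (bdrv_write(bs, 0, sector, 1) < 0) {   // write it back
 *         // handle error
 *     }
 */
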
/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
    int n;

    target_sectors = bdrv_nb_sectors(bs);
    if (target_sectors < 0) {
        return target_sectors;
    }

    for (;;) {
        nb_sectors = target_sectors - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        if (nb_sectors > INT_MAX) {
            nb_sectors = INT_MAX;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}

int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };
    int ret;

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return bytes;
}

int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

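/*
 * Usage sketch (illustrative; the header structure is hypothetical): format
 * drivers use this for metadata that must be durable before later writes
 * may proceed, e.g. updating an on-disk header in place:
 *
 *     ret = bdrv_pwrite_sync(bs, 0, &header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;   // header may not be on disk; do not continue
 *     }
 */
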
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    wait_serialising_requests(req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!(bs->zero_beyond_eof && bs->growable)) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF of growable BDSes */
        int64_t total_sectors, max_nb_sectors;

        total_sectors = bdrv_nb_sectors(bs);
        if (total_sectors < 0) {
            ret = total_sectors;
            goto out;
        }

        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                  align >> BDRV_SECTOR_BITS);
        if (max_nb_sectors > 0) {
            QEMUIOVector local_qiov;
            size_t local_sectors;

            max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_SIZE);
            local_sectors = MIN(max_nb_sectors, nb_sectors);

            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, 0,
                              local_sectors * BDRV_SECTOR_SIZE);

            ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
                                     &local_qiov);

            qemu_iovec_destroy(&local_qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));
        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, false);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}

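/*
 * Worked example (editorial sketch): with align = 4096, a request for
 * offset = 4100, bytes = 100 is widened to offset = 4096,
 * bytes = ROUND_UP(104, 4096) = 4096.  head_buf absorbs the leading 4 bytes
 * and tail_buf the trailing 3992 bytes, so the caller's qiov still receives
 * exactly the 100 bytes it asked for.
 */
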
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_WRITE_ZEROES_DEFAULT 32768

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector. */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_write_zeroes) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

fail:
    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);

    if (bs->growable && ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, true);

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1),
                                  align, align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);

    return ret;
}

static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
        return -EINVAL;
    }

    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);

    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_truncate) {
        return -ENOTSUP;
    }
    if (bs->read_only) {
        return -EACCES;
    }

    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        if (bs->blk) {
            blk_dev_resize_cb(bs->blk);
        }
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Return number of sectors on success, -errno on error.
 */
int64_t bdrv_nb_sectors(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->has_variable_length) {
        int ret = refresh_total_sectors(bs, bs->total_sectors);
        if (ret < 0) {
            return ret;
        }
    }
    return bs->total_sectors;
}

/**
 * Return length in bytes on success, -errno on error.
 * The length is always a multiple of BDRV_SECTOR_SIZE.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    int64_t ret = bdrv_nb_sectors(bs);

    return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t nb_sectors = bdrv_nb_sectors(bs);

    *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                       BlockdevOnError on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read,
                                       int error)
{
    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ?
               BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BLOCK_ERROR_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BLOCK_ERROR_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BLOCK_ERROR_ACTION_IGNORE;
    default:
        abort();
    }
}

static void send_qmp_error_event(BlockDriverState *bs,
                                 BlockErrorAction action,
                                 bool is_read, int error)
{
    IoOperationType optype;

    optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
    qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
                                   bdrv_iostatus_is_enabled(bs),
                                   error == ENOSPC, strerror(error),
                                   &error_abort);
}

/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);

    if (action == BLOCK_ERROR_ACTION_STOP) {
        /* First set the iostatus, so that "info block" returns an iostatus
         * that matches the events raised so far (an additional error iostatus
         * is fine, but not a lost one).
         */
        bdrv_iostatus_set_err(bs, error);

        /* Then raise the request to stop the VM and the event.
         * qemu_system_vmstop_request_prepare has two effects.  First,
         * it ensures that the STOP event always comes after the
         * BLOCK_IO_ERROR event.  Second, it ensures that even if management
         * can observe the STOP event and do a "cont" before the STOP
         * event is issued, the VM will not stop.  In this case, vm_start()
         * also ensures that the STOP/RESUME pair of events is emitted.
         */
        qemu_system_vmstop_request_prepare();
        send_qmp_error_event(bs, action, is_read, error);
        qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
    } else {
        send_qmp_error_event(bs, action, is_read, error);
    }
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;

    /* so a reopen() will preserve wce */
    if (wce) {
        bs->open_flags |= BDRV_O_CACHE_WB;
    } else {
        bs->open_flags &= ~BDRV_O_CACHE_WB;
    }
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return 1;
    }
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) {
        return 1;
    }
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;

    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0) {
            return ret;
        }
        if (!bs->encrypted) {
            return 0;
        }
    }
    if (!bs->encrypted) {
        return 0;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -1;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        if (bs->blk) {
            /* call the change callback now, we skipped it on open */
            blk_dev_change_media_cb(bs->blk, true);
        }
    }
    return ret;
}

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

static int qsort_strcmp(const void *a, const void *b)
{
    return strcmp(*(const char **)a, *(const char **)b);
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;
    int count = 0;
    int i;
    const char **formats = NULL;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        if (drv->format_name) {
            bool found = false;
            int i = count;
            while (formats && i && !found) {
                found = !strcmp(formats[--i], drv->format_name);
            }

            if (!found) {
                formats = g_renew(const char *, formats, count + 1);
                formats[count++] = drv->format_name;
            }
        }
    }

    qsort(formats, count, sizeof(formats[0]), qsort_strcmp);

    for (i = 0; i < count; i++) {
        it(opaque, formats[i]);
    }

    g_free(formats);
}

/* This function is to find block backend bs */
/* TODO convert callers to blk_by_name(), then remove */
BlockDriverState *bdrv_find(const char *name)
{
    BlockBackend *blk = blk_by_name(name);

    return blk ? blk_bs(blk) : NULL;
}

/* This function is to find a node in the bs graph */
BlockDriverState *bdrv_find_node(const char *node_name)
{
    BlockDriverState *bs;

    assert(node_name);

    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        if (!strcmp(node_name, bs->node_name)) {
            return bs;
        }
    }
    return NULL;
}

/* Put this QMP function here so it can access the static graph_bdrv_states. */
BlockDeviceInfoList *bdrv_named_nodes_list(void)
{
    BlockDeviceInfoList *list, *entry;
    BlockDriverState *bs;

    list = NULL;
    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
        entry = g_malloc0(sizeof(*entry));
        entry->value = bdrv_block_device_info(bs);
        entry->next = list;
        list = entry;
    }

    return list;
}

BlockDriverState *bdrv_lookup_bs(const char *device,
                                 const char *node_name,
                                 Error **errp)
{
    BlockBackend *blk;
    BlockDriverState *bs;

    if (device) {
        blk = blk_by_name(device);

        if (blk) {
            return blk_bs(blk);
        }
    }

    if (node_name) {
        bs = bdrv_find_node(node_name);

        if (bs) {
            return bs;
        }
    }

    error_setg(errp, "Cannot find device=%s nor node_name=%s",
                     device ? device : "",
                     node_name ? node_name : "");
    return NULL;
}

/* If 'base' is in the same chain as 'top', return true. Otherwise,
 * return false.  If either argument is NULL, return false. */
bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
{
    while (top && top != base) {
        top = top->backing_hd;
    }

    return top != NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, device_list);
}

/* TODO check what callers really want: bs->node_name or blk_name() */
const char *bdrv_get_device_name(const BlockDriverState *bs)
{
    return bs->blk ? blk_name(bs->blk) : "";
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

int bdrv_flush_all(void)
{
    BlockDriverState *bs;
    int result = 0;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        int ret;

        aio_context_acquire(aio_context);
        ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
        aio_context_release(aio_context);
    }

    return result;
}

int bdrv_has_zero_init_1(BlockDriverState *bs)
{
    return 1;
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    /* If BS is a copy on write image, it is initialized to
       the contents of the base image, which may not be zeroes.  */
    if (bs->backing_hd) {
        return 0;
    }
    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    /* safe default */
    return 0;
}

bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.unallocated_blocks_are_zero;
    }

    return false;
}

bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
{
    BlockDriverInfo bdi;

    if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
        return false;
    }

    if (bdrv_get_info(bs, &bdi) == 0) {
        return bdi.can_write_zeroes_with_unmap;
    }

    return false;
}

typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int64_t ret;
    bool done;
} BdrvCoGetBlockStatusData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t total_sectors;
    int64_t n;
    int64_t ret, ret2;

    total_sectors = bdrv_nb_sectors(bs);
    if (total_sectors < 0) {
        return total_sectors;
    }

    if (sector_num >= total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    }

    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
            if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        int file_pnum;

        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, &file_pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            if (!file_pnum) {
                /* !file_pnum indicates an offset at or beyond the EOF; it is
                 * perfectly valid for the format block driver to point to such
                 * offsets, so catch it and mark everything as zero */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

    return ret;
}

/* Coroutine wrapper for bdrv_get_block_status() */
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
{
    BdrvCoGetBlockStatusData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_get_block_status(bs, data->sector_num,
                                         data->nb_sectors, data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_get_block_status().
 *
 * See bdrv_co_get_block_status() for details.
 */
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_co_entry(&data);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            aio_poll(aio_context, true);
        }
    }
    return data.ret;
}

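/*
 * Usage sketch (illustrative): walking an image's allocation map in chunks,
 * as bdrv_make_zero() above does.
 *
 *     int n;
 *     int64_t st = bdrv_get_block_status(bs, sector, nb_sectors, &n);
 *     if (st >= 0 && (st & BDRV_BLOCK_ZERO)) {
 *         // sectors [sector, sector + n) read as zeroes
 *     }
 */
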
int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }
    return !!(ret & BDRV_BLOCK_ALLOCATED);
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nr_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return bs->backing_file;
    } else if (bs->encrypted) {
        return bs->filename;
    } else {
        return NULL;
    }
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_write_compressed) {
        return -ENOTSUP;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_get_info) {
        return -ENOTSUP;
    }
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_get_specific_info) {
        return drv->bdrv_get_specific_info(bs);
    }
    return NULL;
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_load_vmstate) {
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    }
    if (bs->file) {
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    }
    return -ENOTSUP;
}

void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
    }

    return -ENOTSUP;
}

int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}

int bdrv_get_backing_file_depth(BlockDriverState *bs)
{
    if (!bs->drv) {
        return 0;
    }

    if (!bs->backing_hd) {
        return 0;
    }

    return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}

/**************************************************************/
/* async I/Os */

BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                           QEMUIOVector *qiov, int nb_sectors,
                           BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                            QEMUIOVector *qiov, int nb_sectors,
                            BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}

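/*
 * Usage sketch (illustrative; the callback and opaque state are
 * hypothetical): submit a read and handle the result asynchronously.
 *
 *     static void my_read_cb(void *opaque, int ret)
 *     {
 *         if (ret < 0) {
 *             // the request failed
 *         }
 *     }
 *
 *     bdrv_aio_readv(bs, 0, &qiov, 1, my_read_cb, my_state);
 */
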
typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request normally complete.
 * In either case the completion callback must be called. */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockAIOCBSync {
    BlockAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockAIOCBSync;

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockAIOCBSync),
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockAIOCBSync *acb = opaque;

    if (!acb->is_write && acb->ret >= 0) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_unref(acb);
}

static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                      int64_t sector_num,
                                      QEMUIOVector *qiov,
                                      int nb_sectors,
                                      BlockCompletionFunc *cb,
                                      void *opaque,
                                      int is_write)
{
    BlockAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_try_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (acb->bounce == NULL) {
        acb->ret = -ENOMEM;
    } else if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}

typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;
    BlockRequest req;
    bool is_write;
    QEMUBH *bh;
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockAIOCBCoroutine),
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    qemu_bh_delete(acb->bh);
    qemu_aio_unref(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

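/*
 * Editorial note (sketch): this helper bridges the callback-style AIO API
 * and the coroutine implementation above.  The request runs in a coroutine;
 * completion is deferred to a bottom half (bdrv_co_em_bh), so the caller's
 * callback never runs before this function has returned.
 */
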
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    acb->refcnt = 1;
    return acb;
}

void qemu_aio_ref(void *p)
{
    BlockAIOCB *acb = p;
    acb->refcnt++;
}

void qemu_aio_unref(void *p)
{
    BlockAIOCB *acb = p;
    assert(acb->refcnt > 0);
    if (--acb->refcnt == 0) {
        g_slice_free1(acb->aiocb_info->aiocb_size, acb);
    }
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}
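
/* bdrv_co_io_em() is the adaptor in the opposite direction: it provides the
 * coroutine read/write interface on top of a driver that only implements
 * AIO. The coroutine yields after submitting the request, and
 * bdrv_co_io_em_complete() records the result and re-enters it, so from the
 * caller's perspective the call blocks cooperatively until the AIO is done.
 */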
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
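
/* Summary of the flush layering above: bdrv_co_flush_to_os() first pushes
 * data out of qemu's own writeback cache, then (unless BDRV_O_NO_FLUSH is
 * set) bdrv_co_flush_to_disk() or the driver's AIO flush callback forces it
 * to the medium, and finally the flush recurses into bs->file so the
 * protocol level gets flushed as well.
 */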
void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
{
    Error *local_err = NULL;
    int ret;

    if (!bs->drv) {
        return;
    }

    if (!(bs->open_flags & BDRV_O_INCOMING)) {
        return;
    }
    bs->open_flags &= ~BDRV_O_INCOMING;

    if (bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs, &local_err);
    } else if (bs->file) {
        bdrv_invalidate_cache(bs->file, &local_err);
    }
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        return;
    }
}

void bdrv_invalidate_cache_all(Error **errp)
{
    BlockDriverState *bs;
    Error *local_err = NULL;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_invalidate_cache(bs, &local_err);
        aio_context_release(aio_context);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    }
}
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}
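
/* The wrapper shape above (fast-path when already in a coroutine, otherwise
 * spawn a coroutine and aio_poll() until rwco.ret leaves NOT_DONE) is shared
 * by bdrv_flush(), bdrv_discard() and the synchronous read/write helpers:
 * aio_poll(..., true) blocks and dispatches handlers, which is what
 * eventually lets the coroutine entry function store its result.
 */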
typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}
/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard;

    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}
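
/* Worked example for the alignment code above: with
 * bs->bl.discard_alignment == 128, sector_num == 100 and nb_sectors == 1000,
 * the first iteration clamps num to 128 and then subtracts 100 % 128,
 * issuing a 28-sector discard (sectors 100..127). Subsequent iterations
 * start on the 128-sector boundary and are only limited by max_discard.
 */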
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}
/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know. Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;
    const char *device_name;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    device_name = bdrv_get_device_name(bs);
    if (device_name[0] != '\0') {
        qapi_event_send_device_tray_moved(device_name,
                                          eject_flag, &error_abort);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}
/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl)
        return drv->bdrv_ioctl(bs, req, buf);
    return -ENOTSUP;
}

BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl)
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    return NULL;
}

void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_opt_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}
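
/* Buffers that are expected to pass bdrv_qiov_is_aligned() should come from
 * the helpers above rather than plain malloc. Illustrative use:
 *
 *     void *buf = qemu_blockalign(bs, len);        // aborts on OOM
 *     void *buf2 = qemu_try_blockalign(bs, len);   // NULL on OOM
 *     ...
 *     qemu_vfree(buf);
 *
 * The alignment used, bdrv_opt_mem_align(bs), reflects what the underlying
 * driver needs (e.g. for O_DIRECT file access).
 */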
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
                                          Error **errp)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = bdrv_nb_sectors(bs);
    if (bitmap_size < 0) {
        error_setg_errno(errp, -bitmap_size, "could not get length of device");
        errno = -bitmap_size;
        return NULL;
    }
    bitmap = g_new0(BdrvDirtyBitmap, 1);
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
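
/* Worked example for the granularity handling above: with granularity ==
 * 65536 bytes, shifting by BDRV_SECTOR_BITS (9) gives 128 sectors, and
 * hbitmap_alloc() receives ffs(128) - 1 == 7, i.e. one bitmap bit tracks
 * 2^7 = 128 sectors (64 KiB) of the device.
 */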
void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if (bm == bitmap) {
            QLIST_REMOVE(bitmap, list);
            hbitmap_free(bitmap->bitmap);
            g_free(bitmap);
            return;
        }
    }
}

BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BdrvDirtyBitmap *bm;
    BlockDirtyInfoList *list = NULL;
    BlockDirtyInfoList **plist = &list;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
        BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
        entry->value = info;
        *plist = entry;
        plist = &entry->next;
    }

    return list;
}
int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
{
    if (bitmap) {
        return hbitmap_get(bitmap->bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}
/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}

/* Release a previously grabbed reference to bs.
 * If after releasing, reference count is zero, the BlockDriverState is
 * deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    if (!bs) {
        return;
    }
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}
struct BdrvOpBlocker {
    Error *reason;
    QLIST_ENTRY(BdrvOpBlocker) list;
};

bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
    if (!QLIST_EMPTY(&bs->op_blockers[op])) {
        blocker = QLIST_FIRST(&bs->op_blockers[op]);
        if (errp) {
            error_setg(errp, "Device '%s' is busy: %s",
                       bdrv_get_device_name(bs),
                       error_get_pretty(blocker->reason));
        }
        return true;
    }
    return false;
}

void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);

    blocker = g_new0(BdrvOpBlocker, 1);
    blocker->reason = reason;
    QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
}

void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
{
    BdrvOpBlocker *blocker, *next;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
    QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
        if (blocker->reason == reason) {
            QLIST_REMOVE(blocker, list);
            g_free(blocker);
        }
    }
}

void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_block(bs, i, reason);
    }
}

void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_unblock(bs, i, reason);
    }
}

bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
{
    int i;

    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        if (!QLIST_EMPTY(&bs->op_blockers[i])) {
            return false;
        }
    }
    return true;
}
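
/* Illustrative use of the op blocker API (a sketch; the reason text is made
 * up): a block job can protect its device like this:
 *
 *     Error *blocker = NULL;
 *     error_setg(&blocker, "Device is in use by a block job");
 *     bdrv_op_block_all(bs, blocker);
 *     ...
 *     bdrv_op_unblock_all(bs, blocker);
 *     error_free(blocker);
 *
 * The Error pointer doubles as the identity token: bdrv_op_unblock()
 * removes only blockers whose reason is that very same pointer.
 */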
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QemuOptsList *create_opts = NULL;
    QemuOpts *opts = NULL;
    const char *backing_fmt, *backing_file;
    int64_t size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_opts = qemu_opts_append(create_opts, drv->create_opts);
    create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);

    /* Create parameter list with default values */
    opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        if (qemu_opts_do_parse(opts, options, NULL) != 0) {
            error_setg(errp, "Invalid options for file format '%s'", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
    if (backing_file) {
        if (!strcmp(filename, backing_file)) {
            error_setg(errp, "Error: Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt) {
        backing_drv = bdrv_find_format(backing_fmt);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
    if (size == -1) {
        if (backing_file) {
            BlockDriverState *bs;
            int64_t size;
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = NULL;
            ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                goto out;
            }
            size = bdrv_getlength(bs);
            if (size < 0) {
                error_setg_errno(errp, -size, "Could not get size of '%s'",
                                 backing_file);
                bdrv_unref(bs);
                goto out;
            }

            qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        qemu_opts_print(opts);
        puts("");
    }

    ret = bdrv_create(drv, filename, opts, &local_err);

    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    qemu_opts_del(opts);
    qemu_opts_free(create_opts);
    if (local_err) {
        error_propagate(errp, local_err);
    }
}
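
/* bdrv_img_create() is the backend of "qemu-img create"; an invocation such
 * as (illustrative)
 *
 *     qemu-img create -f qcow2 -o backing_file=base.qcow2 overlay.qcow2
 *
 * arrives here with fmt = "qcow2" and the backing file carried in options,
 * in which case the image size may be derived from the backing file's
 * length as done above.
 */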
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    return bs->aio_context;
}
void bdrv_detach_aio_context(BlockDriverState *bs)
{
    BdrvAioNotifier *baf;

    if (!bs->drv) {
        return;
    }

    QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
        baf->detach_aio_context(baf->opaque);
    }

    if (bs->io_limits_enabled) {
        throttle_detach_aio_context(&bs->throttle_state);
    }
    if (bs->drv->bdrv_detach_aio_context) {
        bs->drv->bdrv_detach_aio_context(bs);
    }
    if (bs->file) {
        bdrv_detach_aio_context(bs->file);
    }
    if (bs->backing_hd) {
        bdrv_detach_aio_context(bs->backing_hd);
    }

    bs->aio_context = NULL;
}

void bdrv_attach_aio_context(BlockDriverState *bs,
                             AioContext *new_context)
{
    BdrvAioNotifier *ban;

    if (!bs->drv) {
        return;
    }

    bs->aio_context = new_context;

    if (bs->backing_hd) {
        bdrv_attach_aio_context(bs->backing_hd, new_context);
    }
    if (bs->file) {
        bdrv_attach_aio_context(bs->file, new_context);
    }
    if (bs->drv->bdrv_attach_aio_context) {
        bs->drv->bdrv_attach_aio_context(bs, new_context);
    }
    if (bs->io_limits_enabled) {
        throttle_attach_aio_context(&bs->throttle_state, new_context);
    }

    QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
        ban->attached_aio_context(new_context, ban->opaque);
    }
}

void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
{
    bdrv_drain_all(); /* ensure there are no in-flight requests */

    bdrv_detach_aio_context(bs);

    /* This function executes in the old AioContext so acquire the new one in
     * case it runs in a different thread.
     */
    aio_context_acquire(new_context);
    bdrv_attach_aio_context(bs, new_context);
    aio_context_release(new_context);
}
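
/* bdrv_set_aio_context() is how a BDS migrates between event loops, e.g.
 * when virtio-blk dataplane moves a drive into an IOThread. The
 * bdrv_drain_all() above guarantees no request completions are pending
 * while the detach/attach notifier chains run; afterwards, all new I/O for
 * this BDS completes in new_context.
 */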
void bdrv_add_aio_context_notifier(BlockDriverState *bs,
        void (*attached_aio_context)(AioContext *new_context, void *opaque),
        void (*detach_aio_context)(void *opaque), void *opaque)
{
    BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
    *ban = (BdrvAioNotifier){
        .attached_aio_context = attached_aio_context,
        .detach_aio_context   = detach_aio_context,
        .opaque               = opaque
    };

    QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
}

void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
                                      void (*attached_aio_context)(AioContext *,
                                                                   void *),
                                      void (*detach_aio_context)(void *),
                                      void *opaque)
{
    BdrvAioNotifier *ban, *ban_next;

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        if (ban->attached_aio_context == attached_aio_context &&
            ban->detach_aio_context   == detach_aio_context   &&
            ban->opaque               == opaque)
        {
            QLIST_REMOVE(ban, list);
            g_free(ban);

            return;
        }
    }

    abort();
}
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts)
{
    if (!bs->drv->bdrv_amend_options) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, opts);
}
/* This function will be called by the bdrv_recurse_is_first_non_filter method
 * of block filters and by bdrv_is_first_non_filter.
 * It is used to test whether the given bs is the candidate or to recurse
 * further into the node graph.
 */
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    /* return false if basic checks fail */
    if (!bs || !bs->drv) {
        return false;
    }

    /* the code reached a non block filter driver -> check if the bs is
     * the same as the candidate. It's the recursion termination condition.
     */
    if (!bs->drv->is_filter) {
        return bs == candidate;
    }
    /* Down this path the driver is a block filter driver */

    /* If the block filter recursion method is defined use it to recurse down
     * the node graph.
     */
    if (bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    /* the driver is a block filter but doesn't allow recursion -> return false
     */
    return false;
}

/* This function checks whether the candidate is the first non-filter bs down
 * its bs chain. Since we don't have pointers to parents it explores all bs
 * chains from the top. Some filters can choose not to pass down the recursion.
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        /* try to recurse in this top level bs */
        perm = bdrv_recurse_is_first_non_filter(bs, candidate);

        /* candidate is the first non filter */
        if (perm) {
            return true;
        }
    }

    return false;
}

BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
{
    BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
    if (!to_replace_bs) {
        error_setg(errp, "Node name '%s' not found", node_name);
        return NULL;
    }

    if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
        return NULL;
    }

    /* We don't want an arbitrary node of the BDS chain to be replaced, only
     * the topmost non-filter, in order to prevent data corruption.
     * Another benefit is that this test excludes backing files which are
     * blocked by the backing blockers.
     */
    if (!bdrv_is_first_non_filter(to_replace_bs)) {
        error_setg(errp, "Only top most non filter can be replaced");
        return NULL;
    }

    return to_replace_bs;
}
void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file);
    }
}
static bool append_open_options(QDict *d, BlockDriverState *bs)
{
    const QDictEntry *entry;
    bool found_any = false;

    for (entry = qdict_first(bs->options); entry;
         entry = qdict_next(bs->options, entry))
    {
        /* Only take options for this level and exclude all non-driver-specific
         * options */
        if (!strchr(qdict_entry_key(entry), '.') &&
            strcmp(qdict_entry_key(entry), "node-name"))
        {
            qobject_incref(qdict_entry_value(entry));
            qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
            found_any = true;
        }
    }

    return found_any;
}
/* Updates the following BDS fields:
 *  - exact_filename: A filename which may be used for opening a block device
 *                    which (mostly) equals the given BDS (even without any
 *                    other options; so reading and writing must return the same
 *                    results, but caching etc. may be different)
 *  - full_open_options: Options which, when given when opening a block device
 *                       (without a filename), result in a BDS (mostly)
 *                       equalling the given one
 *  - filename: If exact_filename is set, it is copied here. Otherwise,
 *              full_open_options is converted to a JSON object, prefixed with
 *              "json:" (for use through the JSON pseudo protocol) and put here.
 */
void bdrv_refresh_filename(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    QDict *opts;

    if (!drv) {
        return;
    }

    /* This BDS's file name will most probably depend on its file's name, so
     * refresh that first */
    if (bs->file) {
        bdrv_refresh_filename(bs->file);
    }

    if (drv->bdrv_refresh_filename) {
        /* Obsolete information is of no use here, so drop the old file name
         * information before refreshing it */
        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        drv->bdrv_refresh_filename(bs);
    } else if (bs->file) {
        /* Try to reconstruct valid information from the underlying file */
        bool has_open_options;

        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        opts = qdict_new();
        has_open_options = append_open_options(opts, bs);

        /* If no specific options have been given for this BDS, the filename of
         * the underlying file should suffice for this one as well */
        if (bs->file->exact_filename[0] && !has_open_options) {
            strcpy(bs->exact_filename, bs->file->exact_filename);
        }
        /* Reconstructing the full options QDict is simple for most format block
         * drivers, as long as the full options are known for the underlying
         * file BDS. The full options QDict of that file BDS should somehow
         * contain a representation of the filename, therefore the following
         * suffices without querying the (exact_)filename of this BDS. */
        if (bs->file->full_open_options) {
            qdict_put_obj(opts, "driver",
                          QOBJECT(qstring_from_str(drv->format_name)));
            QINCREF(bs->file->full_open_options);
            qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));

            bs->full_open_options = opts;
        } else {
            QDECREF(opts);
        }
    } else if (!bs->full_open_options && qdict_size(bs->options)) {
        /* There is no underlying file BDS (at least referenced by BDS.file),
         * so the full options QDict should be equal to the options given
         * specifically for this block device when it was opened (plus the
         * driver specification).
         * Because those options don't change, there is no need to update
         * full_open_options when it's already set. */

        opts = qdict_new();
        append_open_options(opts, bs);
        qdict_put_obj(opts, "driver",
                      QOBJECT(qstring_from_str(drv->format_name)));

        if (bs->exact_filename[0]) {
            /* This may not work for all block protocol drivers (some may
             * require this filename to be parsed), but we have to find some
             * default solution here, so just include it. If some block driver
             * does not support pure options without any filename at all or
             * needs some special format of the options QDict, it needs to
             * implement the driver-specific bdrv_refresh_filename() function.
             */
            qdict_put_obj(opts, "filename",
                          QOBJECT(qstring_from_str(bs->exact_filename)));
        }

        bs->full_open_options = opts;
    }

    if (bs->exact_filename[0]) {
        pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
    } else if (bs->full_open_options) {
        QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
        snprintf(bs->filename, sizeof(bs->filename), "json:%s",
                 qstring_get_str(json));
        QDECREF(json);
    }
}
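
/* Example of the resulting pseudo-filename for a BDS that was opened purely
 * through options (illustrative values):
 *
 *     json:{"driver": "qcow2", "file": {"driver": "file",
 *                                       "filename": "test.qcow2"}}
 *
 * A string of this form can be handed back to bdrv_open() through the JSON
 * pseudo protocol to open a (mostly) equivalent block device.
 */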
/* The purpose of this accessor function is to allow device models to access
 * the BlockAcctStats structure embedded inside a BlockDriverState without
 * being aware of the BlockDriverState structure layout.
 * It will go away when the BlockAcctStats structure is moved inside the
 * device models.
 */
BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)