/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qapi/qmp/qjson.h"
#include "qemu/module.h"
#include "qemu/notify.h"
#include "qemu/timer.h"
#include "qmp-commands.h"
#include "sysemu/sysemu.h"

#include <assert.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
53 struct BdrvDirtyBitmap
{
55 QLIST_ENTRY(BdrvDirtyBitmap
) list
;
58 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
60 static void bdrv_dev_change_media_cb(BlockDriverState
*bs
, bool load
);
61 static BlockDriverAIOCB
*bdrv_aio_readv_em(BlockDriverState
*bs
,
62 int64_t sector_num
, QEMUIOVector
*qiov
, int nb_sectors
,
63 BlockDriverCompletionFunc
*cb
, void *opaque
);
64 static BlockDriverAIOCB
*bdrv_aio_writev_em(BlockDriverState
*bs
,
65 int64_t sector_num
, QEMUIOVector
*qiov
, int nb_sectors
,
66 BlockDriverCompletionFunc
*cb
, void *opaque
);
67 static int coroutine_fn
bdrv_co_readv_em(BlockDriverState
*bs
,
68 int64_t sector_num
, int nb_sectors
,
70 static int coroutine_fn
bdrv_co_writev_em(BlockDriverState
*bs
,
71 int64_t sector_num
, int nb_sectors
,
73 static int coroutine_fn
bdrv_co_do_preadv(BlockDriverState
*bs
,
74 int64_t offset
, unsigned int bytes
, QEMUIOVector
*qiov
,
75 BdrvRequestFlags flags
);
76 static int coroutine_fn
bdrv_co_do_pwritev(BlockDriverState
*bs
,
77 int64_t offset
, unsigned int bytes
, QEMUIOVector
*qiov
,
78 BdrvRequestFlags flags
);
79 static BlockDriverAIOCB
*bdrv_co_aio_rw_vector(BlockDriverState
*bs
,
83 BdrvRequestFlags flags
,
84 BlockDriverCompletionFunc
*cb
,
87 static void coroutine_fn
bdrv_co_do_rw(void *opaque
);
88 static int coroutine_fn
bdrv_co_do_write_zeroes(BlockDriverState
*bs
,
89 int64_t sector_num
, int nb_sectors
, BdrvRequestFlags flags
);
91 static QTAILQ_HEAD(, BlockDriverState
) bdrv_states
=
92 QTAILQ_HEAD_INITIALIZER(bdrv_states
);
94 static QTAILQ_HEAD(, BlockDriverState
) graph_bdrv_states
=
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states
);
97 static QLIST_HEAD(, BlockDriver
) bdrv_drivers
=
98 QLIST_HEAD_INITIALIZER(bdrv_drivers
);
100 /* If non-zero, use only whitelisted block drivers */
101 static int use_bdrv_whitelist
;
/* Return non-zero iff filename starts with a DOS drive letter and ':',
 * e.g. "c:" or "C:\foo".  (Reconstruction: the extraction dropped the
 * 'filename[1] == \':\'' line and braces.) */
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}
111 int is_windows_drive(const char *filename
)
113 if (is_windows_drive_prefix(filename
) &&
116 if (strstart(filename
, "\\\\.\\", NULL
) ||
117 strstart(filename
, "//./", NULL
))
123 /* throttling disk I/O limits */
124 void bdrv_set_io_limits(BlockDriverState
*bs
,
129 throttle_config(&bs
->throttle_state
, cfg
);
131 for (i
= 0; i
< 2; i
++) {
132 qemu_co_enter_next(&bs
->throttled_reqs
[i
]);
136 /* this function drain all the throttled IOs */
137 static bool bdrv_start_throttled_reqs(BlockDriverState
*bs
)
139 bool drained
= false;
140 bool enabled
= bs
->io_limits_enabled
;
143 bs
->io_limits_enabled
= false;
145 for (i
= 0; i
< 2; i
++) {
146 while (qemu_co_enter_next(&bs
->throttled_reqs
[i
])) {
151 bs
->io_limits_enabled
= enabled
;
156 void bdrv_io_limits_disable(BlockDriverState
*bs
)
158 bs
->io_limits_enabled
= false;
160 bdrv_start_throttled_reqs(bs
);
162 throttle_destroy(&bs
->throttle_state
);
165 static void bdrv_throttle_read_timer_cb(void *opaque
)
167 BlockDriverState
*bs
= opaque
;
168 qemu_co_enter_next(&bs
->throttled_reqs
[0]);
171 static void bdrv_throttle_write_timer_cb(void *opaque
)
173 BlockDriverState
*bs
= opaque
;
174 qemu_co_enter_next(&bs
->throttled_reqs
[1]);
177 /* should be called before bdrv_set_io_limits if a limit is set */
178 void bdrv_io_limits_enable(BlockDriverState
*bs
)
180 assert(!bs
->io_limits_enabled
);
181 throttle_init(&bs
->throttle_state
,
183 bdrv_throttle_read_timer_cb
,
184 bdrv_throttle_write_timer_cb
,
186 bs
->io_limits_enabled
= true;
189 /* This function makes an IO wait if needed
191 * @nb_sectors: the number of sectors of the IO
192 * @is_write: is the IO a write
194 static void bdrv_io_limits_intercept(BlockDriverState
*bs
,
198 /* does this io must wait */
199 bool must_wait
= throttle_schedule_timer(&bs
->throttle_state
, is_write
);
201 /* if must wait or any request of this type throttled queue the IO */
203 !qemu_co_queue_empty(&bs
->throttled_reqs
[is_write
])) {
204 qemu_co_queue_wait(&bs
->throttled_reqs
[is_write
]);
207 /* the IO will be executed, do the accounting */
208 throttle_account(&bs
->throttle_state
, is_write
, bytes
);
211 /* if the next request must wait -> do nothing */
212 if (throttle_schedule_timer(&bs
->throttle_state
, is_write
)) {
216 /* else queue next request for execution */
217 qemu_co_queue_next(&bs
->throttled_reqs
[is_write
]);
220 size_t bdrv_opt_mem_align(BlockDriverState
*bs
)
222 if (!bs
|| !bs
->drv
) {
223 /* 4k should be on the safe side */
227 return bs
->bl
.opt_mem_alignment
;
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    /* a ':' before any path separator means a protocol prefix */
    return *p == ':';
}
/* Return non-zero iff path is absolute (drive/UNC-aware on Windows). */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        /* skip an optional "protocol:" prefix in base_path */
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        /* find the last path separator of base_path */
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
305 void bdrv_get_full_backing_filename(BlockDriverState
*bs
, char *dest
, size_t sz
)
307 if (bs
->backing_file
[0] == '\0' || path_has_protocol(bs
->backing_file
)) {
308 pstrcpy(dest
, sz
, bs
->backing_file
);
310 path_combine(dest
, sz
, bs
->filename
, bs
->backing_file
);
314 void bdrv_register(BlockDriver
*bdrv
)
316 /* Block drivers without coroutine functions need emulation */
317 if (!bdrv
->bdrv_co_readv
) {
318 bdrv
->bdrv_co_readv
= bdrv_co_readv_em
;
319 bdrv
->bdrv_co_writev
= bdrv_co_writev_em
;
321 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
322 * the block driver lacks aio we need to emulate that too.
324 if (!bdrv
->bdrv_aio_readv
) {
325 /* add AIO emulation layer */
326 bdrv
->bdrv_aio_readv
= bdrv_aio_readv_em
;
327 bdrv
->bdrv_aio_writev
= bdrv_aio_writev_em
;
331 QLIST_INSERT_HEAD(&bdrv_drivers
, bdrv
, list
);
334 /* create a new block device (by default it is empty) */
335 BlockDriverState
*bdrv_new(const char *device_name
)
337 BlockDriverState
*bs
;
339 bs
= g_malloc0(sizeof(BlockDriverState
));
340 QLIST_INIT(&bs
->dirty_bitmaps
);
341 pstrcpy(bs
->device_name
, sizeof(bs
->device_name
), device_name
);
342 if (device_name
[0] != '\0') {
343 QTAILQ_INSERT_TAIL(&bdrv_states
, bs
, device_list
);
345 bdrv_iostatus_disable(bs
);
346 notifier_list_init(&bs
->close_notifiers
);
347 notifier_with_return_list_init(&bs
->before_write_notifiers
);
348 qemu_co_queue_init(&bs
->throttled_reqs
[0]);
349 qemu_co_queue_init(&bs
->throttled_reqs
[1]);
355 void bdrv_add_close_notifier(BlockDriverState
*bs
, Notifier
*notify
)
357 notifier_list_add(&bs
->close_notifiers
, notify
);
360 BlockDriver
*bdrv_find_format(const char *format_name
)
363 QLIST_FOREACH(drv1
, &bdrv_drivers
, list
) {
364 if (!strcmp(drv1
->format_name
, format_name
)) {
371 static int bdrv_is_whitelisted(BlockDriver
*drv
, bool read_only
)
373 static const char *whitelist_rw
[] = {
374 CONFIG_BDRV_RW_WHITELIST
376 static const char *whitelist_ro
[] = {
377 CONFIG_BDRV_RO_WHITELIST
381 if (!whitelist_rw
[0] && !whitelist_ro
[0]) {
382 return 1; /* no whitelist, anything goes */
385 for (p
= whitelist_rw
; *p
; p
++) {
386 if (!strcmp(drv
->format_name
, *p
)) {
391 for (p
= whitelist_ro
; *p
; p
++) {
392 if (!strcmp(drv
->format_name
, *p
)) {
400 BlockDriver
*bdrv_find_whitelisted_format(const char *format_name
,
403 BlockDriver
*drv
= bdrv_find_format(format_name
);
404 return drv
&& bdrv_is_whitelisted(drv
, read_only
) ? drv
: NULL
;
407 typedef struct CreateCo
{
410 QEMUOptionParameter
*options
;
415 static void coroutine_fn
bdrv_create_co_entry(void *opaque
)
417 Error
*local_err
= NULL
;
420 CreateCo
*cco
= opaque
;
423 ret
= cco
->drv
->bdrv_create(cco
->filename
, cco
->options
, &local_err
);
425 error_propagate(&cco
->err
, local_err
);
430 int bdrv_create(BlockDriver
*drv
, const char* filename
,
431 QEMUOptionParameter
*options
, Error
**errp
)
438 .filename
= g_strdup(filename
),
444 if (!drv
->bdrv_create
) {
445 error_setg(errp
, "Driver '%s' does not support image creation", drv
->format_name
);
450 if (qemu_in_coroutine()) {
451 /* Fast-path if already in coroutine context */
452 bdrv_create_co_entry(&cco
);
454 co
= qemu_coroutine_create(bdrv_create_co_entry
);
455 qemu_coroutine_enter(co
, &cco
);
456 while (cco
.ret
== NOT_DONE
) {
464 error_propagate(errp
, cco
.err
);
466 error_setg_errno(errp
, -ret
, "Could not create image");
471 g_free(cco
.filename
);
475 int bdrv_create_file(const char* filename
, QEMUOptionParameter
*options
,
479 Error
*local_err
= NULL
;
482 drv
= bdrv_find_protocol(filename
, true);
484 error_setg(errp
, "Could not find protocol for file '%s'", filename
);
488 ret
= bdrv_create(drv
, filename
, options
, &local_err
);
490 error_propagate(errp
, local_err
);
495 int bdrv_refresh_limits(BlockDriverState
*bs
)
497 BlockDriver
*drv
= bs
->drv
;
499 memset(&bs
->bl
, 0, sizeof(bs
->bl
));
505 /* Take some limits from the children as a default */
507 bdrv_refresh_limits(bs
->file
);
508 bs
->bl
.opt_transfer_length
= bs
->file
->bl
.opt_transfer_length
;
509 bs
->bl
.opt_mem_alignment
= bs
->file
->bl
.opt_mem_alignment
;
511 bs
->bl
.opt_mem_alignment
= 512;
514 if (bs
->backing_hd
) {
515 bdrv_refresh_limits(bs
->backing_hd
);
516 bs
->bl
.opt_transfer_length
=
517 MAX(bs
->bl
.opt_transfer_length
,
518 bs
->backing_hd
->bl
.opt_transfer_length
);
519 bs
->bl
.opt_mem_alignment
=
520 MAX(bs
->bl
.opt_mem_alignment
,
521 bs
->backing_hd
->bl
.opt_mem_alignment
);
524 /* Then let the driver override it */
525 if (drv
->bdrv_refresh_limits
) {
526 return drv
->bdrv_refresh_limits(bs
);
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
568 * Detect host devices. By convention, /dev/cdrom[N] is always
569 * recognized as a host CDROM.
571 static BlockDriver
*find_hdev_driver(const char *filename
)
573 int score_max
= 0, score
;
574 BlockDriver
*drv
= NULL
, *d
;
576 QLIST_FOREACH(d
, &bdrv_drivers
, list
) {
577 if (d
->bdrv_probe_device
) {
578 score
= d
->bdrv_probe_device(filename
);
579 if (score
> score_max
) {
589 BlockDriver
*bdrv_find_protocol(const char *filename
,
590 bool allow_protocol_prefix
)
597 /* TODO Drivers without bdrv_file_open must be specified explicitly */
600 * XXX(hch): we really should not let host device detection
601 * override an explicit protocol specification, but moving this
602 * later breaks access to device names with colons in them.
603 * Thanks to the brain-dead persistent naming schemes on udev-
604 * based Linux systems those actually are quite common.
606 drv1
= find_hdev_driver(filename
);
611 if (!path_has_protocol(filename
) || !allow_protocol_prefix
) {
612 return bdrv_find_format("file");
615 p
= strchr(filename
, ':');
618 if (len
> sizeof(protocol
) - 1)
619 len
= sizeof(protocol
) - 1;
620 memcpy(protocol
, filename
, len
);
621 protocol
[len
] = '\0';
622 QLIST_FOREACH(drv1
, &bdrv_drivers
, list
) {
623 if (drv1
->protocol_name
&&
624 !strcmp(drv1
->protocol_name
, protocol
)) {
631 static int find_image_format(BlockDriverState
*bs
, const char *filename
,
632 BlockDriver
**pdrv
, Error
**errp
)
634 int score
, score_max
;
635 BlockDriver
*drv1
, *drv
;
639 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
640 if (bs
->sg
|| !bdrv_is_inserted(bs
) || bdrv_getlength(bs
) == 0) {
641 drv
= bdrv_find_format("raw");
643 error_setg(errp
, "Could not find raw image format");
650 ret
= bdrv_pread(bs
, 0, buf
, sizeof(buf
));
652 error_setg_errno(errp
, -ret
, "Could not read image for determining its "
660 QLIST_FOREACH(drv1
, &bdrv_drivers
, list
) {
661 if (drv1
->bdrv_probe
) {
662 score
= drv1
->bdrv_probe(buf
, ret
, filename
);
663 if (score
> score_max
) {
670 error_setg(errp
, "Could not determine image format: No compatible "
679 * Set the current 'total_sectors' value
681 static int refresh_total_sectors(BlockDriverState
*bs
, int64_t hint
)
683 BlockDriver
*drv
= bs
->drv
;
685 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
689 /* query actual device if possible, otherwise just trust the hint */
690 if (drv
->bdrv_getlength
) {
691 int64_t length
= drv
->bdrv_getlength(bs
);
695 hint
= DIV_ROUND_UP(length
, BDRV_SECTOR_SIZE
);
698 bs
->total_sectors
= hint
;
703 * Set open flags for a given discard mode
705 * Return 0 on success, -1 if the discard mode was invalid.
707 int bdrv_parse_discard_flags(const char *mode
, int *flags
)
709 *flags
&= ~BDRV_O_UNMAP
;
711 if (!strcmp(mode
, "off") || !strcmp(mode
, "ignore")) {
713 } else if (!strcmp(mode
, "on") || !strcmp(mode
, "unmap")) {
714 *flags
|= BDRV_O_UNMAP
;
723 * Set open flags for a given cache mode
725 * Return 0 on success, -1 if the cache mode was invalid.
727 int bdrv_parse_cache_flags(const char *mode
, int *flags
)
729 *flags
&= ~BDRV_O_CACHE_MASK
;
731 if (!strcmp(mode
, "off") || !strcmp(mode
, "none")) {
732 *flags
|= BDRV_O_NOCACHE
| BDRV_O_CACHE_WB
;
733 } else if (!strcmp(mode
, "directsync")) {
734 *flags
|= BDRV_O_NOCACHE
;
735 } else if (!strcmp(mode
, "writeback")) {
736 *flags
|= BDRV_O_CACHE_WB
;
737 } else if (!strcmp(mode
, "unsafe")) {
738 *flags
|= BDRV_O_CACHE_WB
;
739 *flags
|= BDRV_O_NO_FLUSH
;
740 } else if (!strcmp(mode
, "writethrough")) {
741 /* this is the default */
750 * The copy-on-read flag is actually a reference count so multiple users may
751 * use the feature without worrying about clobbering its previous state.
752 * Copy-on-read stays enabled until all users have called to disable it.
754 void bdrv_enable_copy_on_read(BlockDriverState
*bs
)
759 void bdrv_disable_copy_on_read(BlockDriverState
*bs
)
761 assert(bs
->copy_on_read
> 0);
765 static int bdrv_open_flags(BlockDriverState
*bs
, int flags
)
767 int open_flags
= flags
| BDRV_O_CACHE_WB
;
770 * Clear flags that are internal to the block layer before opening the
773 open_flags
&= ~(BDRV_O_SNAPSHOT
| BDRV_O_NO_BACKING
);
776 * Snapshots should be writable.
778 if (bs
->is_temporary
) {
779 open_flags
|= BDRV_O_RDWR
;
785 static int bdrv_assign_node_name(BlockDriverState
*bs
,
786 const char *node_name
,
793 /* empty string node name is invalid */
794 if (node_name
[0] == '\0') {
795 error_setg(errp
, "Empty node name");
799 /* takes care of avoiding namespaces collisions */
800 if (bdrv_find(node_name
)) {
801 error_setg(errp
, "node-name=%s is conflicting with a device id",
806 /* takes care of avoiding duplicates node names */
807 if (bdrv_find_node(node_name
)) {
808 error_setg(errp
, "Duplicate node name");
812 /* copy node name into the bs and insert it into the graph list */
813 pstrcpy(bs
->node_name
, sizeof(bs
->node_name
), node_name
);
814 QTAILQ_INSERT_TAIL(&graph_bdrv_states
, bs
, node_list
);
820 * Common part for opening disk images and files
822 * Removes all processed options from *options.
824 static int bdrv_open_common(BlockDriverState
*bs
, BlockDriverState
*file
,
825 QDict
*options
, int flags
, BlockDriver
*drv
, Error
**errp
)
828 const char *filename
;
829 const char *node_name
= NULL
;
830 Error
*local_err
= NULL
;
833 assert(bs
->file
== NULL
);
834 assert(options
!= NULL
&& bs
->options
!= options
);
837 filename
= file
->filename
;
839 filename
= qdict_get_try_str(options
, "filename");
842 if (drv
->bdrv_needs_filename
&& !filename
) {
843 error_setg(errp
, "The '%s' block driver requires a file name",
848 trace_bdrv_open_common(bs
, filename
?: "", flags
, drv
->format_name
);
850 node_name
= qdict_get_try_str(options
, "node-name");
851 ret
= bdrv_assign_node_name(bs
, node_name
, errp
);
855 qdict_del(options
, "node-name");
857 /* bdrv_open() with directly using a protocol as drv. This layer is already
858 * opened, so assign it to bs (while file becomes a closed BlockDriverState)
859 * and return immediately. */
860 if (file
!= NULL
&& drv
->bdrv_file_open
) {
865 bs
->open_flags
= flags
;
866 bs
->guest_block_size
= 512;
867 bs
->request_alignment
= 512;
868 bs
->zero_beyond_eof
= true;
869 open_flags
= bdrv_open_flags(bs
, flags
);
870 bs
->read_only
= !(open_flags
& BDRV_O_RDWR
);
872 if (use_bdrv_whitelist
&& !bdrv_is_whitelisted(drv
, bs
->read_only
)) {
874 !bs
->read_only
&& bdrv_is_whitelisted(drv
, true)
875 ? "Driver '%s' can only be used for read-only devices"
876 : "Driver '%s' is not whitelisted",
881 assert(bs
->copy_on_read
== 0); /* bdrv_new() and bdrv_close() make it so */
882 if (flags
& BDRV_O_COPY_ON_READ
) {
883 if (!bs
->read_only
) {
884 bdrv_enable_copy_on_read(bs
);
886 error_setg(errp
, "Can't use copy-on-read on read-only device");
891 if (filename
!= NULL
) {
892 pstrcpy(bs
->filename
, sizeof(bs
->filename
), filename
);
894 bs
->filename
[0] = '\0';
898 bs
->opaque
= g_malloc0(drv
->instance_size
);
900 bs
->enable_write_cache
= !!(flags
& BDRV_O_CACHE_WB
);
902 /* Open the image, either directly or using a protocol */
903 if (drv
->bdrv_file_open
) {
904 assert(file
== NULL
);
905 assert(!drv
->bdrv_needs_filename
|| filename
!= NULL
);
906 ret
= drv
->bdrv_file_open(bs
, options
, open_flags
, &local_err
);
909 error_setg(errp
, "Can't use '%s' as a block driver for the "
910 "protocol level", drv
->format_name
);
915 ret
= drv
->bdrv_open(bs
, options
, open_flags
, &local_err
);
920 error_propagate(errp
, local_err
);
921 } else if (bs
->filename
[0]) {
922 error_setg_errno(errp
, -ret
, "Could not open '%s'", bs
->filename
);
924 error_setg_errno(errp
, -ret
, "Could not open image");
929 ret
= refresh_total_sectors(bs
, bs
->total_sectors
);
931 error_setg_errno(errp
, -ret
, "Could not refresh total sector count");
935 bdrv_refresh_limits(bs
);
936 assert(bdrv_opt_mem_align(bs
) != 0);
937 assert(bs
->request_alignment
!= 0);
940 if (bs
->is_temporary
) {
941 assert(bs
->filename
[0] != '\0');
942 unlink(bs
->filename
);
956 * Opens a file using a protocol (file, host_device, nbd, ...)
958 * options is a QDict of options to pass to the block drivers, or NULL for an
959 * empty set of options. The reference to the QDict belongs to the block layer
960 * after the call (even on failure), so if the caller intends to reuse the
961 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
963 static int bdrv_file_open(BlockDriverState
**pbs
, const char *filename
,
964 QDict
*options
, int flags
, Error
**errp
)
966 BlockDriverState
*bs
= NULL
;
969 bool allow_protocol_prefix
= false;
970 Error
*local_err
= NULL
;
973 /* NULL means an empty set of options */
974 if (options
== NULL
) {
975 options
= qdict_new();
979 bs
->options
= options
;
980 options
= qdict_clone_shallow(options
);
982 /* Fetch the file name from the options QDict if necessary */
984 filename
= qdict_get_try_str(options
, "filename");
985 } else if (filename
&& !qdict_haskey(options
, "filename")) {
986 qdict_put(options
, "filename", qstring_from_str(filename
));
987 allow_protocol_prefix
= true;
989 error_setg(errp
, "Can't specify 'file' and 'filename' options at the "
995 /* Find the right block driver */
996 drvname
= qdict_get_try_str(options
, "driver");
998 drv
= bdrv_find_format(drvname
);
1000 error_setg(errp
, "Unknown driver '%s'", drvname
);
1002 qdict_del(options
, "driver");
1003 } else if (filename
) {
1004 drv
= bdrv_find_protocol(filename
, allow_protocol_prefix
);
1006 error_setg(errp
, "Unknown protocol");
1009 error_setg(errp
, "Must specify either driver or file");
1014 /* errp has been set already */
1019 /* Parse the filename and open it */
1020 if (drv
->bdrv_parse_filename
&& filename
) {
1021 drv
->bdrv_parse_filename(filename
, options
, &local_err
);
1023 error_propagate(errp
, local_err
);
1027 qdict_del(options
, "filename");
1030 if (!drv
->bdrv_file_open
) {
1031 ret
= bdrv_open(&bs
, filename
, NULL
, options
, flags
, drv
, &local_err
);
1034 ret
= bdrv_open_common(bs
, NULL
, options
, flags
, drv
, &local_err
);
1037 error_propagate(errp
, local_err
);
1041 /* Check if any unknown options were used */
1042 if (options
&& (qdict_size(options
) != 0)) {
1043 const QDictEntry
*entry
= qdict_first(options
);
1044 error_setg(errp
, "Block protocol '%s' doesn't support the option '%s'",
1045 drv
->format_name
, entry
->key
);
1058 QDECREF(bs
->options
);
1065 * Opens the backing file for a BlockDriverState if not yet open
1067 * options is a QDict of options to pass to the block drivers, or NULL for an
1068 * empty set of options. The reference to the QDict is transferred to this
1069 * function (even on failure), so if the caller intends to reuse the dictionary,
1070 * it needs to use QINCREF() before calling bdrv_file_open.
1072 int bdrv_open_backing_file(BlockDriverState
*bs
, QDict
*options
, Error
**errp
)
1074 char backing_filename
[PATH_MAX
];
1075 int back_flags
, ret
;
1076 BlockDriver
*back_drv
= NULL
;
1077 Error
*local_err
= NULL
;
1079 if (bs
->backing_hd
!= NULL
) {
1084 /* NULL means an empty set of options */
1085 if (options
== NULL
) {
1086 options
= qdict_new();
1089 bs
->open_flags
&= ~BDRV_O_NO_BACKING
;
1090 if (qdict_haskey(options
, "file.filename")) {
1091 backing_filename
[0] = '\0';
1092 } else if (bs
->backing_file
[0] == '\0' && qdict_size(options
) == 0) {
1096 bdrv_get_full_backing_filename(bs
, backing_filename
,
1097 sizeof(backing_filename
));
1100 if (bs
->backing_format
[0] != '\0') {
1101 back_drv
= bdrv_find_format(bs
->backing_format
);
1104 /* backing files always opened read-only */
1105 back_flags
= bs
->open_flags
& ~(BDRV_O_RDWR
| BDRV_O_SNAPSHOT
|
1106 BDRV_O_COPY_ON_READ
);
1108 assert(bs
->backing_hd
== NULL
);
1109 ret
= bdrv_open(&bs
->backing_hd
,
1110 *backing_filename
? backing_filename
: NULL
, NULL
, options
,
1111 back_flags
, back_drv
, &local_err
);
1113 bs
->backing_hd
= NULL
;
1114 bs
->open_flags
|= BDRV_O_NO_BACKING
;
1115 error_setg(errp
, "Could not open backing file: %s",
1116 error_get_pretty(local_err
));
1117 error_free(local_err
);
1121 if (bs
->backing_hd
->file
) {
1122 pstrcpy(bs
->backing_file
, sizeof(bs
->backing_file
),
1123 bs
->backing_hd
->file
->filename
);
1126 /* Recalculate the BlockLimits with the backing file */
1127 bdrv_refresh_limits(bs
);
1133 * Opens a disk image whose options are given as BlockdevRef in another block
1136 * If force_raw is true, bdrv_file_open() will be used, thereby preventing any
1137 * image format auto-detection. If it is false and a filename is given,
1138 * bdrv_open() will be used for auto-detection.
1140 * If allow_none is true, no image will be opened if filename is false and no
1141 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1143 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1144 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1145 * itself, all options starting with "${bdref_key}." are considered part of the
1148 * The BlockdevRef will be removed from the options QDict.
1150 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
1152 int bdrv_open_image(BlockDriverState
**pbs
, const char *filename
,
1153 QDict
*options
, const char *bdref_key
, int flags
,
1154 bool force_raw
, bool allow_none
, Error
**errp
)
1156 QDict
*image_options
;
1158 char *bdref_key_dot
;
1159 const char *reference
;
1162 assert(*pbs
== NULL
);
1164 bdref_key_dot
= g_strdup_printf("%s.", bdref_key
);
1165 qdict_extract_subqdict(options
, &image_options
, bdref_key_dot
);
1166 g_free(bdref_key_dot
);
1168 reference
= qdict_get_try_str(options
, bdref_key
);
1169 if (!filename
&& !reference
&& !qdict_size(image_options
)) {
1173 error_setg(errp
, "A block device must be specified for \"%s\"",
1180 if (filename
&& !force_raw
) {
1181 /* If a filename is given and the block driver should be detected
1182 automatically (instead of using none), use bdrv_open() in order to do
1183 that auto-detection. */
1185 error_setg(errp
, "Cannot reference an existing block device while "
1186 "giving a filename");
1191 ret
= bdrv_open(pbs
, filename
, NULL
, image_options
, flags
, NULL
, errp
);
1193 ret
= bdrv_open(pbs
, filename
, reference
, image_options
,
1194 flags
| BDRV_O_PROTOCOL
, NULL
, errp
);
1198 qdict_del(options
, bdref_key
);
1203 * Opens a disk image (raw, qcow2, vmdk, ...)
1205 * options is a QDict of options to pass to the block drivers, or NULL for an
1206 * empty set of options. The reference to the QDict belongs to the block layer
1207 * after the call (even on failure), so if the caller intends to reuse the
1208 * dictionary, it needs to use QINCREF() before calling bdrv_open.
1210 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1211 * If it is not NULL, the referenced BDS will be reused.
1213 * The reference parameter may be used to specify an existing block device which
1214 * should be opened. If specified, neither options nor a filename may be given,
1215 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
1217 int bdrv_open(BlockDriverState
**pbs
, const char *filename
,
1218 const char *reference
, QDict
*options
, int flags
,
1219 BlockDriver
*drv
, Error
**errp
)
1222 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1223 char tmp_filename
[PATH_MAX
+ 1];
1224 BlockDriverState
*file
= NULL
, *bs
;
1225 const char *drvname
;
1226 Error
*local_err
= NULL
;
1231 bool options_non_empty
= options
? qdict_size(options
) : false;
1235 error_setg(errp
, "Cannot reuse an existing BDS when referencing "
1236 "another block device");
1240 if (filename
|| options_non_empty
) {
1241 error_setg(errp
, "Cannot reference an existing block device with "
1242 "additional options or a new filename");
1246 bs
= bdrv_lookup_bs(reference
, reference
, errp
);
1255 if (flags
& BDRV_O_PROTOCOL
) {
1257 return bdrv_file_open(pbs
, filename
, options
, flags
& ~BDRV_O_PROTOCOL
,
1267 /* NULL means an empty set of options */
1268 if (options
== NULL
) {
1269 options
= qdict_new();
1272 bs
->options
= options
;
1273 options
= qdict_clone_shallow(options
);
1275 /* For snapshot=on, create a temporary qcow2 overlay */
1276 if (flags
& BDRV_O_SNAPSHOT
) {
1277 BlockDriverState
*bs1
;
1279 BlockDriver
*bdrv_qcow2
;
1280 QEMUOptionParameter
*create_options
;
1281 QDict
*snapshot_options
;
1283 /* if snapshot, we create a temporary backing file and open it
1284 instead of opening 'filename' directly */
1286 /* Get the required size from the image */
1289 ret
= bdrv_open(&bs1
, filename
, NULL
, options
, BDRV_O_NO_BACKING
,
1294 total_size
= bdrv_getlength(bs1
) & BDRV_SECTOR_MASK
;
1298 /* Create the temporary image */
1299 ret
= get_tmp_filename(tmp_filename
, sizeof(tmp_filename
));
1301 error_setg_errno(errp
, -ret
, "Could not get temporary filename");
1305 bdrv_qcow2
= bdrv_find_format("qcow2");
1306 create_options
= parse_option_parameters("", bdrv_qcow2
->create_options
,
1309 set_option_parameter_int(create_options
, BLOCK_OPT_SIZE
, total_size
);
1311 ret
= bdrv_create(bdrv_qcow2
, tmp_filename
, create_options
, &local_err
);
1312 free_option_parameters(create_options
);
1314 error_setg_errno(errp
, -ret
, "Could not create temporary overlay "
1315 "'%s': %s", tmp_filename
,
1316 error_get_pretty(local_err
));
1317 error_free(local_err
);
1322 /* Prepare a new options QDict for the temporary file, where user
1323 * options refer to the backing file */
1325 qdict_put(options
, "file.filename", qstring_from_str(filename
));
1328 qdict_put(options
, "driver", qstring_from_str(drv
->format_name
));
1331 snapshot_options
= qdict_new();
1332 qdict_put(snapshot_options
, "backing", options
);
1333 qdict_flatten(snapshot_options
);
1335 bs
->options
= snapshot_options
;
1336 options
= qdict_clone_shallow(bs
->options
);
1338 filename
= tmp_filename
;
1340 bs
->is_temporary
= 1;
1343 /* Open image file without format layer */
1344 if (flags
& BDRV_O_RDWR
) {
1345 flags
|= BDRV_O_ALLOW_RDWR
;
1348 assert(file
== NULL
);
1349 ret
= bdrv_open_image(&file
, filename
, options
, "file",
1350 bdrv_open_flags(bs
, flags
| BDRV_O_UNMAP
), true, true,
1356 /* Find the right image format driver */
1357 drvname
= qdict_get_try_str(options
, "driver");
1359 drv
= bdrv_find_format(drvname
);
1360 qdict_del(options
, "driver");
1362 error_setg(errp
, "Invalid driver: '%s'", drvname
);
1364 goto unlink_and_fail
;
1370 ret
= find_image_format(file
, filename
, &drv
, &local_err
);
1372 error_setg(errp
, "Must specify either driver or file");
1374 goto unlink_and_fail
;
1379 goto unlink_and_fail
;
1382 /* Open the image */
1383 ret
= bdrv_open_common(bs
, file
, options
, flags
, drv
, &local_err
);
1385 goto unlink_and_fail
;
1388 if (file
&& (bs
->file
!= file
)) {
1393 /* If there is a backing file, use it */
1394 if ((flags
& BDRV_O_NO_BACKING
) == 0) {
1395 QDict
*backing_options
;
1397 qdict_extract_subqdict(options
, &backing_options
, "backing.");
1398 ret
= bdrv_open_backing_file(bs
, backing_options
, &local_err
);
1400 goto close_and_fail
;
1404 /* Check if any unknown options were used */
1405 if (qdict_size(options
) != 0) {
1406 const QDictEntry
*entry
= qdict_first(options
);
1407 error_setg(errp
, "Block format '%s' used by device '%s' doesn't "
1408 "support the option '%s'", drv
->format_name
, bs
->device_name
,
1412 goto close_and_fail
;
1416 if (!bdrv_key_required(bs
)) {
1417 bdrv_dev_change_media_cb(bs
, true);
1427 if (bs
->is_temporary
) {
1431 QDECREF(bs
->options
);
1435 /* If *pbs is NULL, a new BDS has been created in this function and
1436 needs to be freed now. Otherwise, it does not need to be closed,
1437 since it has not really been opened yet. */
1441 error_propagate(errp
, local_err
);
1446 /* See fail path, but now the BDS has to be always closed */
1454 error_propagate(errp
, local_err
);
/*
 * One element of a BlockReopenQueue: carries the staged BDRVReopenState
 * for a single BlockDriverState taking part in an atomic, transactional
 * reopen (built by bdrv_reopen_queue(), consumed by
 * bdrv_reopen_multiple()).
 * NOTE(review): this extraction elides some original lines; the
 * 'prepared' flag set by bdrv_reopen_multiple() below is not visible in
 * this struct -- confirm against the full source.
 */
1459 typedef struct BlockReopenQueueEntry
{
1461 BDRVReopenState state
;
1462 QSIMPLEQ_ENTRY(BlockReopenQueueEntry
) entry
;
1463 } BlockReopenQueueEntry
;
1466 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1467 * reopen of multiple devices.
1469 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1470 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1471 * be created and initialized. This newly created BlockReopenQueue should be
1472 * passed back in for subsequent calls that are intended to be of the same
1475 * bs is the BlockDriverState to add to the reopen queue.
1477 * flags contains the open flags for the associated bs
1479 * returns a pointer to bs_queue, which is either the newly allocated
1480 * bs_queue, or the existing bs_queue being used.
1483 BlockReopenQueue
*bdrv_reopen_queue(BlockReopenQueue
*bs_queue
,
1484 BlockDriverState
*bs
, int flags
)
1488 BlockReopenQueueEntry
*bs_entry
;
1489 if (bs_queue
== NULL
) {
1490 bs_queue
= g_new0(BlockReopenQueue
, 1);
1491 QSIMPLEQ_INIT(bs_queue
);
1495 bdrv_reopen_queue(bs_queue
, bs
->file
, flags
);
1498 bs_entry
= g_new0(BlockReopenQueueEntry
, 1);
1499 QSIMPLEQ_INSERT_TAIL(bs_queue
, bs_entry
, entry
);
1501 bs_entry
->state
.bs
= bs
;
1502 bs_entry
->state
.flags
= flags
;
1508 * Reopen multiple BlockDriverStates atomically & transactionally.
1510 * The queue passed in (bs_queue) must have been built up previous
1511 * via bdrv_reopen_queue().
1513 * Reopens all BDS specified in the queue, with the appropriate
1514 * flags. All devices are prepared for reopen, and failure of any
1515 * device will cause all device changes to be abandonded, and intermediate
1518 * If all devices prepare successfully, then the changes are committed
1522 int bdrv_reopen_multiple(BlockReopenQueue
*bs_queue
, Error
**errp
)
1525 BlockReopenQueueEntry
*bs_entry
, *next
;
1526 Error
*local_err
= NULL
;
1528 assert(bs_queue
!= NULL
);
1532 QSIMPLEQ_FOREACH(bs_entry
, bs_queue
, entry
) {
1533 if (bdrv_reopen_prepare(&bs_entry
->state
, bs_queue
, &local_err
)) {
1534 error_propagate(errp
, local_err
);
1537 bs_entry
->prepared
= true;
1540 /* If we reach this point, we have success and just need to apply the
1543 QSIMPLEQ_FOREACH(bs_entry
, bs_queue
, entry
) {
1544 bdrv_reopen_commit(&bs_entry
->state
);
1550 QSIMPLEQ_FOREACH_SAFE(bs_entry
, bs_queue
, entry
, next
) {
1551 if (ret
&& bs_entry
->prepared
) {
1552 bdrv_reopen_abort(&bs_entry
->state
);
1561 /* Reopen a single BlockDriverState with the specified flags. */
/*
 * Reopen a single BlockDriverState with the given flags: build a
 * one-element reopen queue (bdrv_reopen_queue() also queues bs->file
 * recursively) and run it through bdrv_reopen_multiple().  Any error
 * from the multiple-reopen pass is propagated to @errp.
 * NOTE(review): the opening brace, the declaration of 'ret' and the
 * function tail are elided in this extraction.
 */
1562 int bdrv_reopen(BlockDriverState
*bs
, int bdrv_flags
, Error
**errp
)
1565 Error
*local_err
= NULL
;
1566 BlockReopenQueue
*queue
= bdrv_reopen_queue(NULL
, bs
, bdrv_flags
);
1568 ret
= bdrv_reopen_multiple(queue
, &local_err
);
1569 if (local_err
!= NULL
) {
1570 error_propagate(errp
, local_err
);
1577 * Prepares a BlockDriverState for reopen. All changes are staged in the
1578 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1579 * the block driver layer .bdrv_reopen_prepare()
1581 * bs is the BlockDriverState to reopen
1582 * flags are the new open flags
1583 * queue is the reopen queue
1585 * Returns 0 on success, non-zero on error. On error errp will be set
1588 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1589 * It is the responsibility of the caller to then call the abort() or
1590 * commit() for any other BDS that have been left in a prepare() state
1593 int bdrv_reopen_prepare(BDRVReopenState
*reopen_state
, BlockReopenQueue
*queue
,
1597 Error
*local_err
= NULL
;
1600 assert(reopen_state
!= NULL
);
1601 assert(reopen_state
->bs
->drv
!= NULL
);
1602 drv
= reopen_state
->bs
->drv
;
1604 /* if we are to stay read-only, do not allow permission change
1606 if (!(reopen_state
->bs
->open_flags
& BDRV_O_ALLOW_RDWR
) &&
1607 reopen_state
->flags
& BDRV_O_RDWR
) {
1608 error_set(errp
, QERR_DEVICE_IS_READ_ONLY
,
1609 reopen_state
->bs
->device_name
);
1614 ret
= bdrv_flush(reopen_state
->bs
);
1616 error_set(errp
, ERROR_CLASS_GENERIC_ERROR
, "Error (%s) flushing drive",
1621 if (drv
->bdrv_reopen_prepare
) {
1622 ret
= drv
->bdrv_reopen_prepare(reopen_state
, queue
, &local_err
);
1624 if (local_err
!= NULL
) {
1625 error_propagate(errp
, local_err
);
1627 error_setg(errp
, "failed while preparing to reopen image '%s'",
1628 reopen_state
->bs
->filename
);
1633 /* It is currently mandatory to have a bdrv_reopen_prepare()
1634 * handler for each supported drv. */
1635 error_set(errp
, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED
,
1636 drv
->format_name
, reopen_state
->bs
->device_name
,
1637 "reopening of file");
1649 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1650 * makes them final by swapping the staging BlockDriverState contents into
1651 * the active BlockDriverState contents.
1653 void bdrv_reopen_commit(BDRVReopenState
*reopen_state
)
1657 assert(reopen_state
!= NULL
);
1658 drv
= reopen_state
->bs
->drv
;
1659 assert(drv
!= NULL
);
1661 /* If there are any driver level actions to take */
1662 if (drv
->bdrv_reopen_commit
) {
1663 drv
->bdrv_reopen_commit(reopen_state
);
1666 /* set BDS specific flags now */
1667 reopen_state
->bs
->open_flags
= reopen_state
->flags
;
1668 reopen_state
->bs
->enable_write_cache
= !!(reopen_state
->flags
&
1670 reopen_state
->bs
->read_only
= !(reopen_state
->flags
& BDRV_O_RDWR
);
1672 bdrv_refresh_limits(reopen_state
->bs
);
1676 * Abort the reopen, and delete and free the staged changes in
/*
 * Undo a prepared reopen: gives the block driver a chance to delete and
 * free any staged changes recorded in @reopen_state by its
 * .bdrv_reopen_prepare() handler.
 */
1679 void bdrv_reopen_abort(BDRVReopenState
*reopen_state
)
1683 assert(reopen_state
!= NULL
);
/* the BDS being reopened must still have a driver attached */
1684 drv
= reopen_state
->bs
->drv
;
1685 assert(drv
!= NULL
);
/* the abort hook is optional; call it only when the driver provides one */
1687 if (drv
->bdrv_reopen_abort
) {
1688 drv
->bdrv_reopen_abort(reopen_state
);
1693 void bdrv_close(BlockDriverState
*bs
)
1696 block_job_cancel_sync(bs
->job
);
1698 bdrv_drain_all(); /* complete I/O */
1700 bdrv_drain_all(); /* in case flush left pending I/O */
1701 notifier_list_notify(&bs
->close_notifiers
, bs
);
1704 if (bs
->backing_hd
) {
1705 bdrv_unref(bs
->backing_hd
);
1706 bs
->backing_hd
= NULL
;
1708 bs
->drv
->bdrv_close(bs
);
1711 if (bs
->is_temporary
) {
1712 unlink(bs
->filename
);
1717 bs
->copy_on_read
= 0;
1718 bs
->backing_file
[0] = '\0';
1719 bs
->backing_format
[0] = '\0';
1720 bs
->total_sectors
= 0;
1725 bs
->zero_beyond_eof
= false;
1726 QDECREF(bs
->options
);
1729 if (bs
->file
!= NULL
) {
1730 bdrv_unref(bs
->file
);
1735 bdrv_dev_change_media_cb(bs
, false);
1737 /*throttling disk I/O limits*/
1738 if (bs
->io_limits_enabled
) {
1739 bdrv_io_limits_disable(bs
);
1743 void bdrv_close_all(void)
1745 BlockDriverState
*bs
;
1747 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
1752 /* Check if any requests are in-flight (including throttled requests) */
1753 static bool bdrv_requests_pending(BlockDriverState
*bs
)
/* any request still on this BDS's tracked-requests list? */
1755 if (!QLIST_EMPTY(&bs
->tracked_requests
)) {
/* either of the two throttled-request coroutine queues non-empty? */
1758 if (!qemu_co_queue_empty(&bs
->throttled_reqs
[0])) {
1761 if (!qemu_co_queue_empty(&bs
->throttled_reqs
[1])) {
/* recurse into the protocol layer and the backing file chain */
1764 if (bs
->file
&& bdrv_requests_pending(bs
->file
)) {
1767 if (bs
->backing_hd
&& bdrv_requests_pending(bs
->backing_hd
)) {
/*
 * Walk every registered BlockDriverState on the global bdrv_states list
 * and report whether any of them still has requests in flight
 * (delegates to bdrv_requests_pending() per device).
 * NOTE(review): the early-return and final return lines are elided in
 * this extraction.
 */
1773 static bool bdrv_requests_pending_all(void)
1775 BlockDriverState
*bs
;
1776 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
1777 if (bdrv_requests_pending(bs
)) {
1785 * Wait for pending requests to complete across all BlockDriverStates
1787 * This function does not flush data to disk, use bdrv_flush_all() for that
1788 * after calling this function.
1790 * Note that completion of an asynchronous I/O operation can trigger any
1791 * number of other I/O operations on other devices---for example a coroutine
1792 * can be arbitrarily complex and a constant flow of I/O can come until the
1793 * coroutine is complete. Because of this, it is not possible to have a
1794 * function to drain a single device's I/O queue.
1796 void bdrv_drain_all(void)
1798 /* Always run first iteration so any pending completion BHs run */
1800 BlockDriverState
*bs
;
1803 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
1804 bdrv_start_throttled_reqs(bs
);
1807 busy
= bdrv_requests_pending_all();
1808 busy
|= aio_poll(qemu_get_aio_context(), busy
);
1812 /* make a BlockDriverState anonymous by removing from bdrv_state and
1813 * graph_bdrv_state list.
1814 Also, NULL terminate the device_name to prevent double remove */
1815 void bdrv_make_anon(BlockDriverState
*bs
)
/* only on the device list if it carries a non-empty device name */
1817 if (bs
->device_name
[0] != '\0') {
1818 QTAILQ_REMOVE(&bdrv_states
, bs
, device_list
);
/* clearing the name guards against a second removal */
1820 bs
->device_name
[0] = '\0';
/* same pattern for the named-node graph list */
1821 if (bs
->node_name
[0] != '\0') {
1822 QTAILQ_REMOVE(&graph_bdrv_states
, bs
, node_list
);
1824 bs
->node_name
[0] = '\0';
/*
 * Let the format driver refresh any internal back-references to @bs via
 * its optional .bdrv_rebind hook; invoked after bdrv_swap() has
 * exchanged the contents of two BDS structures.
 */
1827 static void bdrv_rebind(BlockDriverState
*bs
)
1829 if (bs
->drv
&& bs
->drv
->bdrv_rebind
) {
1830 bs
->drv
->bdrv_rebind(bs
);
1834 static void bdrv_move_feature_fields(BlockDriverState
*bs_dest
,
1835 BlockDriverState
*bs_src
)
1837 /* move some fields that need to stay attached to the device */
1838 bs_dest
->open_flags
= bs_src
->open_flags
;
1841 bs_dest
->dev_ops
= bs_src
->dev_ops
;
1842 bs_dest
->dev_opaque
= bs_src
->dev_opaque
;
1843 bs_dest
->dev
= bs_src
->dev
;
1844 bs_dest
->guest_block_size
= bs_src
->guest_block_size
;
1845 bs_dest
->copy_on_read
= bs_src
->copy_on_read
;
1847 bs_dest
->enable_write_cache
= bs_src
->enable_write_cache
;
1849 /* i/o throttled req */
1850 memcpy(&bs_dest
->throttle_state
,
1851 &bs_src
->throttle_state
,
1852 sizeof(ThrottleState
));
1853 bs_dest
->throttled_reqs
[0] = bs_src
->throttled_reqs
[0];
1854 bs_dest
->throttled_reqs
[1] = bs_src
->throttled_reqs
[1];
1855 bs_dest
->io_limits_enabled
= bs_src
->io_limits_enabled
;
1858 bs_dest
->on_read_error
= bs_src
->on_read_error
;
1859 bs_dest
->on_write_error
= bs_src
->on_write_error
;
1862 bs_dest
->iostatus_enabled
= bs_src
->iostatus_enabled
;
1863 bs_dest
->iostatus
= bs_src
->iostatus
;
1866 bs_dest
->dirty_bitmaps
= bs_src
->dirty_bitmaps
;
1868 /* reference count */
1869 bs_dest
->refcnt
= bs_src
->refcnt
;
1872 bs_dest
->in_use
= bs_src
->in_use
;
1873 bs_dest
->job
= bs_src
->job
;
1875 /* keep the same entry in bdrv_states */
1876 pstrcpy(bs_dest
->device_name
, sizeof(bs_dest
->device_name
),
1877 bs_src
->device_name
);
1878 bs_dest
->device_list
= bs_src
->device_list
;
1880 /* keep the same entry in graph_bdrv_states
1881 * We do want to swap name but don't want to swap linked list entries
1883 bs_dest
->node_list
= bs_src
->node_list
;
1887 * Swap bs contents for two image chains while they are live,
1888 * while keeping required fields on the BlockDriverState that is
1889 * actually attached to a device.
1891 * This will modify the BlockDriverState fields, and swap contents
1892 * between bs_new and bs_old. Both bs_new and bs_old are modified.
1894 * bs_new is required to be anonymous.
1896 * This function does not create any image files.
1898 void bdrv_swap(BlockDriverState
*bs_new
, BlockDriverState
*bs_old
)
1900 BlockDriverState tmp
;
1902 /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1903 assert(bs_new
->device_name
[0] == '\0');
1904 assert(QLIST_EMPTY(&bs_new
->dirty_bitmaps
));
1905 assert(bs_new
->job
== NULL
);
1906 assert(bs_new
->dev
== NULL
);
1907 assert(bs_new
->in_use
== 0);
1908 assert(bs_new
->io_limits_enabled
== false);
1909 assert(!throttle_have_timer(&bs_new
->throttle_state
));
1915 /* there are some fields that should not be swapped, move them back */
1916 bdrv_move_feature_fields(&tmp
, bs_old
);
1917 bdrv_move_feature_fields(bs_old
, bs_new
);
1918 bdrv_move_feature_fields(bs_new
, &tmp
);
1920 /* bs_new shouldn't be in bdrv_states even after the swap! */
1921 assert(bs_new
->device_name
[0] == '\0');
1923 /* Check a few fields that should remain attached to the device */
1924 assert(bs_new
->dev
== NULL
);
1925 assert(bs_new
->job
== NULL
);
1926 assert(bs_new
->in_use
== 0);
1927 assert(bs_new
->io_limits_enabled
== false);
1928 assert(!throttle_have_timer(&bs_new
->throttle_state
));
1930 bdrv_rebind(bs_new
);
1931 bdrv_rebind(bs_old
);
1935 * Add new bs contents at the top of an image chain while the chain is
1936 * live, while keeping required fields on the top layer.
1938 * This will modify the BlockDriverState fields, and swap contents
1939 * between bs_new and bs_top. Both bs_new and bs_top are modified.
1941 * bs_new is required to be anonymous.
1943 * This function does not create any image files.
1945 void bdrv_append(BlockDriverState
*bs_new
, BlockDriverState
*bs_top
)
1947 bdrv_swap(bs_new
, bs_top
);
1949 /* The contents of 'tmp' will become bs_top, as we are
1950 * swapping bs_new and bs_top contents. */
1951 bs_top
->backing_hd
= bs_new
;
1952 bs_top
->open_flags
&= ~BDRV_O_NO_BACKING
;
1953 pstrcpy(bs_top
->backing_file
, sizeof(bs_top
->backing_file
),
1955 pstrcpy(bs_top
->backing_format
, sizeof(bs_top
->backing_format
),
1956 bs_new
->drv
? bs_new
->drv
->format_name
: "");
1959 static void bdrv_delete(BlockDriverState
*bs
)
1963 assert(!bs
->in_use
);
1964 assert(!bs
->refcnt
);
1965 assert(QLIST_EMPTY(&bs
->dirty_bitmaps
));
1969 /* remove from list, if necessary */
1975 int bdrv_attach_dev(BlockDriverState
*bs
, void *dev
)
1976 /* TODO change to DeviceState *dev when all users are qdevified */
1982 bdrv_iostatus_reset(bs
);
1986 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1987 void bdrv_attach_dev_nofail(BlockDriverState
*bs
, void *dev
)
1989 if (bdrv_attach_dev(bs
, dev
) < 0) {
1994 void bdrv_detach_dev(BlockDriverState
*bs
, void *dev
)
1995 /* TODO change to DeviceState *dev when all users are qdevified */
1997 assert(bs
->dev
== dev
);
2000 bs
->dev_opaque
= NULL
;
2001 bs
->guest_block_size
= 512;
/* Return the guest device attached to @bs (opaque pointer).
 * NOTE(review): the body/return statement is elided in this extraction. */
2004 /* TODO change to return DeviceState * when all users are qdevified */
2005 void *bdrv_get_attached_dev(BlockDriverState
*bs
)
2010 void bdrv_set_dev_ops(BlockDriverState
*bs
, const BlockDevOps
*ops
,
2014 bs
->dev_opaque
= opaque
;
2017 void bdrv_emit_qmp_error_event(const BlockDriverState
*bdrv
,
2018 enum MonitorEvent ev
,
2019 BlockErrorAction action
, bool is_read
)
2022 const char *action_str
;
2025 case BDRV_ACTION_REPORT
:
2026 action_str
= "report";
2028 case BDRV_ACTION_IGNORE
:
2029 action_str
= "ignore";
2031 case BDRV_ACTION_STOP
:
2032 action_str
= "stop";
2038 data
= qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2041 is_read
? "read" : "write");
2042 monitor_protocol_event(ev
, data
);
2044 qobject_decref(data
);
/*
 * Emit a QMP DEVICE_TRAY_MOVED monitor event for @bs, reporting the new
 * tray state in 'tray-open'.  The JSON payload is built with
 * qobject_from_jsonf() and released after the event is sent.
 */
2047 static void bdrv_emit_qmp_eject_event(BlockDriverState
*bs
, bool ejected
)
2051 data
= qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2052 bdrv_get_device_name(bs
), ejected
);
2053 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED
, data
);
/* monitor_protocol_event() does not take ownership; drop our ref */
2055 qobject_decref(data
);
/*
 * Notify the attached guest device that the medium changed (@load true
 * on insert), and emit tray-moved QMP events around the callback.
 * NOTE(review): several original lines are elided here -- the exact
 * conditions guarding the two eject-event emissions (notably the one at
 * original line 2069) cannot be confirmed from this extraction.
 */
2058 static void bdrv_dev_change_media_cb(BlockDriverState
*bs
, bool load
)
2060 if (bs
->dev_ops
&& bs
->dev_ops
->change_media_cb
) {
/* snapshot tray state so we can detect a callback-induced change */
2061 bool tray_was_closed
= !bdrv_dev_is_tray_open(bs
);
2062 bs
->dev_ops
->change_media_cb(bs
->dev_opaque
, load
);
2063 if (tray_was_closed
) {
2065 bdrv_emit_qmp_eject_event(bs
, true);
2069 bdrv_emit_qmp_eject_event(bs
, false);
/*
 * A BDS counts as having removable media when no guest device is
 * attached at all, or the attached device implements change_media_cb.
 */
2074 bool bdrv_dev_has_removable_media(BlockDriverState
*bs
)
2076 return !bs
->dev
|| (bs
->dev_ops
&& bs
->dev_ops
->change_media_cb
);
/*
 * Forward an eject request to the attached guest device, if it
 * implements the optional eject_request_cb hook; @force is passed
 * through unchanged.
 */
2079 void bdrv_dev_eject_request(BlockDriverState
*bs
, bool force
)
2081 if (bs
->dev_ops
&& bs
->dev_ops
->eject_request_cb
) {
2082 bs
->dev_ops
->eject_request_cb(bs
->dev_opaque
, force
);
/*
 * Query the attached device's tray state via its optional is_tray_open
 * hook.
 * NOTE(review): the fallback return for devices without the hook is
 * elided in this extraction.
 */
2086 bool bdrv_dev_is_tray_open(BlockDriverState
*bs
)
2088 if (bs
->dev_ops
&& bs
->dev_ops
->is_tray_open
) {
2089 return bs
->dev_ops
->is_tray_open(bs
->dev_opaque
);
/*
 * Notify the attached guest device that the image was resized, via its
 * optional resize_cb hook; a no-op when the hook is absent.
 */
2094 static void bdrv_dev_resize_cb(BlockDriverState
*bs
)
2096 if (bs
->dev_ops
&& bs
->dev_ops
->resize_cb
) {
2097 bs
->dev_ops
->resize_cb(bs
->dev_opaque
);
/*
 * Ask the attached device whether the medium is locked, via its
 * optional is_medium_locked hook.
 * NOTE(review): the fallback return for devices without the hook is
 * elided in this extraction.
 */
2101 bool bdrv_dev_is_medium_locked(BlockDriverState
*bs
)
2103 if (bs
->dev_ops
&& bs
->dev_ops
->is_medium_locked
) {
2104 return bs
->dev_ops
->is_medium_locked(bs
->dev_opaque
);
2110 * Run consistency checks on an image
2112 * Returns 0 if the check could be completed (it doesn't mean that the image is
2113 * free of errors) or -errno when an internal error occurred. The results of the
2114 * check are stored in res.
2116 int bdrv_check(BlockDriverState
*bs
, BdrvCheckResult
*res
, BdrvCheckMode fix
)
/* drivers without a check callback cannot be checked
 * (error return inside this branch is elided in this extraction) */
2118 if (bs
->drv
->bdrv_check
== NULL
) {
/* start from a zeroed result so the driver only fills in findings */
2122 memset(res
, 0, sizeof(*res
));
2123 return bs
->drv
->bdrv_check(bs
, res
, fix
);
2126 #define COMMIT_BUF_SECTORS 2048
2128 /* commit COW file into the raw image */
2129 int bdrv_commit(BlockDriverState
*bs
)
2131 BlockDriver
*drv
= bs
->drv
;
2132 int64_t sector
, total_sectors
, length
, backing_length
;
2133 int n
, ro
, open_flags
;
2135 uint8_t *buf
= NULL
;
2136 char filename
[PATH_MAX
];
2141 if (!bs
->backing_hd
) {
2145 if (bdrv_in_use(bs
) || bdrv_in_use(bs
->backing_hd
)) {
2149 ro
= bs
->backing_hd
->read_only
;
2150 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2151 pstrcpy(filename
, sizeof(filename
), bs
->backing_hd
->filename
);
2152 open_flags
= bs
->backing_hd
->open_flags
;
2155 if (bdrv_reopen(bs
->backing_hd
, open_flags
| BDRV_O_RDWR
, NULL
)) {
2160 length
= bdrv_getlength(bs
);
2166 backing_length
= bdrv_getlength(bs
->backing_hd
);
2167 if (backing_length
< 0) {
2168 ret
= backing_length
;
2172 /* If our top snapshot is larger than the backing file image,
2173 * grow the backing file image if possible. If not possible,
2174 * we must return an error */
2175 if (length
> backing_length
) {
2176 ret
= bdrv_truncate(bs
->backing_hd
, length
);
2182 total_sectors
= length
>> BDRV_SECTOR_BITS
;
2183 buf
= g_malloc(COMMIT_BUF_SECTORS
* BDRV_SECTOR_SIZE
);
2185 for (sector
= 0; sector
< total_sectors
; sector
+= n
) {
2186 ret
= bdrv_is_allocated(bs
, sector
, COMMIT_BUF_SECTORS
, &n
);
2191 ret
= bdrv_read(bs
, sector
, buf
, n
);
2196 ret
= bdrv_write(bs
->backing_hd
, sector
, buf
, n
);
2203 if (drv
->bdrv_make_empty
) {
2204 ret
= drv
->bdrv_make_empty(bs
);
2212 * Make sure all data we wrote to the backing device is actually
2215 if (bs
->backing_hd
) {
2216 bdrv_flush(bs
->backing_hd
);
2224 /* ignoring error return here */
2225 bdrv_reopen(bs
->backing_hd
, open_flags
& ~BDRV_O_RDWR
, NULL
);
2231 int bdrv_commit_all(void)
2233 BlockDriverState
*bs
;
2235 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
2236 if (bs
->drv
&& bs
->backing_hd
) {
2237 int ret
= bdrv_commit(bs
);
2247 * Remove an active request from the tracked requests list
2249 * This function should be called when a tracked request is completing.
2251 static void tracked_request_end(BdrvTrackedRequest
*req
)
/* balance the counter incremented by mark_request_serialising() */
2253 if (req
->serialising
) {
2254 req
->bs
->serialising_in_flight
--;
2257 QLIST_REMOVE(req
, list
);
/* wake any coroutines parked in wait_serialising_requests() */
2258 qemu_co_queue_restart_all(&req
->wait_queue
);
2262 * Add an active request to the tracked requests list
2264 static void tracked_request_begin(BdrvTrackedRequest
*req
,
2265 BlockDriverState
*bs
,
2267 unsigned int bytes
, bool is_write
)
2269 *req
= (BdrvTrackedRequest
){
2273 .is_write
= is_write
,
2274 .co
= qemu_coroutine_self(),
2275 .serialising
= false,
2276 .overlap_offset
= offset
,
2277 .overlap_bytes
= bytes
,
2280 qemu_co_queue_init(&req
->wait_queue
);
2282 QLIST_INSERT_HEAD(&bs
->tracked_requests
, req
, list
);
2285 static void mark_request_serialising(BdrvTrackedRequest
*req
, uint64_t align
)
2287 int64_t overlap_offset
= req
->offset
& ~(align
- 1);
2288 unsigned int overlap_bytes
= ROUND_UP(req
->offset
+ req
->bytes
, align
)
2291 if (!req
->serialising
) {
2292 req
->bs
->serialising_in_flight
++;
2293 req
->serialising
= true;
2296 req
->overlap_offset
= MIN(req
->overlap_offset
, overlap_offset
);
2297 req
->overlap_bytes
= MAX(req
->overlap_bytes
, overlap_bytes
);
2301 * Round a region to cluster boundaries
2303 void bdrv_round_to_clusters(BlockDriverState
*bs
,
2304 int64_t sector_num
, int nb_sectors
,
2305 int64_t *cluster_sector_num
,
2306 int *cluster_nb_sectors
)
2308 BlockDriverInfo bdi
;
2310 if (bdrv_get_info(bs
, &bdi
) < 0 || bdi
.cluster_size
== 0) {
2311 *cluster_sector_num
= sector_num
;
2312 *cluster_nb_sectors
= nb_sectors
;
2314 int64_t c
= bdi
.cluster_size
/ BDRV_SECTOR_SIZE
;
2315 *cluster_sector_num
= QEMU_ALIGN_DOWN(sector_num
, c
);
2316 *cluster_nb_sectors
= QEMU_ALIGN_UP(sector_num
- *cluster_sector_num
+
/*
 * Return the image's cluster size in bytes as reported by
 * bdrv_get_info(), falling back to the request alignment when the query
 * fails or the driver reports a zero cluster size.
 */
2321 static int bdrv_get_cluster_size(BlockDriverState
*bs
)
2323 BlockDriverInfo bdi
;
2326 ret
= bdrv_get_info(bs
, &bdi
);
2327 if (ret
< 0 || bdi
.cluster_size
== 0) {
2328 return bs
->request_alignment
;
2330 return bdi
.cluster_size
;
/*
 * Test whether the byte range [offset, offset + bytes) intersects the
 * request's overlap range.  The two guards check disjointness on either
 * side; their early returns (and the final "overlaps" return) are
 * elided in this extraction.
 */
2334 static bool tracked_request_overlaps(BdrvTrackedRequest
*req
,
2335 int64_t offset
, unsigned int bytes
)
/* query range starts at or beyond the end of the request's range */
2338 if (offset
>= req
->overlap_offset
+ req
->overlap_bytes
) {
/* request's range starts at or beyond the end of the query range */
2342 if (req
->overlap_offset
>= offset
+ bytes
) {
2348 static bool coroutine_fn
wait_serialising_requests(BdrvTrackedRequest
*self
)
2350 BlockDriverState
*bs
= self
->bs
;
2351 BdrvTrackedRequest
*req
;
2353 bool waited
= false;
2355 if (!bs
->serialising_in_flight
) {
2361 QLIST_FOREACH(req
, &bs
->tracked_requests
, list
) {
2362 if (req
== self
|| (!req
->serialising
&& !self
->serialising
)) {
2365 if (tracked_request_overlaps(req
, self
->overlap_offset
,
2366 self
->overlap_bytes
))
2368 /* Hitting this means there was a reentrant request, for
2369 * example, a block driver issuing nested requests. This must
2370 * never happen since it means deadlock.
2372 assert(qemu_coroutine_self() != req
->co
);
2374 /* If the request is already (indirectly) waiting for us, or
2375 * will wait for us as soon as it wakes up, then just go on
2376 * (instead of producing a deadlock in the former case). */
2377 if (!req
->waiting_for
) {
2378 self
->waiting_for
= req
;
2379 qemu_co_queue_wait(&req
->wait_queue
);
2380 self
->waiting_for
= NULL
;
2395 * -EINVAL - backing format specified, but no file
2396 * -ENOSPC - can't update the backing file because no space is left in the
2398 * -ENOTSUP - format driver doesn't support changing the backing file
2400 int bdrv_change_backing_file(BlockDriverState
*bs
,
2401 const char *backing_file
, const char *backing_fmt
)
2403 BlockDriver
*drv
= bs
->drv
;
2406 /* Backing file format doesn't make sense without a backing file */
2407 if (backing_fmt
&& !backing_file
) {
2411 if (drv
->bdrv_change_backing_file
!= NULL
) {
2412 ret
= drv
->bdrv_change_backing_file(bs
, backing_file
, backing_fmt
);
2418 pstrcpy(bs
->backing_file
, sizeof(bs
->backing_file
), backing_file
?: "");
2419 pstrcpy(bs
->backing_format
, sizeof(bs
->backing_format
), backing_fmt
?: "");
2425 * Finds the image layer in the chain that has 'bs' as its backing file.
2427 * active is the current topmost image.
2429 * Returns NULL if bs is not found in active's image chain,
2430 * or if active == bs.
2432 BlockDriverState
*bdrv_find_overlay(BlockDriverState
*active
,
2433 BlockDriverState
*bs
)
2435 BlockDriverState
*overlay
= NULL
;
2436 BlockDriverState
*intermediate
;
2438 assert(active
!= NULL
);
2441 /* if bs is the same as active, then by definition it has no overlay
2447 intermediate
= active
;
2448 while (intermediate
->backing_hd
) {
2449 if (intermediate
->backing_hd
== bs
) {
2450 overlay
= intermediate
;
2453 intermediate
= intermediate
->backing_hd
;
/*
 * Queue element used by bdrv_drop_intermediate() to collect the
 * BlockDriverStates between 'top' and 'base' that are to be dropped.
 */
2459 typedef struct BlkIntermediateStates
{
2460 BlockDriverState
*bs
;
2461 QSIMPLEQ_ENTRY(BlkIntermediateStates
) entry
;
2462 } BlkIntermediateStates
;
2466 * Drops images above 'base' up to and including 'top', and sets the image
2467 * above 'top' to have base as its backing file.
2469 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2470 * information in 'bs' can be properly updated.
2472 * E.g., this will convert the following chain:
2473 * bottom <- base <- intermediate <- top <- active
2477 * bottom <- base <- active
2479 * It is allowed for bottom==base, in which case it converts:
2481 * base <- intermediate <- top <- active
2488 * if active == top, that is considered an error
2491 int bdrv_drop_intermediate(BlockDriverState
*active
, BlockDriverState
*top
,
2492 BlockDriverState
*base
)
2494 BlockDriverState
*intermediate
;
2495 BlockDriverState
*base_bs
= NULL
;
2496 BlockDriverState
*new_top_bs
= NULL
;
2497 BlkIntermediateStates
*intermediate_state
, *next
;
2500 QSIMPLEQ_HEAD(states_to_delete
, BlkIntermediateStates
) states_to_delete
;
2501 QSIMPLEQ_INIT(&states_to_delete
);
2503 if (!top
->drv
|| !base
->drv
) {
2507 new_top_bs
= bdrv_find_overlay(active
, top
);
2509 if (new_top_bs
== NULL
) {
2510 /* we could not find the image above 'top', this is an error */
2514 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2515 * to do, no intermediate images */
2516 if (new_top_bs
->backing_hd
== base
) {
2523 /* now we will go down through the list, and add each BDS we find
2524 * into our deletion queue, until we hit the 'base'
2526 while (intermediate
) {
2527 intermediate_state
= g_malloc0(sizeof(BlkIntermediateStates
));
2528 intermediate_state
->bs
= intermediate
;
2529 QSIMPLEQ_INSERT_TAIL(&states_to_delete
, intermediate_state
, entry
);
2531 if (intermediate
->backing_hd
== base
) {
2532 base_bs
= intermediate
->backing_hd
;
2535 intermediate
= intermediate
->backing_hd
;
2537 if (base_bs
== NULL
) {
2538 /* something went wrong, we did not end at the base. safely
2539 * unravel everything, and exit with error */
2543 /* success - we can delete the intermediate states, and link top->base */
2544 ret
= bdrv_change_backing_file(new_top_bs
, base_bs
->filename
,
2545 base_bs
->drv
? base_bs
->drv
->format_name
: "");
2549 new_top_bs
->backing_hd
= base_bs
;
2551 bdrv_refresh_limits(new_top_bs
);
2553 QSIMPLEQ_FOREACH_SAFE(intermediate_state
, &states_to_delete
, entry
, next
) {
2554 /* so that bdrv_close() does not recursively close the chain */
2555 intermediate_state
->bs
->backing_hd
= NULL
;
2556 bdrv_unref(intermediate_state
->bs
);
2561 QSIMPLEQ_FOREACH_SAFE(intermediate_state
, &states_to_delete
, entry
, next
) {
2562 g_free(intermediate_state
);
2568 static int bdrv_check_byte_request(BlockDriverState
*bs
, int64_t offset
,
2573 if (!bdrv_is_inserted(bs
))
2579 len
= bdrv_getlength(bs
);
2584 if ((offset
> len
) || (len
- offset
< size
))
/*
 * Sector-based wrapper around bdrv_check_byte_request(): converts
 * sector_num/nb_sectors to byte offset and size.
 * NOTE(review): the nb_sectors parameter declaration line is elided in
 * this extraction.
 */
2590 static int bdrv_check_request(BlockDriverState
*bs
, int64_t sector_num
,
2593 return bdrv_check_byte_request(bs
, sector_num
* BDRV_SECTOR_SIZE
,
2594 nb_sectors
* BDRV_SECTOR_SIZE
);
2597 typedef struct RwCo
{
2598 BlockDriverState
*bs
;
2603 BdrvRequestFlags flags
;
2606 static void coroutine_fn
bdrv_rw_co_entry(void *opaque
)
2608 RwCo
*rwco
= opaque
;
2610 if (!rwco
->is_write
) {
2611 rwco
->ret
= bdrv_co_do_preadv(rwco
->bs
, rwco
->offset
,
2612 rwco
->qiov
->size
, rwco
->qiov
,
2615 rwco
->ret
= bdrv_co_do_pwritev(rwco
->bs
, rwco
->offset
,
2616 rwco
->qiov
->size
, rwco
->qiov
,
2622 * Process a vectored synchronous request using coroutines
2624 static int bdrv_prwv_co(BlockDriverState
*bs
, int64_t offset
,
2625 QEMUIOVector
*qiov
, bool is_write
,
2626 BdrvRequestFlags flags
)
2633 .is_write
= is_write
,
2639 * In sync call context, when the vcpu is blocked, this throttling timer
2640 * will not fire; so the I/O throttling function has to be disabled here
2641 * if it has been enabled.
2643 if (bs
->io_limits_enabled
) {
2644 fprintf(stderr
, "Disabling I/O throttling on '%s' due "
2645 "to synchronous I/O.\n", bdrv_get_device_name(bs
));
2646 bdrv_io_limits_disable(bs
);
2649 if (qemu_in_coroutine()) {
2650 /* Fast-path if already in coroutine context */
2651 bdrv_rw_co_entry(&rwco
);
2653 co
= qemu_coroutine_create(bdrv_rw_co_entry
);
2654 qemu_coroutine_enter(co
, &rwco
);
2655 while (rwco
.ret
== NOT_DONE
) {
2663 * Process a synchronous request using coroutines
2665 static int bdrv_rw_co(BlockDriverState
*bs
, int64_t sector_num
, uint8_t *buf
,
2666 int nb_sectors
, bool is_write
, BdrvRequestFlags flags
)
2669 struct iovec iov
= {
2670 .iov_base
= (void *)buf
,
2671 .iov_len
= nb_sectors
* BDRV_SECTOR_SIZE
,
2674 qemu_iovec_init_external(&qiov
, &iov
, 1);
2675 return bdrv_prwv_co(bs
, sector_num
<< BDRV_SECTOR_BITS
,
2676 &qiov
, is_write
, flags
);
2679 /* return < 0 if error. See bdrv_write() for the return codes */
/* Synchronous sector read: delegates to the coroutine-based
 * bdrv_rw_co() with is_write=false and no request flags. */
2680 int bdrv_read(BlockDriverState
*bs
, int64_t sector_num
,
2681 uint8_t *buf
, int nb_sectors
)
2683 return bdrv_rw_co(bs
, sector_num
, buf
, nb_sectors
, false, 0);
2686 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2687 int bdrv_read_unthrottled(BlockDriverState
*bs
, int64_t sector_num
,
2688 uint8_t *buf
, int nb_sectors
)
/* save the throttling flag, clear it for the duration of the read,
 * then restore it ('enabled'/'ret' declarations are elided here) */
2693 enabled
= bs
->io_limits_enabled
;
2694 bs
->io_limits_enabled
= false;
2695 ret
= bdrv_read(bs
, sector_num
, buf
, nb_sectors
);
2696 bs
->io_limits_enabled
= enabled
;
2700 /* Return < 0 if error. Important errors are:
2701 -EIO generic I/O error (may happen for all errors)
2702 -ENOMEDIUM No media inserted.
2703 -EINVAL Invalid sector number or nb_sectors
2704 -EACCES Trying to write a read-only device
/* Synchronous sector write: delegates to bdrv_rw_co() with
 * is_write=true; the const cast is safe because the write path only
 * reads from the buffer. */
2706 int bdrv_write(BlockDriverState
*bs
, int64_t sector_num
,
2707 const uint8_t *buf
, int nb_sectors
)
2709 return bdrv_rw_co(bs
, sector_num
, (uint8_t *)buf
, nb_sectors
, true, 0);
/*
 * Synchronously write zeroes over a sector range: a bdrv_rw_co() write
 * with a NULL buffer and BDRV_REQ_ZERO_WRITE or'ed into the caller's
 * flags (e.g. BDRV_REQ_MAY_UNMAP).
 */
2712 int bdrv_write_zeroes(BlockDriverState
*bs
, int64_t sector_num
,
2713 int nb_sectors
, BdrvRequestFlags flags
)
2715 return bdrv_rw_co(bs
, sector_num
, NULL
, nb_sectors
, true,
2716 BDRV_REQ_ZERO_WRITE
| flags
);
2720 * Completely zero out a block device with the help of bdrv_write_zeroes.
2721 * The operation is sped up by checking the block status and only writing
2722 * zeroes to the device if they currently do not return zeroes. Optional
2723 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2725 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2727 int bdrv_make_zero(BlockDriverState
*bs
, BdrvRequestFlags flags
)
2729 int64_t target_size
= bdrv_getlength(bs
) / BDRV_SECTOR_SIZE
;
2730 int64_t ret
, nb_sectors
, sector_num
= 0;
2734 nb_sectors
= target_size
- sector_num
;
2735 if (nb_sectors
<= 0) {
2738 if (nb_sectors
> INT_MAX
) {
2739 nb_sectors
= INT_MAX
;
2741 ret
= bdrv_get_block_status(bs
, sector_num
, nb_sectors
, &n
);
2743 error_report("error getting block status at sector %" PRId64
": %s",
2744 sector_num
, strerror(-ret
));
2747 if (ret
& BDRV_BLOCK_ZERO
) {
2751 ret
= bdrv_write_zeroes(bs
, sector_num
, n
, flags
);
2753 error_report("error writing zeroes at sector %" PRId64
": %s",
2754 sector_num
, strerror(-ret
));
2761 int bdrv_pread(BlockDriverState
*bs
, int64_t offset
, void *buf
, int bytes
)
2764 struct iovec iov
= {
2765 .iov_base
= (void *)buf
,
2774 qemu_iovec_init_external(&qiov
, &iov
, 1);
2775 ret
= bdrv_prwv_co(bs
, offset
, &qiov
, false, 0);
/*
 * Synchronous vectored write at a byte offset, built on the
 * coroutine-based bdrv_prwv_co() with is_write=true.
 * NOTE(review): the 'ret' declaration and the function tail are elided
 * in this extraction.
 */
2783 int bdrv_pwritev(BlockDriverState
*bs
, int64_t offset
, QEMUIOVector
*qiov
)
2787 ret
= bdrv_prwv_co(bs
, offset
, qiov
, true, 0);
2795 int bdrv_pwrite(BlockDriverState
*bs
, int64_t offset
,
2796 const void *buf
, int bytes
)
2799 struct iovec iov
= {
2800 .iov_base
= (void *) buf
,
2808 qemu_iovec_init_external(&qiov
, &iov
, 1);
2809 return bdrv_pwritev(bs
, offset
, &qiov
);
2813 * Writes to the file and ensures that no writes are reordered across this
2814 * request (acts as a barrier)
2816 * Returns 0 on success, -errno in error cases.
2818 int bdrv_pwrite_sync(BlockDriverState
*bs
, int64_t offset
,
2819 const void *buf
, int count
)
2823 ret
= bdrv_pwrite(bs
, offset
, buf
, count
);
2828 /* No flush needed for cache modes that already do it */
2829 if (bs
->enable_write_cache
) {
2836 static int coroutine_fn
bdrv_co_do_copy_on_readv(BlockDriverState
*bs
,
2837 int64_t sector_num
, int nb_sectors
, QEMUIOVector
*qiov
)
2839 /* Perform I/O through a temporary buffer so that users who scribble over
2840 * their read buffer while the operation is in progress do not end up
2841 * modifying the image file. This is critical for zero-copy guest I/O
2842 * where anything might happen inside guest memory.
2844 void *bounce_buffer
;
2846 BlockDriver
*drv
= bs
->drv
;
2848 QEMUIOVector bounce_qiov
;
2849 int64_t cluster_sector_num
;
2850 int cluster_nb_sectors
;
2854 /* Cover entire cluster so no additional backing file I/O is required when
2855 * allocating cluster in the image file.
2857 bdrv_round_to_clusters(bs
, sector_num
, nb_sectors
,
2858 &cluster_sector_num
, &cluster_nb_sectors
);
2860 trace_bdrv_co_do_copy_on_readv(bs
, sector_num
, nb_sectors
,
2861 cluster_sector_num
, cluster_nb_sectors
);
2863 iov
.iov_len
= cluster_nb_sectors
* BDRV_SECTOR_SIZE
;
2864 iov
.iov_base
= bounce_buffer
= qemu_blockalign(bs
, iov
.iov_len
);
2865 qemu_iovec_init_external(&bounce_qiov
, &iov
, 1);
2867 ret
= drv
->bdrv_co_readv(bs
, cluster_sector_num
, cluster_nb_sectors
,
2873 if (drv
->bdrv_co_write_zeroes
&&
2874 buffer_is_zero(bounce_buffer
, iov
.iov_len
)) {
2875 ret
= bdrv_co_do_write_zeroes(bs
, cluster_sector_num
,
2876 cluster_nb_sectors
, 0);
2878 /* This does not change the data on the disk, it is not necessary
2879 * to flush even in cache=writethrough mode.
2881 ret
= drv
->bdrv_co_writev(bs
, cluster_sector_num
, cluster_nb_sectors
,
2886 /* It might be okay to ignore write errors for guest requests. If this
2887 * is a deliberate copy-on-read then we don't want to ignore the error.
2888 * Simply report it in all cases.
2893 skip_bytes
= (sector_num
- cluster_sector_num
) * BDRV_SECTOR_SIZE
;
2894 qemu_iovec_from_buf(qiov
, 0, bounce_buffer
+ skip_bytes
,
2895 nb_sectors
* BDRV_SECTOR_SIZE
);
2898 qemu_vfree(bounce_buffer
);
2903 * Forwards an already correctly aligned request to the BlockDriver. This
2904 * handles copy on read and zeroing after EOF; any other features must be
2905 * implemented by the caller.
2907 static int coroutine_fn
bdrv_aligned_preadv(BlockDriverState
*bs
,
2908 BdrvTrackedRequest
*req
, int64_t offset
, unsigned int bytes
,
2909 int64_t align
, QEMUIOVector
*qiov
, int flags
)
2911 BlockDriver
*drv
= bs
->drv
;
2914 int64_t sector_num
= offset
>> BDRV_SECTOR_BITS
;
2915 unsigned int nb_sectors
= bytes
>> BDRV_SECTOR_BITS
;
2917 assert((offset
& (BDRV_SECTOR_SIZE
- 1)) == 0);
2918 assert((bytes
& (BDRV_SECTOR_SIZE
- 1)) == 0);
2920 /* Handle Copy on Read and associated serialisation */
2921 if (flags
& BDRV_REQ_COPY_ON_READ
) {
2922 /* If we touch the same cluster it counts as an overlap. This
2923 * guarantees that allocating writes will be serialized and not race
2924 * with each other for the same cluster. For example, in copy-on-read
2925 * it ensures that the CoR read and write operations are atomic and
2926 * guest writes cannot interleave between them. */
2927 mark_request_serialising(req
, bdrv_get_cluster_size(bs
));
2930 wait_serialising_requests(req
);
2932 if (flags
& BDRV_REQ_COPY_ON_READ
) {
2935 ret
= bdrv_is_allocated(bs
, sector_num
, nb_sectors
, &pnum
);
2940 if (!ret
|| pnum
!= nb_sectors
) {
2941 ret
= bdrv_co_do_copy_on_readv(bs
, sector_num
, nb_sectors
, qiov
);
2946 /* Forward the request to the BlockDriver */
2947 if (!(bs
->zero_beyond_eof
&& bs
->growable
)) {
2948 ret
= drv
->bdrv_co_readv(bs
, sector_num
, nb_sectors
, qiov
);
2950 /* Read zeros after EOF of growable BDSes */
2951 int64_t len
, total_sectors
, max_nb_sectors
;
2953 len
= bdrv_getlength(bs
);
2959 total_sectors
= DIV_ROUND_UP(len
, BDRV_SECTOR_SIZE
);
2960 max_nb_sectors
= ROUND_UP(MAX(0, total_sectors
- sector_num
),
2961 align
>> BDRV_SECTOR_BITS
);
2962 if (max_nb_sectors
> 0) {
2963 ret
= drv
->bdrv_co_readv(bs
, sector_num
,
2964 MIN(nb_sectors
, max_nb_sectors
), qiov
);
2969 /* Reading beyond end of file is supposed to produce zeroes */
2970 if (ret
== 0 && total_sectors
< sector_num
+ nb_sectors
) {
2971 uint64_t offset
= MAX(0, total_sectors
- sector_num
);
2972 uint64_t bytes
= (sector_num
+ nb_sectors
- offset
) *
2974 qemu_iovec_memset(qiov
, offset
* BDRV_SECTOR_SIZE
, 0, bytes
);
2983 * Handle a read request in coroutine context
2985 static int coroutine_fn
bdrv_co_do_preadv(BlockDriverState
*bs
,
2986 int64_t offset
, unsigned int bytes
, QEMUIOVector
*qiov
,
2987 BdrvRequestFlags flags
)
2989 BlockDriver
*drv
= bs
->drv
;
2990 BdrvTrackedRequest req
;
2992 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
2993 uint64_t align
= MAX(BDRV_SECTOR_SIZE
, bs
->request_alignment
);
2994 uint8_t *head_buf
= NULL
;
2995 uint8_t *tail_buf
= NULL
;
2996 QEMUIOVector local_qiov
;
2997 bool use_local_qiov
= false;
3003 if (bdrv_check_byte_request(bs
, offset
, bytes
)) {
3007 if (bs
->copy_on_read
) {
3008 flags
|= BDRV_REQ_COPY_ON_READ
;
3011 /* throttling disk I/O */
3012 if (bs
->io_limits_enabled
) {
3013 bdrv_io_limits_intercept(bs
, bytes
, false);
3016 /* Align read if necessary by padding qiov */
3017 if (offset
& (align
- 1)) {
3018 head_buf
= qemu_blockalign(bs
, align
);
3019 qemu_iovec_init(&local_qiov
, qiov
->niov
+ 2);
3020 qemu_iovec_add(&local_qiov
, head_buf
, offset
& (align
- 1));
3021 qemu_iovec_concat(&local_qiov
, qiov
, 0, qiov
->size
);
3022 use_local_qiov
= true;
3024 bytes
+= offset
& (align
- 1);
3025 offset
= offset
& ~(align
- 1);
3028 if ((offset
+ bytes
) & (align
- 1)) {
3029 if (!use_local_qiov
) {
3030 qemu_iovec_init(&local_qiov
, qiov
->niov
+ 1);
3031 qemu_iovec_concat(&local_qiov
, qiov
, 0, qiov
->size
);
3032 use_local_qiov
= true;
3034 tail_buf
= qemu_blockalign(bs
, align
);
3035 qemu_iovec_add(&local_qiov
, tail_buf
,
3036 align
- ((offset
+ bytes
) & (align
- 1)));
3038 bytes
= ROUND_UP(bytes
, align
);
3041 tracked_request_begin(&req
, bs
, offset
, bytes
, false);
3042 ret
= bdrv_aligned_preadv(bs
, &req
, offset
, bytes
, align
,
3043 use_local_qiov
? &local_qiov
: qiov
,
3045 tracked_request_end(&req
);
3047 if (use_local_qiov
) {
3048 qemu_iovec_destroy(&local_qiov
);
3049 qemu_vfree(head_buf
);
3050 qemu_vfree(tail_buf
);
3056 static int coroutine_fn
bdrv_co_do_readv(BlockDriverState
*bs
,
3057 int64_t sector_num
, int nb_sectors
, QEMUIOVector
*qiov
,
3058 BdrvRequestFlags flags
)
3060 if (nb_sectors
< 0 || nb_sectors
> (UINT_MAX
>> BDRV_SECTOR_BITS
)) {
3064 return bdrv_co_do_preadv(bs
, sector_num
<< BDRV_SECTOR_BITS
,
3065 nb_sectors
<< BDRV_SECTOR_BITS
, qiov
, flags
);
3068 int coroutine_fn
bdrv_co_readv(BlockDriverState
*bs
, int64_t sector_num
,
3069 int nb_sectors
, QEMUIOVector
*qiov
)
3071 trace_bdrv_co_readv(bs
, sector_num
, nb_sectors
);
3073 return bdrv_co_do_readv(bs
, sector_num
, nb_sectors
, qiov
, 0);
3076 int coroutine_fn
bdrv_co_copy_on_readv(BlockDriverState
*bs
,
3077 int64_t sector_num
, int nb_sectors
, QEMUIOVector
*qiov
)
3079 trace_bdrv_co_copy_on_readv(bs
, sector_num
, nb_sectors
);
3081 return bdrv_co_do_readv(bs
, sector_num
, nb_sectors
, qiov
,
3082 BDRV_REQ_COPY_ON_READ
);
3085 /* if no limit is specified in the BlockLimits use a default
3086 * of 32768 512-byte sectors (16 MiB) per request.
3088 #define MAX_WRITE_ZEROES_DEFAULT 32768
3090 static int coroutine_fn
bdrv_co_do_write_zeroes(BlockDriverState
*bs
,
3091 int64_t sector_num
, int nb_sectors
, BdrvRequestFlags flags
)
3093 BlockDriver
*drv
= bs
->drv
;
3095 struct iovec iov
= {0};
3098 int max_write_zeroes
= bs
->bl
.max_write_zeroes
?
3099 bs
->bl
.max_write_zeroes
: MAX_WRITE_ZEROES_DEFAULT
;
3101 while (nb_sectors
> 0 && !ret
) {
3102 int num
= nb_sectors
;
3104 /* Align request. Block drivers can expect the "bulk" of the request
3107 if (bs
->bl
.write_zeroes_alignment
3108 && num
> bs
->bl
.write_zeroes_alignment
) {
3109 if (sector_num
% bs
->bl
.write_zeroes_alignment
!= 0) {
3110 /* Make a small request up to the first aligned sector. */
3111 num
= bs
->bl
.write_zeroes_alignment
;
3112 num
-= sector_num
% bs
->bl
.write_zeroes_alignment
;
3113 } else if ((sector_num
+ num
) % bs
->bl
.write_zeroes_alignment
!= 0) {
3114 /* Shorten the request to the last aligned sector. num cannot
3115 * underflow because num > bs->bl.write_zeroes_alignment.
3117 num
-= (sector_num
+ num
) % bs
->bl
.write_zeroes_alignment
;
3121 /* limit request size */
3122 if (num
> max_write_zeroes
) {
3123 num
= max_write_zeroes
;
3127 /* First try the efficient write zeroes operation */
3128 if (drv
->bdrv_co_write_zeroes
) {
3129 ret
= drv
->bdrv_co_write_zeroes(bs
, sector_num
, num
, flags
);
3132 if (ret
== -ENOTSUP
) {
3133 /* Fall back to bounce buffer if write zeroes is unsupported */
3134 iov
.iov_len
= num
* BDRV_SECTOR_SIZE
;
3135 if (iov
.iov_base
== NULL
) {
3136 iov
.iov_base
= qemu_blockalign(bs
, num
* BDRV_SECTOR_SIZE
);
3137 memset(iov
.iov_base
, 0, num
* BDRV_SECTOR_SIZE
);
3139 qemu_iovec_init_external(&qiov
, &iov
, 1);
3141 ret
= drv
->bdrv_co_writev(bs
, sector_num
, num
, &qiov
);
3143 /* Keep bounce buffer around if it is big enough for all
3144 * all future requests.
3146 if (num
< max_write_zeroes
) {
3147 qemu_vfree(iov
.iov_base
);
3148 iov
.iov_base
= NULL
;
3156 qemu_vfree(iov
.iov_base
);
3161 * Forwards an already correctly aligned write request to the BlockDriver.
3163 static int coroutine_fn
bdrv_aligned_pwritev(BlockDriverState
*bs
,
3164 BdrvTrackedRequest
*req
, int64_t offset
, unsigned int bytes
,
3165 QEMUIOVector
*qiov
, int flags
)
3167 BlockDriver
*drv
= bs
->drv
;
3171 int64_t sector_num
= offset
>> BDRV_SECTOR_BITS
;
3172 unsigned int nb_sectors
= bytes
>> BDRV_SECTOR_BITS
;
3174 assert((offset
& (BDRV_SECTOR_SIZE
- 1)) == 0);
3175 assert((bytes
& (BDRV_SECTOR_SIZE
- 1)) == 0);
3177 waited
= wait_serialising_requests(req
);
3178 assert(!waited
|| !req
->serialising
);
3179 assert(req
->overlap_offset
<= offset
);
3180 assert(offset
+ bytes
<= req
->overlap_offset
+ req
->overlap_bytes
);
3182 ret
= notifier_with_return_list_notify(&bs
->before_write_notifiers
, req
);
3185 /* Do nothing, write notifier decided to fail this request */
3186 } else if (flags
& BDRV_REQ_ZERO_WRITE
) {
3187 BLKDBG_EVENT(bs
, BLKDBG_PWRITEV_ZERO
);
3188 ret
= bdrv_co_do_write_zeroes(bs
, sector_num
, nb_sectors
, flags
);
3190 BLKDBG_EVENT(bs
, BLKDBG_PWRITEV
);
3191 ret
= drv
->bdrv_co_writev(bs
, sector_num
, nb_sectors
, qiov
);
3193 BLKDBG_EVENT(bs
, BLKDBG_PWRITEV_DONE
);
3195 if (ret
== 0 && !bs
->enable_write_cache
) {
3196 ret
= bdrv_co_flush(bs
);
3199 bdrv_set_dirty(bs
, sector_num
, nb_sectors
);
3201 if (bs
->wr_highest_sector
< sector_num
+ nb_sectors
- 1) {
3202 bs
->wr_highest_sector
= sector_num
+ nb_sectors
- 1;
3204 if (bs
->growable
&& ret
>= 0) {
3205 bs
->total_sectors
= MAX(bs
->total_sectors
, sector_num
+ nb_sectors
);
3212 * Handle a write request in coroutine context
3214 static int coroutine_fn
bdrv_co_do_pwritev(BlockDriverState
*bs
,
3215 int64_t offset
, unsigned int bytes
, QEMUIOVector
*qiov
,
3216 BdrvRequestFlags flags
)
3218 BdrvTrackedRequest req
;
3219 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3220 uint64_t align
= MAX(BDRV_SECTOR_SIZE
, bs
->request_alignment
);
3221 uint8_t *head_buf
= NULL
;
3222 uint8_t *tail_buf
= NULL
;
3223 QEMUIOVector local_qiov
;
3224 bool use_local_qiov
= false;
3230 if (bs
->read_only
) {
3233 if (bdrv_check_byte_request(bs
, offset
, bytes
)) {
3237 /* throttling disk I/O */
3238 if (bs
->io_limits_enabled
) {
3239 bdrv_io_limits_intercept(bs
, bytes
, true);
3243 * Align write if necessary by performing a read-modify-write cycle.
3244 * Pad qiov with the read parts and be sure to have a tracked request not
3245 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3247 tracked_request_begin(&req
, bs
, offset
, bytes
, true);
3249 if (offset
& (align
- 1)) {
3250 QEMUIOVector head_qiov
;
3251 struct iovec head_iov
;
3253 mark_request_serialising(&req
, align
);
3254 wait_serialising_requests(&req
);
3256 head_buf
= qemu_blockalign(bs
, align
);
3257 head_iov
= (struct iovec
) {
3258 .iov_base
= head_buf
,
3261 qemu_iovec_init_external(&head_qiov
, &head_iov
, 1);
3263 BLKDBG_EVENT(bs
, BLKDBG_PWRITEV_RMW_HEAD
);
3264 ret
= bdrv_aligned_preadv(bs
, &req
, offset
& ~(align
- 1), align
,
3265 align
, &head_qiov
, 0);
3269 BLKDBG_EVENT(bs
, BLKDBG_PWRITEV_RMW_AFTER_HEAD
);
3271 qemu_iovec_init(&local_qiov
, qiov
->niov
+ 2);
3272 qemu_iovec_add(&local_qiov
, head_buf
, offset
& (align
- 1));
3273 qemu_iovec_concat(&local_qiov
, qiov
, 0, qiov
->size
);
3274 use_local_qiov
= true;
3276 bytes
+= offset
& (align
- 1);
3277 offset
= offset
& ~(align
- 1);
3280 if ((offset
+ bytes
) & (align
- 1)) {
3281 QEMUIOVector tail_qiov
;
3282 struct iovec tail_iov
;
3286 mark_request_serialising(&req
, align
);
3287 waited
= wait_serialising_requests(&req
);
3288 assert(!waited
|| !use_local_qiov
);
3290 tail_buf
= qemu_blockalign(bs
, align
);
3291 tail_iov
= (struct iovec
) {
3292 .iov_base
= tail_buf
,
3295 qemu_iovec_init_external(&tail_qiov
, &tail_iov
, 1);
3297 BLKDBG_EVENT(bs
, BLKDBG_PWRITEV_RMW_TAIL
);
3298 ret
= bdrv_aligned_preadv(bs
, &req
, (offset
+ bytes
) & ~(align
- 1), align
,
3299 align
, &tail_qiov
, 0);
3303 BLKDBG_EVENT(bs
, BLKDBG_PWRITEV_RMW_AFTER_TAIL
);
3305 if (!use_local_qiov
) {
3306 qemu_iovec_init(&local_qiov
, qiov
->niov
+ 1);
3307 qemu_iovec_concat(&local_qiov
, qiov
, 0, qiov
->size
);
3308 use_local_qiov
= true;
3311 tail_bytes
= (offset
+ bytes
) & (align
- 1);
3312 qemu_iovec_add(&local_qiov
, tail_buf
+ tail_bytes
, align
- tail_bytes
);
3314 bytes
= ROUND_UP(bytes
, align
);
3317 ret
= bdrv_aligned_pwritev(bs
, &req
, offset
, bytes
,
3318 use_local_qiov
? &local_qiov
: qiov
,
3322 tracked_request_end(&req
);
3324 if (use_local_qiov
) {
3325 qemu_iovec_destroy(&local_qiov
);
3327 qemu_vfree(head_buf
);
3328 qemu_vfree(tail_buf
);
3333 static int coroutine_fn
bdrv_co_do_writev(BlockDriverState
*bs
,
3334 int64_t sector_num
, int nb_sectors
, QEMUIOVector
*qiov
,
3335 BdrvRequestFlags flags
)
3337 if (nb_sectors
< 0 || nb_sectors
> (INT_MAX
>> BDRV_SECTOR_BITS
)) {
3341 return bdrv_co_do_pwritev(bs
, sector_num
<< BDRV_SECTOR_BITS
,
3342 nb_sectors
<< BDRV_SECTOR_BITS
, qiov
, flags
);
3345 int coroutine_fn
bdrv_co_writev(BlockDriverState
*bs
, int64_t sector_num
,
3346 int nb_sectors
, QEMUIOVector
*qiov
)
3348 trace_bdrv_co_writev(bs
, sector_num
, nb_sectors
);
3350 return bdrv_co_do_writev(bs
, sector_num
, nb_sectors
, qiov
, 0);
3353 int coroutine_fn
bdrv_co_write_zeroes(BlockDriverState
*bs
,
3354 int64_t sector_num
, int nb_sectors
,
3355 BdrvRequestFlags flags
)
3357 trace_bdrv_co_write_zeroes(bs
, sector_num
, nb_sectors
, flags
);
3359 if (!(bs
->open_flags
& BDRV_O_UNMAP
)) {
3360 flags
&= ~BDRV_REQ_MAY_UNMAP
;
3363 return bdrv_co_do_writev(bs
, sector_num
, nb_sectors
, NULL
,
3364 BDRV_REQ_ZERO_WRITE
| flags
);
3368 * Truncate file to 'offset' bytes (needed only for file protocols)
3370 int bdrv_truncate(BlockDriverState
*bs
, int64_t offset
)
3372 BlockDriver
*drv
= bs
->drv
;
3376 if (!drv
->bdrv_truncate
)
3380 if (bdrv_in_use(bs
))
3382 ret
= drv
->bdrv_truncate(bs
, offset
);
3384 ret
= refresh_total_sectors(bs
, offset
>> BDRV_SECTOR_BITS
);
3385 bdrv_dev_resize_cb(bs
);
3391 * Length of a allocated file in bytes. Sparse files are counted by actual
3392 * allocated space. Return < 0 if error or unknown.
3394 int64_t bdrv_get_allocated_file_size(BlockDriverState
*bs
)
3396 BlockDriver
*drv
= bs
->drv
;
3400 if (drv
->bdrv_get_allocated_file_size
) {
3401 return drv
->bdrv_get_allocated_file_size(bs
);
3404 return bdrv_get_allocated_file_size(bs
->file
);
3410 * Length of a file in bytes. Return < 0 if error or unknown.
3412 int64_t bdrv_getlength(BlockDriverState
*bs
)
3414 BlockDriver
*drv
= bs
->drv
;
3418 if (drv
->has_variable_length
) {
3419 int ret
= refresh_total_sectors(bs
, bs
->total_sectors
);
3424 return bs
->total_sectors
* BDRV_SECTOR_SIZE
;
3427 /* return 0 as number of sectors if no device present or error */
3428 void bdrv_get_geometry(BlockDriverState
*bs
, uint64_t *nb_sectors_ptr
)
3431 length
= bdrv_getlength(bs
);
3435 length
= length
>> BDRV_SECTOR_BITS
;
3436 *nb_sectors_ptr
= length
;
3439 void bdrv_set_on_error(BlockDriverState
*bs
, BlockdevOnError on_read_error
,
3440 BlockdevOnError on_write_error
)
3442 bs
->on_read_error
= on_read_error
;
3443 bs
->on_write_error
= on_write_error
;
3446 BlockdevOnError
bdrv_get_on_error(BlockDriverState
*bs
, bool is_read
)
3448 return is_read
? bs
->on_read_error
: bs
->on_write_error
;
3451 BlockErrorAction
bdrv_get_error_action(BlockDriverState
*bs
, bool is_read
, int error
)
3453 BlockdevOnError on_err
= is_read
? bs
->on_read_error
: bs
->on_write_error
;
3456 case BLOCKDEV_ON_ERROR_ENOSPC
:
3457 return (error
== ENOSPC
) ? BDRV_ACTION_STOP
: BDRV_ACTION_REPORT
;
3458 case BLOCKDEV_ON_ERROR_STOP
:
3459 return BDRV_ACTION_STOP
;
3460 case BLOCKDEV_ON_ERROR_REPORT
:
3461 return BDRV_ACTION_REPORT
;
3462 case BLOCKDEV_ON_ERROR_IGNORE
:
3463 return BDRV_ACTION_IGNORE
;
3469 /* This is done by device models because, while the block layer knows
3470 * about the error, it does not know whether an operation comes from
3471 * the device or the block layer (from a job, for example).
3473 void bdrv_error_action(BlockDriverState
*bs
, BlockErrorAction action
,
3474 bool is_read
, int error
)
3477 bdrv_emit_qmp_error_event(bs
, QEVENT_BLOCK_IO_ERROR
, action
, is_read
);
3478 if (action
== BDRV_ACTION_STOP
) {
3479 vm_stop(RUN_STATE_IO_ERROR
);
3480 bdrv_iostatus_set_err(bs
, error
);
3484 int bdrv_is_read_only(BlockDriverState
*bs
)
3486 return bs
->read_only
;
3489 int bdrv_is_sg(BlockDriverState
*bs
)
3494 int bdrv_enable_write_cache(BlockDriverState
*bs
)
3496 return bs
->enable_write_cache
;
3499 void bdrv_set_enable_write_cache(BlockDriverState
*bs
, bool wce
)
3501 bs
->enable_write_cache
= wce
;
3503 /* so a reopen() will preserve wce */
3505 bs
->open_flags
|= BDRV_O_CACHE_WB
;
3507 bs
->open_flags
&= ~BDRV_O_CACHE_WB
;
3511 int bdrv_is_encrypted(BlockDriverState
*bs
)
3513 if (bs
->backing_hd
&& bs
->backing_hd
->encrypted
)
3515 return bs
->encrypted
;
3518 int bdrv_key_required(BlockDriverState
*bs
)
3520 BlockDriverState
*backing_hd
= bs
->backing_hd
;
3522 if (backing_hd
&& backing_hd
->encrypted
&& !backing_hd
->valid_key
)
3524 return (bs
->encrypted
&& !bs
->valid_key
);
3527 int bdrv_set_key(BlockDriverState
*bs
, const char *key
)
3530 if (bs
->backing_hd
&& bs
->backing_hd
->encrypted
) {
3531 ret
= bdrv_set_key(bs
->backing_hd
, key
);
3537 if (!bs
->encrypted
) {
3539 } else if (!bs
->drv
|| !bs
->drv
->bdrv_set_key
) {
3542 ret
= bs
->drv
->bdrv_set_key(bs
, key
);
3545 } else if (!bs
->valid_key
) {
3547 /* call the change callback now, we skipped it on open */
3548 bdrv_dev_change_media_cb(bs
, true);
3553 const char *bdrv_get_format_name(BlockDriverState
*bs
)
3555 return bs
->drv
? bs
->drv
->format_name
: NULL
;
3558 void bdrv_iterate_format(void (*it
)(void *opaque
, const char *name
),
3563 QLIST_FOREACH(drv
, &bdrv_drivers
, list
) {
3564 it(opaque
, drv
->format_name
);
3568 /* This function is to find block backend bs */
3569 BlockDriverState
*bdrv_find(const char *name
)
3571 BlockDriverState
*bs
;
3573 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
3574 if (!strcmp(name
, bs
->device_name
)) {
3581 /* This function is to find a node in the bs graph */
3582 BlockDriverState
*bdrv_find_node(const char *node_name
)
3584 BlockDriverState
*bs
;
3588 QTAILQ_FOREACH(bs
, &graph_bdrv_states
, node_list
) {
3589 if (!strcmp(node_name
, bs
->node_name
)) {
3596 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3597 BlockDeviceInfoList
*bdrv_named_nodes_list(void)
3599 BlockDeviceInfoList
*list
, *entry
;
3600 BlockDriverState
*bs
;
3603 QTAILQ_FOREACH(bs
, &graph_bdrv_states
, node_list
) {
3604 entry
= g_malloc0(sizeof(*entry
));
3605 entry
->value
= bdrv_block_device_info(bs
);
3613 BlockDriverState
*bdrv_lookup_bs(const char *device
,
3614 const char *node_name
,
3617 BlockDriverState
*bs
= NULL
;
3620 bs
= bdrv_find(device
);
3628 bs
= bdrv_find_node(node_name
);
3635 error_setg(errp
, "Cannot find device=%s nor node_name=%s",
3636 device
? device
: "",
3637 node_name
? node_name
: "");
3641 BlockDriverState
*bdrv_next(BlockDriverState
*bs
)
3644 return QTAILQ_FIRST(&bdrv_states
);
3646 return QTAILQ_NEXT(bs
, device_list
);
3649 void bdrv_iterate(void (*it
)(void *opaque
, BlockDriverState
*bs
), void *opaque
)
3651 BlockDriverState
*bs
;
3653 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
3658 const char *bdrv_get_device_name(BlockDriverState
*bs
)
3660 return bs
->device_name
;
3663 int bdrv_get_flags(BlockDriverState
*bs
)
3665 return bs
->open_flags
;
3668 int bdrv_flush_all(void)
3670 BlockDriverState
*bs
;
3673 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
3674 int ret
= bdrv_flush(bs
);
3675 if (ret
< 0 && !result
) {
3683 int bdrv_has_zero_init_1(BlockDriverState
*bs
)
3688 int bdrv_has_zero_init(BlockDriverState
*bs
)
3692 /* If BS is a copy on write image, it is initialized to
3693 the contents of the base image, which may not be zeroes. */
3694 if (bs
->backing_hd
) {
3697 if (bs
->drv
->bdrv_has_zero_init
) {
3698 return bs
->drv
->bdrv_has_zero_init(bs
);
3705 bool bdrv_unallocated_blocks_are_zero(BlockDriverState
*bs
)
3707 BlockDriverInfo bdi
;
3709 if (bs
->backing_hd
) {
3713 if (bdrv_get_info(bs
, &bdi
) == 0) {
3714 return bdi
.unallocated_blocks_are_zero
;
3720 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState
*bs
)
3722 BlockDriverInfo bdi
;
3724 if (bs
->backing_hd
|| !(bs
->open_flags
& BDRV_O_UNMAP
)) {
3728 if (bdrv_get_info(bs
, &bdi
) == 0) {
3729 return bdi
.can_write_zeroes_with_unmap
;
3735 typedef struct BdrvCoGetBlockStatusData
{
3736 BlockDriverState
*bs
;
3737 BlockDriverState
*base
;
3743 } BdrvCoGetBlockStatusData
;
3746 * Returns true iff the specified sector is present in the disk image. Drivers
3747 * not implementing the functionality are assumed to not support backing files,
3748 * hence all their sectors are reported as allocated.
3750 * If 'sector_num' is beyond the end of the disk image the return value is 0
3751 * and 'pnum' is set to 0.
3753 * 'pnum' is set to the number of sectors (including and immediately following
3754 * the specified sector) that are known to be in the same
3755 * allocated/unallocated state.
3757 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3758 * beyond the end of the disk image it will be clamped.
3760 static int64_t coroutine_fn
bdrv_co_get_block_status(BlockDriverState
*bs
,
3762 int nb_sectors
, int *pnum
)
3768 length
= bdrv_getlength(bs
);
3773 if (sector_num
>= (length
>> BDRV_SECTOR_BITS
)) {
3778 n
= bs
->total_sectors
- sector_num
;
3779 if (n
< nb_sectors
) {
3783 if (!bs
->drv
->bdrv_co_get_block_status
) {
3785 ret
= BDRV_BLOCK_DATA
;
3786 if (bs
->drv
->protocol_name
) {
3787 ret
|= BDRV_BLOCK_OFFSET_VALID
| (sector_num
* BDRV_SECTOR_SIZE
);
3792 ret
= bs
->drv
->bdrv_co_get_block_status(bs
, sector_num
, nb_sectors
, pnum
);
3798 if (ret
& BDRV_BLOCK_RAW
) {
3799 assert(ret
& BDRV_BLOCK_OFFSET_VALID
);
3800 return bdrv_get_block_status(bs
->file
, ret
>> BDRV_SECTOR_BITS
,
3804 if (!(ret
& BDRV_BLOCK_DATA
) && !(ret
& BDRV_BLOCK_ZERO
)) {
3805 if (bdrv_unallocated_blocks_are_zero(bs
)) {
3806 ret
|= BDRV_BLOCK_ZERO
;
3807 } else if (bs
->backing_hd
) {
3808 BlockDriverState
*bs2
= bs
->backing_hd
;
3809 int64_t length2
= bdrv_getlength(bs2
);
3810 if (length2
>= 0 && sector_num
>= (length2
>> BDRV_SECTOR_BITS
)) {
3811 ret
|= BDRV_BLOCK_ZERO
;
3817 (ret
& BDRV_BLOCK_DATA
) && !(ret
& BDRV_BLOCK_ZERO
) &&
3818 (ret
& BDRV_BLOCK_OFFSET_VALID
)) {
3819 ret2
= bdrv_co_get_block_status(bs
->file
, ret
>> BDRV_SECTOR_BITS
,
3822 /* Ignore errors. This is just providing extra information, it
3823 * is useful but not necessary.
3825 ret
|= (ret2
& BDRV_BLOCK_ZERO
);
3832 /* Coroutine wrapper for bdrv_get_block_status() */
3833 static void coroutine_fn
bdrv_get_block_status_co_entry(void *opaque
)
3835 BdrvCoGetBlockStatusData
*data
= opaque
;
3836 BlockDriverState
*bs
= data
->bs
;
3838 data
->ret
= bdrv_co_get_block_status(bs
, data
->sector_num
, data
->nb_sectors
,
3844 * Synchronous wrapper around bdrv_co_get_block_status().
3846 * See bdrv_co_get_block_status() for details.
3848 int64_t bdrv_get_block_status(BlockDriverState
*bs
, int64_t sector_num
,
3849 int nb_sectors
, int *pnum
)
3852 BdrvCoGetBlockStatusData data
= {
3854 .sector_num
= sector_num
,
3855 .nb_sectors
= nb_sectors
,
3860 if (qemu_in_coroutine()) {
3861 /* Fast-path if already in coroutine context */
3862 bdrv_get_block_status_co_entry(&data
);
3864 co
= qemu_coroutine_create(bdrv_get_block_status_co_entry
);
3865 qemu_coroutine_enter(co
, &data
);
3866 while (!data
.done
) {
3873 int coroutine_fn
bdrv_is_allocated(BlockDriverState
*bs
, int64_t sector_num
,
3874 int nb_sectors
, int *pnum
)
3876 int64_t ret
= bdrv_get_block_status(bs
, sector_num
, nb_sectors
, pnum
);
3881 (ret
& BDRV_BLOCK_DATA
) ||
3882 ((ret
& BDRV_BLOCK_ZERO
) && !bdrv_has_zero_init(bs
));
3886 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3888 * Return true if the given sector is allocated in any image between
3889 * BASE and TOP (inclusive). BASE can be NULL to check if the given
3890 * sector is allocated in any image of the chain. Return false otherwise.
3892 * 'pnum' is set to the number of sectors (including and immediately following
3893 * the specified sector) that are known to be in the same
3894 * allocated/unallocated state.
3897 int bdrv_is_allocated_above(BlockDriverState
*top
,
3898 BlockDriverState
*base
,
3900 int nb_sectors
, int *pnum
)
3902 BlockDriverState
*intermediate
;
3903 int ret
, n
= nb_sectors
;
3906 while (intermediate
&& intermediate
!= base
) {
3908 ret
= bdrv_is_allocated(intermediate
, sector_num
, nb_sectors
,
3918 * [sector_num, nb_sectors] is unallocated on top but intermediate
3921 * [sector_num+x, nr_sectors] allocated.
3923 if (n
> pnum_inter
&&
3924 (intermediate
== top
||
3925 sector_num
+ pnum_inter
< intermediate
->total_sectors
)) {
3929 intermediate
= intermediate
->backing_hd
;
3936 const char *bdrv_get_encrypted_filename(BlockDriverState
*bs
)
3938 if (bs
->backing_hd
&& bs
->backing_hd
->encrypted
)
3939 return bs
->backing_file
;
3940 else if (bs
->encrypted
)
3941 return bs
->filename
;
3946 void bdrv_get_backing_filename(BlockDriverState
*bs
,
3947 char *filename
, int filename_size
)
3949 pstrcpy(filename
, filename_size
, bs
->backing_file
);
3952 int bdrv_write_compressed(BlockDriverState
*bs
, int64_t sector_num
,
3953 const uint8_t *buf
, int nb_sectors
)
3955 BlockDriver
*drv
= bs
->drv
;
3958 if (!drv
->bdrv_write_compressed
)
3960 if (bdrv_check_request(bs
, sector_num
, nb_sectors
))
3963 assert(QLIST_EMPTY(&bs
->dirty_bitmaps
));
3965 return drv
->bdrv_write_compressed(bs
, sector_num
, buf
, nb_sectors
);
3968 int bdrv_get_info(BlockDriverState
*bs
, BlockDriverInfo
*bdi
)
3970 BlockDriver
*drv
= bs
->drv
;
3973 if (!drv
->bdrv_get_info
)
3975 memset(bdi
, 0, sizeof(*bdi
));
3976 return drv
->bdrv_get_info(bs
, bdi
);
3979 ImageInfoSpecific
*bdrv_get_specific_info(BlockDriverState
*bs
)
3981 BlockDriver
*drv
= bs
->drv
;
3982 if (drv
&& drv
->bdrv_get_specific_info
) {
3983 return drv
->bdrv_get_specific_info(bs
);
3988 int bdrv_save_vmstate(BlockDriverState
*bs
, const uint8_t *buf
,
3989 int64_t pos
, int size
)
3992 struct iovec iov
= {
3993 .iov_base
= (void *) buf
,
3997 qemu_iovec_init_external(&qiov
, &iov
, 1);
3998 return bdrv_writev_vmstate(bs
, &qiov
, pos
);
4001 int bdrv_writev_vmstate(BlockDriverState
*bs
, QEMUIOVector
*qiov
, int64_t pos
)
4003 BlockDriver
*drv
= bs
->drv
;
4007 } else if (drv
->bdrv_save_vmstate
) {
4008 return drv
->bdrv_save_vmstate(bs
, qiov
, pos
);
4009 } else if (bs
->file
) {
4010 return bdrv_writev_vmstate(bs
->file
, qiov
, pos
);
4016 int bdrv_load_vmstate(BlockDriverState
*bs
, uint8_t *buf
,
4017 int64_t pos
, int size
)
4019 BlockDriver
*drv
= bs
->drv
;
4022 if (drv
->bdrv_load_vmstate
)
4023 return drv
->bdrv_load_vmstate(bs
, buf
, pos
, size
);
4025 return bdrv_load_vmstate(bs
->file
, buf
, pos
, size
);
4029 void bdrv_debug_event(BlockDriverState
*bs
, BlkDebugEvent event
)
4031 if (!bs
|| !bs
->drv
|| !bs
->drv
->bdrv_debug_event
) {
4035 bs
->drv
->bdrv_debug_event(bs
, event
);
4038 int bdrv_debug_breakpoint(BlockDriverState
*bs
, const char *event
,
4041 while (bs
&& bs
->drv
&& !bs
->drv
->bdrv_debug_breakpoint
) {
4045 if (bs
&& bs
->drv
&& bs
->drv
->bdrv_debug_breakpoint
) {
4046 return bs
->drv
->bdrv_debug_breakpoint(bs
, event
, tag
);
4052 int bdrv_debug_remove_breakpoint(BlockDriverState
*bs
, const char *tag
)
4054 while (bs
&& bs
->drv
&& !bs
->drv
->bdrv_debug_remove_breakpoint
) {
4058 if (bs
&& bs
->drv
&& bs
->drv
->bdrv_debug_remove_breakpoint
) {
4059 return bs
->drv
->bdrv_debug_remove_breakpoint(bs
, tag
);
4065 int bdrv_debug_resume(BlockDriverState
*bs
, const char *tag
)
4067 while (bs
&& bs
->drv
&& !bs
->drv
->bdrv_debug_resume
) {
4071 if (bs
&& bs
->drv
&& bs
->drv
->bdrv_debug_resume
) {
4072 return bs
->drv
->bdrv_debug_resume(bs
, tag
);
4078 bool bdrv_debug_is_suspended(BlockDriverState
*bs
, const char *tag
)
4080 while (bs
&& bs
->drv
&& !bs
->drv
->bdrv_debug_is_suspended
) {
4084 if (bs
&& bs
->drv
&& bs
->drv
->bdrv_debug_is_suspended
) {
4085 return bs
->drv
->bdrv_debug_is_suspended(bs
, tag
);
4091 int bdrv_is_snapshot(BlockDriverState
*bs
)
4093 return !!(bs
->open_flags
& BDRV_O_SNAPSHOT
);
4096 /* backing_file can either be relative, or absolute, or a protocol. If it is
4097 * relative, it must be relative to the chain. So, passing in bs->filename
4098 * from a BDS as backing_file should not be done, as that may be relative to
4099 * the CWD rather than the chain. */
4100 BlockDriverState
*bdrv_find_backing_image(BlockDriverState
*bs
,
4101 const char *backing_file
)
4103 char *filename_full
= NULL
;
4104 char *backing_file_full
= NULL
;
4105 char *filename_tmp
= NULL
;
4106 int is_protocol
= 0;
4107 BlockDriverState
*curr_bs
= NULL
;
4108 BlockDriverState
*retval
= NULL
;
4110 if (!bs
|| !bs
->drv
|| !backing_file
) {
4114 filename_full
= g_malloc(PATH_MAX
);
4115 backing_file_full
= g_malloc(PATH_MAX
);
4116 filename_tmp
= g_malloc(PATH_MAX
);
4118 is_protocol
= path_has_protocol(backing_file
);
4120 for (curr_bs
= bs
; curr_bs
->backing_hd
; curr_bs
= curr_bs
->backing_hd
) {
4122 /* If either of the filename paths is actually a protocol, then
4123 * compare unmodified paths; otherwise make paths relative */
4124 if (is_protocol
|| path_has_protocol(curr_bs
->backing_file
)) {
4125 if (strcmp(backing_file
, curr_bs
->backing_file
) == 0) {
4126 retval
= curr_bs
->backing_hd
;
4130 /* If not an absolute filename path, make it relative to the current
4131 * image's filename path */
4132 path_combine(filename_tmp
, PATH_MAX
, curr_bs
->filename
,
4135 /* We are going to compare absolute pathnames */
4136 if (!realpath(filename_tmp
, filename_full
)) {
4140 /* We need to make sure the backing filename we are comparing against
4141 * is relative to the current image filename (or absolute) */
4142 path_combine(filename_tmp
, PATH_MAX
, curr_bs
->filename
,
4143 curr_bs
->backing_file
);
4145 if (!realpath(filename_tmp
, backing_file_full
)) {
4149 if (strcmp(backing_file_full
, filename_full
) == 0) {
4150 retval
= curr_bs
->backing_hd
;
4156 g_free(filename_full
);
4157 g_free(backing_file_full
);
4158 g_free(filename_tmp
);
4162 int bdrv_get_backing_file_depth(BlockDriverState
*bs
)
4168 if (!bs
->backing_hd
) {
4172 return 1 + bdrv_get_backing_file_depth(bs
->backing_hd
);
4175 BlockDriverState
*bdrv_find_base(BlockDriverState
*bs
)
4177 BlockDriverState
*curr_bs
= NULL
;
4185 while (curr_bs
->backing_hd
) {
4186 curr_bs
= curr_bs
->backing_hd
;
4191 /**************************************************************/
/* Asynchronous read: thin wrapper that traces the request and delegates to
 * the coroutine-based implementation (is_write arguments elided by
 * extraction). */
4194 BlockDriverAIOCB
*bdrv_aio_readv(BlockDriverState
*bs
, int64_t sector_num
,
4195 QEMUIOVector
*qiov
, int nb_sectors
,
4196 BlockDriverCompletionFunc
*cb
, void *opaque
)
4198 trace_bdrv_aio_readv(bs
, sector_num
, nb_sectors
, opaque
);
4200 return bdrv_co_aio_rw_vector(bs
, sector_num
, qiov
, nb_sectors
, 0,
/* Asynchronous write: same shape as bdrv_aio_readv above. */
4204 BlockDriverAIOCB
*bdrv_aio_writev(BlockDriverState
*bs
, int64_t sector_num
,
4205 QEMUIOVector
*qiov
, int nb_sectors
,
4206 BlockDriverCompletionFunc
*cb
, void *opaque
)
4208 trace_bdrv_aio_writev(bs
, sector_num
, nb_sectors
, opaque
);
4210 return bdrv_co_aio_rw_vector(bs
, sector_num
, qiov
, nb_sectors
, 0,
/* Asynchronous zero-write: no data qiov (NULL); BDRV_REQ_ZERO_WRITE is
 * OR-ed into the caller-supplied flags. */
4214 BlockDriverAIOCB
*bdrv_aio_write_zeroes(BlockDriverState
*bs
,
4215 int64_t sector_num
, int nb_sectors
, BdrvRequestFlags flags
,
4216 BlockDriverCompletionFunc
*cb
, void *opaque
)
4218 trace_bdrv_aio_write_zeroes(bs
, sector_num
, nb_sectors
, flags
, opaque
);
4220 return bdrv_co_aio_rw_vector(bs
, sector_num
, NULL
, nb_sectors
,
4221 BDRV_REQ_ZERO_WRITE
| flags
,
/* Bookkeeping for a batch of merged write requests; one per-request
 * callback entry follows (several struct fields elided by extraction). */
4226 typedef struct MultiwriteCB
{
4231 BlockDriverCompletionFunc
*cb
;
4233 QEMUIOVector
*free_qiov
;
/* Invoke every per-request completion callback with the batch error code,
 * then destroy and free any qiov that was allocated for merging. */
4237 static void multiwrite_user_cb(MultiwriteCB
*mcb
)
4241 for (i
= 0; i
< mcb
->num_callbacks
; i
++) {
4242 mcb
->callbacks
[i
].cb(mcb
->callbacks
[i
].opaque
, mcb
->error
);
4243 if (mcb
->callbacks
[i
].free_qiov
) {
4244 qemu_iovec_destroy(mcb
->callbacks
[i
].free_qiov
);
/* free_qiov was g_malloc0'ed in multiwrite_merge */
4246 g_free(mcb
->callbacks
[i
].free_qiov
);
/* Completion callback shared by all requests of one multiwrite batch:
 * record the first error, and fire the user callbacks once the last
 * outstanding request finishes. */
4250 static void multiwrite_cb(void *opaque
, int ret
)
4252 MultiwriteCB
*mcb
= opaque
;
4254 trace_multiwrite_cb(mcb
, ret
);
/* only the first error of the batch is kept */
4256 if (ret
< 0 && !mcb
->error
) {
4260 mcb
->num_requests
--;
4261 if (mcb
->num_requests
== 0) {
4262 multiwrite_user_cb(mcb
);
/* qsort comparator ordering BlockRequests by start sector. */
4267 static int multiwrite_req_compare(const void *a
, const void *b
)
4269 const BlockRequest
*req1
= a
, *req2
= b
;
4272 * Note that we can't simply subtract req2->sector from req1->sector
4273 * here as that could overflow the return value.
4275 if (req1
->sector
> req2
->sector
) {
4277 } else if (req1
->sector
< req2
->sector
) {
4285 * Takes a bunch of requests and tries to merge them. Returns the number of
4286 * requests that remain after merging.
4288 static int multiwrite_merge(BlockDriverState
*bs
, BlockRequest
*reqs
,
4289 int num_reqs
, MultiwriteCB
*mcb
)
4293 // Sort requests by start sector
4294 qsort(reqs
, num_reqs
, sizeof(*reqs
), &multiwrite_req_compare
);
4296 // Check if adjacent requests touch the same clusters. If so, combine them,
4297 // filling up gaps with zero sectors.
4299 for (i
= 1; i
< num_reqs
; i
++) {
4301 int64_t oldreq_last
= reqs
[outidx
].sector
+ reqs
[outidx
].nb_sectors
;
4303 // Handle exactly sequential writes and overlapping writes.
4304 if (reqs
[i
].sector
<= oldreq_last
) {
// Never merge past IOV_MAX (+1 leaves room for a possible padding iov)
4308 if (reqs
[outidx
].qiov
->niov
+ reqs
[i
].qiov
->niov
+ 1 > IOV_MAX
) {
4314 QEMUIOVector
*qiov
= g_malloc0(sizeof(*qiov
));
4315 qemu_iovec_init(qiov
,
4316 reqs
[outidx
].qiov
->niov
+ reqs
[i
].qiov
->niov
+ 1);
4318 // Add the first request to the merged one. If the requests are
4319 // overlapping, drop the last sectors of the first request.
4320 size
= (reqs
[i
].sector
- reqs
[outidx
].sector
) << 9;
4321 qemu_iovec_concat(qiov
, reqs
[outidx
].qiov
, 0, size
);
4323 // We shouldn't need to add any zeros between the two requests
4324 assert (reqs
[i
].sector
<= oldreq_last
);
4326 // Add the second request
4327 qemu_iovec_concat(qiov
, reqs
[i
].qiov
, 0, reqs
[i
].qiov
->size
);
4329 reqs
[outidx
].nb_sectors
= qiov
->size
>> 9;
4330 reqs
[outidx
].qiov
= qiov
;
// Remember the merged qiov so multiwrite_user_cb can free it later
4332 mcb
->callbacks
[i
].free_qiov
= reqs
[outidx
].qiov
;
// Not mergeable: keep the request as the next output slot
4335 reqs
[outidx
].sector
= reqs
[i
].sector
;
4336 reqs
[outidx
].nb_sectors
= reqs
[i
].nb_sectors
;
4337 reqs
[outidx
].qiov
= reqs
[i
].qiov
;
4345 * Submit multiple AIO write requests at once.
4347 * On success, the function returns 0 and all requests in the reqs array have
4348 * been submitted. In error case this function returns -1, and any of the
4349 * requests may or may not be submitted yet. In particular, this means that the
4350 * callback will be called for some of the requests, for others it won't. The
4351 * caller must check the error field of the BlockRequest to wait for the right
4352 * callbacks (if error != 0, no callback will be called).
4354 * The implementation may modify the contents of the reqs array, e.g. to merge
4355 * requests. However, the fields opaque and error are left unmodified as they
4356 * are used to signal failure for a single request to the caller.
4358 int bdrv_aio_multiwrite(BlockDriverState
*bs
, BlockRequest
*reqs
, int num_reqs
)
4363 /* don't submit writes if we don't have a medium */
4364 if (bs
->drv
== NULL
) {
4365 for (i
= 0; i
< num_reqs
; i
++) {
4366 reqs
[i
].error
= -ENOMEDIUM
;
4371 if (num_reqs
== 0) {
4375 // Create MultiwriteCB structure
4376 mcb
= g_malloc0(sizeof(*mcb
) + num_reqs
* sizeof(*mcb
->callbacks
));
4377 mcb
->num_requests
= 0;
4378 mcb
->num_callbacks
= num_reqs
;
4380 for (i
= 0; i
< num_reqs
; i
++) {
4381 mcb
->callbacks
[i
].cb
= reqs
[i
].cb
;
4382 mcb
->callbacks
[i
].opaque
= reqs
[i
].opaque
;
4385 // Check for mergable requests
4386 num_reqs
= multiwrite_merge(bs
, reqs
, num_reqs
, mcb
);
4388 trace_bdrv_aio_multiwrite(mcb
, mcb
->num_callbacks
, num_reqs
);
4390 /* Run the aio requests. */
4391 mcb
->num_requests
= num_reqs
;
4392 for (i
= 0; i
< num_reqs
; i
++) {
4393 bdrv_co_aio_rw_vector(bs
, reqs
[i
].sector
, reqs
[i
].qiov
,
4394 reqs
[i
].nb_sectors
, reqs
[i
].flags
,
/* Cancel an in-flight AIO request via its AIOCBInfo cancel hook. */
4402 void bdrv_aio_cancel(BlockDriverAIOCB
*acb
)
4404 acb
->aiocb_info
->cancel(acb
);
4407 /**************************************************************/
4408 /* async block device emulation */
/* AIOCB used to emulate AIO on top of synchronous bdrv_read/bdrv_write
 * (some fields elided by extraction). */
4410 typedef struct BlockDriverAIOCBSync
{
4411 BlockDriverAIOCB common
;
4414 /* vector translation state */
4418 } BlockDriverAIOCBSync
;
/* Cancel hook for the sync emulation: delete the pending BH and release. */
4420 static void bdrv_aio_cancel_em(BlockDriverAIOCB
*blockacb
)
4422 BlockDriverAIOCBSync
*acb
=
4423 container_of(blockacb
, BlockDriverAIOCBSync
, common
);
4424 qemu_bh_delete(acb
->bh
);
4426 qemu_aio_release(acb
);
4429 static const AIOCBInfo bdrv_em_aiocb_info
= {
4430 .aiocb_size
= sizeof(BlockDriverAIOCBSync
),
4431 .cancel
= bdrv_aio_cancel_em
,
/* Bottom half run after the synchronous I/O: copy the bounce buffer back
 * (read path), free it, fire the completion callback, and clean up. */
4434 static void bdrv_aio_bh_cb(void *opaque
)
4436 BlockDriverAIOCBSync
*acb
= opaque
;
4439 qemu_iovec_from_buf(acb
->qiov
, 0, acb
->bounce
, acb
->qiov
->size
);
4440 qemu_vfree(acb
->bounce
);
4441 acb
->common
.cb(acb
->common
.opaque
, acb
->ret
);
4442 qemu_bh_delete(acb
->bh
);
4444 qemu_aio_release(acb
);
/* Emulate vectored AIO with a synchronous read/write through an aligned
 * bounce buffer; completion is deferred to a bottom half. */
4447 static BlockDriverAIOCB
*bdrv_aio_rw_vector(BlockDriverState
*bs
,
4451 BlockDriverCompletionFunc
*cb
,
4456 BlockDriverAIOCBSync
*acb
;
4458 acb
= qemu_aio_get(&bdrv_em_aiocb_info
, bs
, cb
, opaque
);
4459 acb
->is_write
= is_write
;
4461 acb
->bounce
= qemu_blockalign(bs
, qiov
->size
);
4462 acb
->bh
= qemu_bh_new(bdrv_aio_bh_cb
, acb
);
/* write path: gather qiov into the bounce buffer first */
4465 qemu_iovec_to_buf(acb
->qiov
, 0, acb
->bounce
, qiov
->size
);
4466 acb
->ret
= bs
->drv
->bdrv_write(bs
, sector_num
, acb
->bounce
, nb_sectors
);
4468 acb
->ret
= bs
->drv
->bdrv_read(bs
, sector_num
, acb
->bounce
, nb_sectors
);
4471 qemu_bh_schedule(acb
->bh
);
4473 return &acb
->common
;
/* Read-side wrapper over the synchronous emulation (is_write = 0). */
4476 static BlockDriverAIOCB
*bdrv_aio_readv_em(BlockDriverState
*bs
,
4477 int64_t sector_num
, QEMUIOVector
*qiov
, int nb_sectors
,
4478 BlockDriverCompletionFunc
*cb
, void *opaque
)
4480 return bdrv_aio_rw_vector(bs
, sector_num
, qiov
, nb_sectors
, cb
, opaque
, 0);
/* Write-side wrapper over the synchronous emulation (is_write = 1). */
4483 static BlockDriverAIOCB
*bdrv_aio_writev_em(BlockDriverState
*bs
,
4484 int64_t sector_num
, QEMUIOVector
*qiov
, int nb_sectors
,
4485 BlockDriverCompletionFunc
*cb
, void *opaque
)
4487 return bdrv_aio_rw_vector(bs
, sector_num
, qiov
, nb_sectors
, cb
, opaque
, 1);
/* AIOCB for the coroutine-based request path (fields elided by
 * extraction). */
4491 typedef struct BlockDriverAIOCBCoroutine
{
4492 BlockDriverAIOCB common
;
4497 } BlockDriverAIOCBCoroutine
;
/* Cancel hook for coroutine AIOCBs (body partially elided by extraction;
 * presumably waits for the request to complete -- confirm upstream). */
4499 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB
*blockacb
)
4501 BlockDriverAIOCBCoroutine
*acb
=
4502 container_of(blockacb
, BlockDriverAIOCBCoroutine
, common
);
4511 static const AIOCBInfo bdrv_em_co_aiocb_info
= {
4512 .aiocb_size
= sizeof(BlockDriverAIOCBCoroutine
),
4513 .cancel
= bdrv_aio_co_cancel_em
,
/* Bottom half that delivers the completion callback outside coroutine
 * context, then releases the AIOCB. */
4516 static void bdrv_co_em_bh(void *opaque
)
4518 BlockDriverAIOCBCoroutine
*acb
= opaque
;
4520 acb
->common
.cb(acb
->common
.opaque
, acb
->req
.error
);
4526 qemu_bh_delete(acb
->bh
);
4527 qemu_aio_release(acb
);
4530 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4531 static void coroutine_fn
bdrv_co_do_rw(void *opaque
)
4533 BlockDriverAIOCBCoroutine
*acb
= opaque
;
4534 BlockDriverState
*bs
= acb
->common
.bs
;
4536 if (!acb
->is_write
) {
4537 acb
->req
.error
= bdrv_co_do_readv(bs
, acb
->req
.sector
,
4538 acb
->req
.nb_sectors
, acb
->req
.qiov
, acb
->req
.flags
);
4540 acb
->req
.error
= bdrv_co_do_writev(bs
, acb
->req
.sector
,
4541 acb
->req
.nb_sectors
, acb
->req
.qiov
, acb
->req
.flags
);
/* schedule completion delivery from BH context */
4544 acb
->bh
= qemu_bh_new(bdrv_co_em_bh
, acb
);
4545 qemu_bh_schedule(acb
->bh
);
/* Package an AIO request into a coroutine AIOCB and kick off a coroutine
 * running bdrv_co_do_rw (some parameter lines elided by extraction). */
4548 static BlockDriverAIOCB
*bdrv_co_aio_rw_vector(BlockDriverState
*bs
,
4552 BdrvRequestFlags flags
,
4553 BlockDriverCompletionFunc
*cb
,
4558 BlockDriverAIOCBCoroutine
*acb
;
4560 acb
= qemu_aio_get(&bdrv_em_co_aiocb_info
, bs
, cb
, opaque
);
4561 acb
->req
.sector
= sector_num
;
4562 acb
->req
.nb_sectors
= nb_sectors
;
4563 acb
->req
.qiov
= qiov
;
4564 acb
->req
.flags
= flags
;
4565 acb
->is_write
= is_write
;
4568 co
= qemu_coroutine_create(bdrv_co_do_rw
);
4569 qemu_coroutine_enter(co
, acb
);
4571 return &acb
->common
;
4574 static void coroutine_fn
bdrv_aio_flush_co_entry(void *opaque
)
4576 BlockDriverAIOCBCoroutine
*acb
= opaque
;
4577 BlockDriverState
*bs
= acb
->common
.bs
;
4579 acb
->req
.error
= bdrv_co_flush(bs
);
4580 acb
->bh
= qemu_bh_new(bdrv_co_em_bh
, acb
);
4581 qemu_bh_schedule(acb
->bh
);
/* Asynchronous flush: allocate a coroutine AIOCB and run
 * bdrv_aio_flush_co_entry in a fresh coroutine. */
4584 BlockDriverAIOCB
*bdrv_aio_flush(BlockDriverState
*bs
,
4585 BlockDriverCompletionFunc
*cb
, void *opaque
)
4587 trace_bdrv_aio_flush(bs
, opaque
);
4590 BlockDriverAIOCBCoroutine
*acb
;
4592 acb
= qemu_aio_get(&bdrv_em_co_aiocb_info
, bs
, cb
, opaque
);
4595 co
= qemu_coroutine_create(bdrv_aio_flush_co_entry
);
4596 qemu_coroutine_enter(co
, acb
);
4598 return &acb
->common
;
4601 static void coroutine_fn
bdrv_aio_discard_co_entry(void *opaque
)
4603 BlockDriverAIOCBCoroutine
*acb
= opaque
;
4604 BlockDriverState
*bs
= acb
->common
.bs
;
4606 acb
->req
.error
= bdrv_co_discard(bs
, acb
->req
.sector
, acb
->req
.nb_sectors
);
4607 acb
->bh
= qemu_bh_new(bdrv_co_em_bh
, acb
);
4608 qemu_bh_schedule(acb
->bh
);
/* Asynchronous discard: stash the range in a coroutine AIOCB and enter
 * bdrv_aio_discard_co_entry. */
4611 BlockDriverAIOCB
*bdrv_aio_discard(BlockDriverState
*bs
,
4612 int64_t sector_num
, int nb_sectors
,
4613 BlockDriverCompletionFunc
*cb
, void *opaque
)
4616 BlockDriverAIOCBCoroutine
*acb
;
4618 trace_bdrv_aio_discard(bs
, sector_num
, nb_sectors
, opaque
);
4620 acb
= qemu_aio_get(&bdrv_em_co_aiocb_info
, bs
, cb
, opaque
);
4621 acb
->req
.sector
= sector_num
;
4622 acb
->req
.nb_sectors
= nb_sectors
;
4624 co
= qemu_coroutine_create(bdrv_aio_discard_co_entry
);
4625 qemu_coroutine_enter(co
, acb
);
4627 return &acb
->common
;
/* Register all built-in block drivers (module init hook). */
4630 void bdrv_init(void)
4632 module_call_init(MODULE_INIT_BLOCK
);
/* Like bdrv_init() but restrict format probing to the whitelist. */
4635 void bdrv_init_with_whitelist(void)
4637 use_bdrv_whitelist
= 1;
/* Allocate an AIOCB of the driver-specified size and fill in the common
 * fields (some assignments elided by extraction). Freed with
 * qemu_aio_release(). */
4641 void *qemu_aio_get(const AIOCBInfo
*aiocb_info
, BlockDriverState
*bs
,
4642 BlockDriverCompletionFunc
*cb
, void *opaque
)
4644 BlockDriverAIOCB
*acb
;
4646 acb
= g_slice_alloc(aiocb_info
->aiocb_size
);
4647 acb
->aiocb_info
= aiocb_info
;
4650 acb
->opaque
= opaque
;
4654 void qemu_aio_release(void *p
)
4656 BlockDriverAIOCB
*acb
= p
;
4657 g_slice_free1(acb
->aiocb_info
->aiocb_size
, acb
);
4660 /**************************************************************/
4661 /* Coroutine block device emulation */
/* Completion rendezvous: remembers which coroutine to re-enter when the
 * emulated AIO finishes (result field elided by extraction). */
4663 typedef struct CoroutineIOCompletion
{
4664 Coroutine
*coroutine
;
4666 } CoroutineIOCompletion
;
/* AIO completion callback that wakes the waiting coroutine. */
4668 static void bdrv_co_io_em_complete(void *opaque
, int ret
)
4670 CoroutineIOCompletion
*co
= opaque
;
4673 qemu_coroutine_enter(co
->coroutine
, NULL
);
/* Emulate coroutine I/O on top of a driver's AIO interface: submit the
 * request, then yield until bdrv_co_io_em_complete re-enters us. */
4676 static int coroutine_fn
bdrv_co_io_em(BlockDriverState
*bs
, int64_t sector_num
,
4677 int nb_sectors
, QEMUIOVector
*iov
,
4680 CoroutineIOCompletion co
= {
4681 .coroutine
= qemu_coroutine_self(),
4683 BlockDriverAIOCB
*acb
;
4686 acb
= bs
->drv
->bdrv_aio_writev(bs
, sector_num
, iov
, nb_sectors
,
4687 bdrv_co_io_em_complete
, &co
);
4689 acb
= bs
->drv
->bdrv_aio_readv(bs
, sector_num
, iov
, nb_sectors
,
4690 bdrv_co_io_em_complete
, &co
);
4693 trace_bdrv_co_io_em(bs
, sector_num
, nb_sectors
, is_write
, acb
);
4697 qemu_coroutine_yield();
/* Coroutine read implemented via the AIO emulation above. */
4702 static int coroutine_fn
bdrv_co_readv_em(BlockDriverState
*bs
,
4703 int64_t sector_num
, int nb_sectors
,
4706 return bdrv_co_io_em(bs
, sector_num
, nb_sectors
, iov
, false);
/* Coroutine write implemented via the AIO emulation above. */
4709 static int coroutine_fn
bdrv_co_writev_em(BlockDriverState
*bs
,
4710 int64_t sector_num
, int nb_sectors
,
4713 return bdrv_co_io_em(bs
, sector_num
, nb_sectors
, iov
, true);
4716 static void coroutine_fn
bdrv_flush_co_entry(void *opaque
)
4718 RwCo
*rwco
= opaque
;
4720 rwco
->ret
= bdrv_co_flush(rwco
->bs
);
/* Flush bs: first flush cached data to the OS, then (unless
 * BDRV_O_NO_FLUSH) force it to disk, and finally recurse into bs->file so
 * the protocol layer is flushed too. Several lines elided by extraction. */
4723 int coroutine_fn
bdrv_co_flush(BlockDriverState
*bs
)
4727 if (!bs
|| !bdrv_is_inserted(bs
) || bdrv_is_read_only(bs
)) {
4731 /* Write back cached data to the OS even with cache=unsafe */
4732 BLKDBG_EVENT(bs
->file
, BLKDBG_FLUSH_TO_OS
);
4733 if (bs
->drv
->bdrv_co_flush_to_os
) {
4734 ret
= bs
->drv
->bdrv_co_flush_to_os(bs
);
4740 /* But don't actually force it to the disk with cache=unsafe */
4741 if (bs
->open_flags
& BDRV_O_NO_FLUSH
) {
4745 BLKDBG_EVENT(bs
->file
, BLKDBG_FLUSH_TO_DISK
);
4746 if (bs
->drv
->bdrv_co_flush_to_disk
) {
4747 ret
= bs
->drv
->bdrv_co_flush_to_disk(bs
);
4748 } else if (bs
->drv
->bdrv_aio_flush
) {
/* emulate a coroutine flush on top of the driver's AIO flush */
4749 BlockDriverAIOCB
*acb
;
4750 CoroutineIOCompletion co
= {
4751 .coroutine
= qemu_coroutine_self(),
4754 acb
= bs
->drv
->bdrv_aio_flush(bs
, bdrv_co_io_em_complete
, &co
);
4758 qemu_coroutine_yield();
4763 * Some block drivers always operate in either writethrough or unsafe
4764 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
4765 * know how the server works (because the behaviour is hardcoded or
4766 * depends on server-side configuration), so we can't ensure that
4767 * everything is safe on disk. Returning an error doesn't work because
4768 * that would break guests even if the server operates in writethrough
4771 * Let's hope the user knows what he's doing.
4779 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4780 * in the case of cache=unsafe, so there are no useless flushes.
4783 return bdrv_co_flush(bs
->file
);
4786 void bdrv_invalidate_cache(BlockDriverState
*bs
)
4788 if (bs
->drv
&& bs
->drv
->bdrv_invalidate_cache
) {
4789 bs
->drv
->bdrv_invalidate_cache(bs
);
4793 void bdrv_invalidate_cache_all(void)
4795 BlockDriverState
*bs
;
4797 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
4798 bdrv_invalidate_cache(bs
);
4802 void bdrv_clear_incoming_migration_all(void)
4804 BlockDriverState
*bs
;
4806 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
4807 bs
->open_flags
= bs
->open_flags
& ~(BDRV_O_INCOMING
);
/* Synchronous flush wrapper: run bdrv_flush_co_entry directly when already
 * in coroutine context, otherwise spawn a coroutine and poll until it
 * completes (RwCo setup elided by extraction). */
4811 int bdrv_flush(BlockDriverState
*bs
)
4819 if (qemu_in_coroutine()) {
4820 /* Fast-path if already in coroutine context */
4821 bdrv_flush_co_entry(&rwco
);
4823 co
= qemu_coroutine_create(bdrv_flush_co_entry
);
4824 qemu_coroutine_enter(co
, &rwco
);
4825 while (rwco
.ret
== NOT_DONE
) {
/* Parameter bundle passed to bdrv_discard_co_entry (remaining fields
 * elided by extraction). */
4833 typedef struct DiscardCo
{
4834 BlockDriverState
*bs
;
4839 static void coroutine_fn
bdrv_discard_co_entry(void *opaque
)
4841 DiscardCo
*rwco
= opaque
;
4843 rwco
->ret
= bdrv_co_discard(rwco
->bs
, rwco
->sector_num
, rwco
->nb_sectors
);
4846 /* if no limit is specified in the BlockLimits use a default
4847 * of 32768 512-byte sectors (16 MiB) per request.
4849 #define MAX_DISCARD_DEFAULT 32768
/* Discard (unmap) a sector range, honouring the driver's discard alignment
 * and max request size; falls back to the driver's AIO discard interface
 * when no coroutine implementation exists. Several lines elided by
 * extraction. */
4851 int coroutine_fn
bdrv_co_discard(BlockDriverState
*bs
, int64_t sector_num
,
4858 } else if (bdrv_check_request(bs
, sector_num
, nb_sectors
)) {
4860 } else if (bs
->read_only
) {
/* the discarded range no longer carries guest data */
4864 bdrv_reset_dirty(bs
, sector_num
, nb_sectors
);
4866 /* Do nothing if disabled. */
4867 if (!(bs
->open_flags
& BDRV_O_UNMAP
)) {
4871 if (!bs
->drv
->bdrv_co_discard
&& !bs
->drv
->bdrv_aio_discard
) {
4875 max_discard
= bs
->bl
.max_discard
? bs
->bl
.max_discard
: MAX_DISCARD_DEFAULT
;
4876 while (nb_sectors
> 0) {
4878 int num
= nb_sectors
;
/* align the head of the request to the discard alignment */
4881 if (bs
->bl
.discard_alignment
&&
4882 num
>= bs
->bl
.discard_alignment
&&
4883 sector_num
% bs
->bl
.discard_alignment
) {
4884 if (num
> bs
->bl
.discard_alignment
) {
4885 num
= bs
->bl
.discard_alignment
;
4887 num
-= sector_num
% bs
->bl
.discard_alignment
;
4890 /* limit request size */
4891 if (num
> max_discard
) {
4895 if (bs
->drv
->bdrv_co_discard
) {
4896 ret
= bs
->drv
->bdrv_co_discard(bs
, sector_num
, num
);
4898 BlockDriverAIOCB
*acb
;
4899 CoroutineIOCompletion co
= {
4900 .coroutine
= qemu_coroutine_self(),
/* NOTE(review): this passes nb_sectors rather than the clamped 'num' to
 * bdrv_aio_discard -- looks like a bug (the co-discard branch above uses
 * num); confirm against the upstream fix before relying on it. */
4903 acb
= bs
->drv
->bdrv_aio_discard(bs
, sector_num
, nb_sectors
,
4904 bdrv_co_io_em_complete
, &co
);
4908 qemu_coroutine_yield();
4912 if (ret
&& ret
!= -ENOTSUP
) {
/* Synchronous discard wrapper: same coroutine fast-path/poll pattern as
 * bdrv_flush() (DiscardCo initializer partially elided by extraction). */
4922 int bdrv_discard(BlockDriverState
*bs
, int64_t sector_num
, int nb_sectors
)
4927 .sector_num
= sector_num
,
4928 .nb_sectors
= nb_sectors
,
4932 if (qemu_in_coroutine()) {
4933 /* Fast-path if already in coroutine context */
4934 bdrv_discard_co_entry(&rwco
);
4936 co
= qemu_coroutine_create(bdrv_discard_co_entry
);
4937 qemu_coroutine_enter(co
, &rwco
);
4938 while (rwco
.ret
== NOT_DONE
) {
4946 /**************************************************************/
4947 /* removable device support */
4950 * Return TRUE if the media is present
4952 int bdrv_is_inserted(BlockDriverState
*bs
)
4954 BlockDriver
*drv
= bs
->drv
;
/* NOTE(review): return statements for the no-driver / no-hook cases are
 * elided by extraction here. */
4958 if (!drv
->bdrv_is_inserted
)
4960 return drv
->bdrv_is_inserted(bs
);
4964 * Return whether the media changed since the last call to this
4965 * function, or -ENOTSUP if we don't know. Most drivers don't know.
4967 int bdrv_media_changed(BlockDriverState
*bs
)
4969 BlockDriver
*drv
= bs
->drv
;
4971 if (drv
&& drv
->bdrv_media_changed
) {
4972 return drv
->bdrv_media_changed(bs
);
4978 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4980 void bdrv_eject(BlockDriverState
*bs
, bool eject_flag
)
4982 BlockDriver
*drv
= bs
->drv
;
4984 if (drv
&& drv
->bdrv_eject
) {
4985 drv
->bdrv_eject(bs
, eject_flag
);
4988 if (bs
->device_name
[0] != '\0') {
4989 bdrv_emit_qmp_eject_event(bs
, eject_flag
);
4994 * Lock or unlock the media (if it is locked, the user won't be able
4995 * to eject it manually).
4997 void bdrv_lock_medium(BlockDriverState
*bs
, bool locked
)
4999 BlockDriver
*drv
= bs
->drv
;
5001 trace_bdrv_lock_medium(bs
, locked
);
5003 if (drv
&& drv
->bdrv_lock_medium
) {
5004 drv
->bdrv_lock_medium(bs
, locked
);
5008 /* needed for generic scsi interface */
/* Forward an ioctl to the driver (fallback return elided by extraction;
 * presumably -ENOTSUP). */
5010 int bdrv_ioctl(BlockDriverState
*bs
, unsigned long int req
, void *buf
)
5012 BlockDriver
*drv
= bs
->drv
;
5014 if (drv
&& drv
->bdrv_ioctl
)
5015 return drv
->bdrv_ioctl(bs
, req
, buf
);
/* Asynchronous ioctl forwarding (fallback return elided by extraction). */
5019 BlockDriverAIOCB
*bdrv_aio_ioctl(BlockDriverState
*bs
,
5020 unsigned long int req
, void *buf
,
5021 BlockDriverCompletionFunc
*cb
, void *opaque
)
5023 BlockDriver
*drv
= bs
->drv
;
5025 if (drv
&& drv
->bdrv_aio_ioctl
)
5026 return drv
->bdrv_aio_ioctl(bs
, req
, buf
, cb
, opaque
);
/* Record the guest-visible block size so request alignment can match it. */
5030 void bdrv_set_guest_block_size(BlockDriverState
*bs
, int align
)
5032 bs
->guest_block_size
= align
;
/* Allocate a buffer aligned for (possibly O_DIRECT) I/O on bs. */
5035 void *qemu_blockalign(BlockDriverState
*bs
, size_t size
)
5037 return qemu_memalign(bdrv_opt_mem_align(bs
), size
);
5041 * Check if all memory in this vector is sector aligned.
/* (true/false returns elided by extraction) */
5043 bool bdrv_qiov_is_aligned(BlockDriverState
*bs
, QEMUIOVector
*qiov
)
5046 size_t alignment
= bdrv_opt_mem_align(bs
);
5048 for (i
= 0; i
< qiov
->niov
; i
++) {
5049 if ((uintptr_t) qiov
->iov
[i
].iov_base
% alignment
) {
5052 if (qiov
->iov
[i
].iov_len
% alignment
) {
/* Create a dirty bitmap tracking writes to bs at the given granularity
 * (bytes, must be a power of two) and link it into bs->dirty_bitmaps. */
5060 BdrvDirtyBitmap
*bdrv_create_dirty_bitmap(BlockDriverState
*bs
, int granularity
)
5062 int64_t bitmap_size
;
5063 BdrvDirtyBitmap
*bitmap
;
5065 assert((granularity
& (granularity
- 1)) == 0);
/* convert byte granularity to sectors */
5067 granularity
>>= BDRV_SECTOR_BITS
;
5068 assert(granularity
);
5069 bitmap_size
= (bdrv_getlength(bs
) >> BDRV_SECTOR_BITS
);
5070 bitmap
= g_malloc0(sizeof(BdrvDirtyBitmap
));
5071 bitmap
->bitmap
= hbitmap_alloc(bitmap_size
, ffs(granularity
) - 1);
5072 QLIST_INSERT_HEAD(&bs
->dirty_bitmaps
, bitmap
, list
);
/* Unlink and free one dirty bitmap of bs (match/free lines elided by
 * extraction). */
5076 void bdrv_release_dirty_bitmap(BlockDriverState
*bs
, BdrvDirtyBitmap
*bitmap
)
5078 BdrvDirtyBitmap
*bm
, *next
;
5079 QLIST_FOREACH_SAFE(bm
, &bs
->dirty_bitmaps
, list
, next
) {
5081 QLIST_REMOVE(bitmap
, list
);
5082 hbitmap_free(bitmap
->bitmap
);
/* Build a QAPI BlockDirtyInfoList describing every dirty bitmap of bs. */
5089 BlockDirtyInfoList
*bdrv_query_dirty_bitmaps(BlockDriverState
*bs
)
5091 BdrvDirtyBitmap
*bm
;
5092 BlockDirtyInfoList
*list
= NULL
;
5093 BlockDirtyInfoList
**plist
= &list
;
5095 QLIST_FOREACH(bm
, &bs
->dirty_bitmaps
, list
) {
5096 BlockDirtyInfo
*info
= g_malloc0(sizeof(BlockDirtyInfo
));
5097 BlockDirtyInfoList
*entry
= g_malloc0(sizeof(BlockDirtyInfoList
));
5098 info
->count
= bdrv_get_dirty_count(bs
, bm
);
5100 ((int64_t) BDRV_SECTOR_SIZE
<< hbitmap_granularity(bm
->bitmap
));
5101 entry
->value
= info
;
5103 plist
= &entry
->next
;
/* Test whether a sector is marked dirty (NULL-bitmap branch elided by
 * extraction). */
5109 int bdrv_get_dirty(BlockDriverState
*bs
, BdrvDirtyBitmap
*bitmap
, int64_t sector
)
5112 return hbitmap_get(bitmap
->bitmap
, sector
);
/* Initialize an iterator over a dirty bitmap, starting at sector 0. */
5118 void bdrv_dirty_iter_init(BlockDriverState
*bs
,
5119 BdrvDirtyBitmap
*bitmap
, HBitmapIter
*hbi
)
5121 hbitmap_iter_init(hbi
, bitmap
->bitmap
, 0);
/* Mark a sector range dirty in every bitmap attached to bs
 * (nr_sectors parameter line elided by extraction). */
5124 void bdrv_set_dirty(BlockDriverState
*bs
, int64_t cur_sector
,
5127 BdrvDirtyBitmap
*bitmap
;
5128 QLIST_FOREACH(bitmap
, &bs
->dirty_bitmaps
, list
) {
5129 hbitmap_set(bitmap
->bitmap
, cur_sector
, nr_sectors
);
5133 void bdrv_reset_dirty(BlockDriverState
*bs
, int64_t cur_sector
, int nr_sectors
)
5135 BdrvDirtyBitmap
*bitmap
;
5136 QLIST_FOREACH(bitmap
, &bs
->dirty_bitmaps
, list
) {
5137 hbitmap_reset(bitmap
->bitmap
, cur_sector
, nr_sectors
);
/* Number of dirty sectors recorded in one bitmap. */
5141 int64_t bdrv_get_dirty_count(BlockDriverState
*bs
, BdrvDirtyBitmap
*bitmap
)
5143 return hbitmap_count(bitmap
->bitmap
);
5146 /* Get a reference to bs */
/* (refcount increment elided by extraction) */
5147 void bdrv_ref(BlockDriverState
*bs
)
5152 /* Release a previously grabbed reference to bs.
5153 * If after releasing, reference count is zero, the BlockDriverState is
/* (deletion call elided by extraction) */
5155 void bdrv_unref(BlockDriverState
*bs
)
5157 assert(bs
->refcnt
> 0);
5158 if (--bs
->refcnt
== 0) {
/* Mark bs as (not) owned by a job; asserts against redundant transitions. */
5163 void bdrv_set_in_use(BlockDriverState
*bs
, int in_use
)
5165 assert(bs
->in_use
!= in_use
);
5166 bs
->in_use
= in_use
;
/* (return of bs->in_use elided by extraction) */
5169 int bdrv_in_use(BlockDriverState
*bs
)
/* Turn on I/O status tracking for bs and reset it to OK. */
5174 void bdrv_iostatus_enable(BlockDriverState
*bs
)
5176 bs
->iostatus_enabled
= true;
5177 bs
->iostatus
= BLOCK_DEVICE_IO_STATUS_OK
;
5180 /* The I/O status is only enabled if the drive explicitly
5181 * enables it _and_ the VM is configured to stop on errors */
5182 bool bdrv_iostatus_is_enabled(const BlockDriverState
*bs
)
5184 return (bs
->iostatus_enabled
&&
5185 (bs
->on_write_error
== BLOCKDEV_ON_ERROR_ENOSPC
||
5186 bs
->on_write_error
== BLOCKDEV_ON_ERROR_STOP
||
5187 bs
->on_read_error
== BLOCKDEV_ON_ERROR_STOP
));
/* Turn off I/O status tracking for bs. */
5190 void bdrv_iostatus_disable(BlockDriverState
*bs
)
5192 bs
->iostatus_enabled
= false;
/* Reset the I/O status to OK and propagate the reset to an attached block
 * job (the job guard line is elided by extraction). */
5195 void bdrv_iostatus_reset(BlockDriverState
*bs
)
5197 if (bdrv_iostatus_is_enabled(bs
)) {
5198 bs
->iostatus
= BLOCK_DEVICE_IO_STATUS_OK
;
5200 block_job_iostatus_reset(bs
->job
);
5205 void bdrv_iostatus_set_err(BlockDriverState
*bs
, int error
)
5207 assert(bdrv_iostatus_is_enabled(bs
));
5208 if (bs
->iostatus
== BLOCK_DEVICE_IO_STATUS_OK
) {
5209 bs
->iostatus
= error
== ENOSPC
? BLOCK_DEVICE_IO_STATUS_NOSPACE
:
5210 BLOCK_DEVICE_IO_STATUS_FAILED
;
/* Begin accounting one I/O operation: remember size, start time and type
 * in the caller-provided cookie (return-type line elided by extraction). */
5215 bdrv_acct_start(BlockDriverState
*bs
, BlockAcctCookie
*cookie
, int64_t bytes
,
5216 enum BlockAcctType type
)
5218 assert(type
< BDRV_MAX_IOTYPE
);
5220 cookie
->bytes
= bytes
;
5221 cookie
->start_time_ns
= get_clock();
5222 cookie
->type
= type
;
/* Finish accounting: fold the cookie's byte count, op count and latency
 * into the per-type statistics of bs. */
5226 bdrv_acct_done(BlockDriverState
*bs
, BlockAcctCookie
*cookie
)
5228 assert(cookie
->type
< BDRV_MAX_IOTYPE
);
5230 bs
->nr_bytes
[cookie
->type
] += cookie
->bytes
;
5231 bs
->nr_ops
[cookie
->type
]++;
5232 bs
->total_time_ns
[cookie
->type
] += get_clock() - cookie
->start_time_ns
;
/* Create a disk image: resolve format and protocol drivers, parse the
 * option string, validate/derive the size (possibly from the backing
 * file), then call the driver's create routine. Errors are reported via
 * errp. Many lines elided by extraction. */
5235 void bdrv_img_create(const char *filename
, const char *fmt
,
5236 const char *base_filename
, const char *base_fmt
,
5237 char *options
, uint64_t img_size
, int flags
,
5238 Error
**errp
, bool quiet
)
5240 QEMUOptionParameter
*param
= NULL
, *create_options
= NULL
;
5241 QEMUOptionParameter
*backing_fmt
, *backing_file
, *size
;
5242 BlockDriver
*drv
, *proto_drv
;
5243 BlockDriver
*backing_drv
= NULL
;
5244 Error
*local_err
= NULL
;
5247 /* Find driver and parse its options */
5248 drv
= bdrv_find_format(fmt
);
5250 error_setg(errp
, "Unknown file format '%s'", fmt
);
5254 proto_drv
= bdrv_find_protocol(filename
, true);
5256 error_setg(errp
, "Unknown protocol '%s'", filename
);
/* merge format-level and protocol-level create options */
5260 create_options
= append_option_parameters(create_options
,
5261 drv
->create_options
);
5262 create_options
= append_option_parameters(create_options
,
5263 proto_drv
->create_options
);
5265 /* Create parameter list with default values */
5266 param
= parse_option_parameters("", create_options
, param
);
5268 set_option_parameter_int(param
, BLOCK_OPT_SIZE
, img_size
);
5270 /* Parse -o options */
5272 param
= parse_option_parameters(options
, create_options
, param
);
5273 if (param
== NULL
) {
5274 error_setg(errp
, "Invalid options for file format '%s'.", fmt
);
5279 if (base_filename
) {
5280 if (set_option_parameter(param
, BLOCK_OPT_BACKING_FILE
,
5282 error_setg(errp
, "Backing file not supported for file format '%s'",
5289 if (set_option_parameter(param
, BLOCK_OPT_BACKING_FMT
, base_fmt
)) {
5290 error_setg(errp
, "Backing file format not supported for file "
5291 "format '%s'", fmt
);
5296 backing_file
= get_option_parameter(param
, BLOCK_OPT_BACKING_FILE
);
5297 if (backing_file
&& backing_file
->value
.s
) {
5298 if (!strcmp(filename
, backing_file
->value
.s
)) {
5299 error_setg(errp
, "Error: Trying to create an image with the "
5300 "same filename as the backing file");
5305 backing_fmt
= get_option_parameter(param
, BLOCK_OPT_BACKING_FMT
);
5306 if (backing_fmt
&& backing_fmt
->value
.s
) {
5307 backing_drv
= bdrv_find_format(backing_fmt
->value
.s
);
5309 error_setg(errp
, "Unknown backing file format '%s'",
5310 backing_fmt
->value
.s
);
5315 // The size for the image must always be specified, with one exception:
5316 // If we are using a backing file, we can obtain the size from there
5317 size
= get_option_parameter(param
, BLOCK_OPT_SIZE
);
5318 if (size
&& size
->value
.n
== -1) {
5319 if (backing_file
&& backing_file
->value
.s
) {
5320 BlockDriverState
*bs
;
5325 /* backing files always opened read-only */
5327 flags
& ~(BDRV_O_RDWR
| BDRV_O_SNAPSHOT
| BDRV_O_NO_BACKING
);
5330 ret
= bdrv_open(&bs
, backing_file
->value
.s
, NULL
, NULL
, back_flags
,
5331 backing_drv
, &local_err
);
5333 error_setg_errno(errp
, -ret
, "Could not open '%s': %s",
5334 backing_file
->value
.s
,
5335 error_get_pretty(local_err
));
5336 error_free(local_err
);
5340 bdrv_get_geometry(bs
, &size
);
5343 snprintf(buf
, sizeof(buf
), "%" PRId64
, size
);
5344 set_option_parameter(param
, BLOCK_OPT_SIZE
, buf
);
5348 error_setg(errp
, "Image creation needs a size parameter");
5354 printf("Formatting '%s', fmt=%s ", filename
, fmt
);
5355 print_option_parameters(param
);
5358 ret
= bdrv_create(drv
, filename
, param
, &local_err
);
5359 if (ret
== -EFBIG
) {
5360 /* This is generally a better message than whatever the driver would
5361 * deliver (especially because of the cluster_size_hint), since that
5362 * is most probably not much different from "image too large". */
5363 const char *cluster_size_hint
= "";
5364 if (get_option_parameter(create_options
, BLOCK_OPT_CLUSTER_SIZE
)) {
5365 cluster_size_hint
= " (try using a larger cluster size)";
5367 error_setg(errp
, "The image size is too large for file format '%s'"
5368 "%s", fmt
, cluster_size_hint
);
5369 error_free(local_err
);
5374 free_option_parameters(create_options
);
5375 free_option_parameters(param
);
5378 error_propagate(errp
, local_err
);
/* AioContext that processes this BDS's requests. */
5382 AioContext
*bdrv_get_aio_context(BlockDriverState
*bs
)
5384 /* Currently BlockDriverState always uses the main loop AioContext */
5385 return qemu_get_aio_context();
/* Register a notifier invoked before every write to bs (used e.g. by
 * backup jobs for copy-on-write). */
5388 void bdrv_add_before_write_notifier(BlockDriverState
*bs
,
5389 NotifierWithReturn
*notifier
)
5391 notifier_with_return_list_add(&bs
->before_write_notifiers
, notifier
);
/* Change image creation options of an existing image via the driver hook
 * (the -ENOTSUP return for missing hooks is elided by extraction). */
5394 int bdrv_amend_options(BlockDriverState
*bs
, QEMUOptionParameter
*options
)
5396 if (bs
->drv
->bdrv_amend_options
== NULL
) {
5399 return bs
->drv
->bdrv_amend_options(bs
, options
);
5402 /* Used to recurse on single child block filters.
5403 * Single child block filter will store their child in bs->file.
/* (several return statements elided by extraction) */
5405 bool bdrv_generic_is_first_non_filter(BlockDriverState
*bs
,
5406 BlockDriverState
*candidate
)
5412 if (!bs
->drv
->authorizations
[BS_IS_A_FILTER
]) {
5413 if (bs
== candidate
) {
5420 if (!bs
->drv
->authorizations
[BS_FILTER_PASS_DOWN
]) {
5428 return bdrv_recurse_is_first_non_filter(bs
->file
, candidate
);
5431 bool bdrv_recurse_is_first_non_filter(BlockDriverState
*bs
,
5432 BlockDriverState
*candidate
)
5434 if (bs
->drv
&& bs
->drv
->bdrv_recurse_is_first_non_filter
) {
5435 return bs
->drv
->bdrv_recurse_is_first_non_filter(bs
, candidate
);
5438 return bdrv_generic_is_first_non_filter(bs
, candidate
);
5441 /* This function checks if the candidate is the first non filter bs down it's
5442 * bs chain. Since we don't have pointers to parents it explore all bs chains
5443 * from the top. Some filters can choose not to pass down the recursion.
5445 bool bdrv_is_first_non_filter(BlockDriverState
*candidate
)
5447 BlockDriverState
*bs
;
5449 /* walk down the bs forest recursively */
5450 QTAILQ_FOREACH(bs
, &bdrv_states
, device_list
) {
5453 perm
= bdrv_recurse_is_first_non_filter(bs
, candidate
);
5455 /* candidate is the first non filter */