block: vpc image file reopen
[qemu/ar7.git] / block.c
blob c7c1a3bd331b508b0b3d0b0571de7ab9c0403dd3
/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "module.h"
#include "qjson.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end = 0;
    bs->slice_time = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* We aim to keep each request's timing in FIFO order: the next throttled
     * request will not be dequeued until the current request is allowed to be
     * serviced. So if the current request still exceeds the limits, it is
     * re-inserted at the head; all requests that follow it remain in the
     * throttled_reqs queue.
     */
    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

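/* Illustrative cases (not part of the original file): strcspn() finds the
 * first ':' or path separator, and a name only counts as having a protocol
 * when the ':' comes before any separator, e.g.
 *
 *     path_has_protocol("nbd:unix:/tmp/sock")   -> 1  (':' seen first)
 *     path_has_protocol("/var/lib/image.qcow2") -> 0  ('/' seen first)
 *     path_has_protocol("c:\\image.raw")        -> 0  (Windows drive prefix)
 */
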
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

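/* Illustrative example (not part of the original file):
 *
 *     path_combine(dest, sizeof(dest),
 *                  "/images/base.qcow2", "backing.qcow2");
 *     -> dest == "/images/backing.qcow2"
 *
 * The directory part of base_path (up to the last separator) is copied and
 * the relative filename appended; an absolute filename is copied unchanged.
 */
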
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
    };

    if (!drv->bdrv_create) {
        return -ENOTSUP;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    g_free(cco.filename);

    return ret;
}

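/* Illustrative usage sketch (not part of the original file): a caller outside
 * coroutine context blocks in the qemu_aio_wait() loop until the coroutine
 * replaces the NOT_DONE sentinel with a real return value, e.g.
 *
 *     QEMUOptionParameter *opts =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1024 * 1024);
 *     int ret = bdrv_create(drv, "test.img", opts);
 *     free_option_parameters(opts);
 *
 * The same sync-over-coroutine pattern appears again in bdrv_rw_co() below.
 */
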
int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

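/* Illustrative mapping (not part of the original file): each cache= mode
 * resolves to a flag combination, e.g.
 *
 *     int flags = 0;
 *     bdrv_parse_cache_flags("none", &flags);
 *     -> flags == (BDRV_O_NOCACHE | BDRV_O_CACHE_WB)
 *
 *     mode          NOCACHE  CACHE_WB  NO_FLUSH
 *     writethrough     -        -         -
 *     writeback        -        x         -
 *     none/off         x        x         -
 *     directsync       x        -         -
 *     unsafe           -        x         x
 */
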
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);
    assert(bs->file == NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->open_flags = flags;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
    open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif

    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            return ret;
        }

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    bs->keep_read_only = !(flags & BDRV_O_ALLOW_RDWR);

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT() already performed, or alternatively may be NULL, in which
 * case a new BlockReopenQueue will be created and initialized. This newly
 * created BlockReopenQueue should be passed back in for subsequent calls
 * that are intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

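/* Illustrative usage sketch (not part of the original file): queueing two
 * devices and reopening them as one atomic set.
 *
 *     Error *err = NULL;
 *     BlockReopenQueue *queue = NULL;
 *     queue = bdrv_reopen_queue(queue, bs_a, bs_a->open_flags | BDRV_O_RDWR);
 *     queue = bdrv_reopen_queue(queue, bs_b, bs_b->open_flags | BDRV_O_RDWR);
 *     if (bdrv_reopen_multiple(queue, &err) < 0) {
 *         // every queued device keeps its old state
 *     }
 *
 * bdrv_reopen_multiple() (below) frees the queue and its entries.
 */
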
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags. All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_set(errp, QERR_OPEN_FILE_FAILED,
                          reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}

void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->drv) {
        if (bs->job) {
            block_job_cancel_sync(bs->job);
        }
        bdrv_drain_all();

        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* make a BlockDriverState anonymous by removing it from the bdrv_states list.
   Also, NULL terminate the device_name to prevent a double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->buffer_alignment   = bs_src->buffer_alignment;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o timing parameters */
    bs_dest->slice_time         = bs_src->slice_time;
    bs_dest->slice_start        = bs_src->slice_start;
    bs_dest->slice_end          = bs_src->slice_end;
    bs_dest->io_limits          = bs_src->io_limits;
    bs_dest->io_base            = bs_src->io_base;
    bs_dest->throttled_reqs     = bs_src->throttled_reqs;
    bs_dest->block_timer        = bs_src->block_timer;
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_count        = bs_src->dirty_count;
    bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(bs_new->dirty_bitmap == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}

void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               BlockQMPEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    BlockDriver *backing_drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0, rw_ret = 0;
    uint8_t *buf;
    char filename[1024];
    BlockDriverState *bs_rw, *bs_ro;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bs->backing_hd->keep_read_only) {
        return -EACCES;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    backing_drv = bs->backing_hd->drv;
    ro = bs->backing_hd->read_only;
    strncpy(filename, bs->backing_hd->filename, sizeof(filename));
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        /* re-open as RW */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_rw = bdrv_new("");
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
            backing_drv);
        if (rw_ret < 0) {
            bdrv_delete(bs_rw);
            /* try to re-open read-only */
            bs_ro = bdrv_new("");
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                backing_drv);
            if (ret < 0) {
                bdrv_delete(bs_ro);
                /* drive not functional anymore */
                bs->drv = NULL;
                return ret;
            }
            bs->backing_hd = bs_ro;
            return rw_ret;
        }
        bs->backing_hd = bs_rw;
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* re-open as RO */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_ro = bdrv_new("");
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
            backing_drv);
        if (ret < 0) {
            bdrv_delete(bs_ro);
            /* drive not functional anymore */
            bs->drv = NULL;
            return ret;
        }
        bs->backing_hd = bs_ro;
        bs->backing_hd->keep_read_only = 0;
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_commit(bs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

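/* Illustrative arithmetic (not part of the original file): with a 64 KiB
 * cluster (c = 65536 / 512 = 128 sectors), a request covering sectors
 * [100, 150) is widened to the enclosing cluster boundaries:
 *
 *     *cluster_sector_num = QEMU_ALIGN_DOWN(100, 128)        = 0
 *     *cluster_nb_sectors = QEMU_ALIGN_UP(100 - 0 + 50, 128) = 256
 *
 * i.e. sectors [0, 256), the two clusters touched by the request.
 */
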
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

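/* Illustrative cases (not part of the original file): two sector ranges
 * overlap unless one ends at or before the start of the other, e.g. with
 * req covering [8, 16):
 *
 *     tracked_request_overlaps(req, 12, 8) -> true   ([12, 20) intersects)
 *     tracked_request_overlaps(req, 16, 4) -> false  (adjacent ranges)
 */
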
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

#define BITS_PER_LONG  (sizeof(unsigned long) * 8)

static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / BITS_PER_LONG;
        bit = start % BITS_PER_LONG;
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}

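/* Illustrative arithmetic (not part of the original file): one bitmap bit
 * covers BDRV_SECTORS_PER_DIRTY_CHUNK sectors. Assuming 2048-sector chunks
 * on a 64-bit host, set_dirty_bitmap(bs, 4000, 300, 1) touches chunks 1..2:
 *
 *     start = 4000 / 2048 = 1,  end = (4000 + 300 - 1) / 2048 = 2
 *     chunk 1 -> idx = 0, bit = 1;  chunk 2 -> idx = 0, bit = 2
 *
 * so bits 1 and 2 of dirty_bitmap[0] are set and dirty_count grows by two
 * (if they were previously clear).
 */
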
/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}

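/* Illustrative breakdown (not part of the original file): with 512-byte
 * sectors, bdrv_pread(bs, 1000, buf, 2000) is split into three phases:
 *
 *     head: sector 1, bytes [1000, 1024) ->   24 bytes via tmp_buf
 *     bulk: sectors 2..4, bytes [1024, 2560) -> 1536 bytes read in place
 *     tail: sector 5, bytes [2560, 3000) ->  440 bytes via tmp_buf
 *
 * bdrv_pwrite() below uses the same split, but read-modify-writes the
 * partial head and tail sectors.
 */
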
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

2318 * Length of an allocated file in bytes. Sparse files are counted by actual
2319 * allocated space. Return < 0 on error or if unknown.
2321 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2323 BlockDriver *drv = bs->drv;
2324 if (!drv) {
2325 return -ENOMEDIUM;
2327 if (drv->bdrv_get_allocated_file_size) {
2328 return drv->bdrv_get_allocated_file_size(bs);
2330 if (bs->file) {
2331 return bdrv_get_allocated_file_size(bs->file);
2333 return -ENOTSUP;
2337 * Length of a file in bytes. Return < 0 if error or unknown.
2339 int64_t bdrv_getlength(BlockDriverState *bs)
2341 BlockDriver *drv = bs->drv;
2342 if (!drv)
2343 return -ENOMEDIUM;
2345 if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2346 if (drv->bdrv_getlength) {
2347 return drv->bdrv_getlength(bs);
2350 return bs->total_sectors * BDRV_SECTOR_SIZE;
2353 /* Return 0 as the number of sectors if no device is present or on error */
2354 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2356 int64_t length;
2357 length = bdrv_getlength(bs);
2358 if (length < 0)
2359 length = 0;
2360 else
2361 length = length >> BDRV_SECTOR_BITS;
2362 *nb_sectors_ptr = length;
2365 /* throttling disk io limits */
2366 void bdrv_set_io_limits(BlockDriverState *bs,
2367 BlockIOLimit *io_limits)
2369 bs->io_limits = *io_limits;
2370 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2373 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2374 BlockErrorAction on_write_error)
2376 bs->on_read_error = on_read_error;
2377 bs->on_write_error = on_write_error;
2380 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2382 return is_read ? bs->on_read_error : bs->on_write_error;
2385 int bdrv_is_read_only(BlockDriverState *bs)
2387 return bs->read_only;
2390 int bdrv_is_sg(BlockDriverState *bs)
2392 return bs->sg;
2395 int bdrv_enable_write_cache(BlockDriverState *bs)
2397 return bs->enable_write_cache;
2400 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
2402 bs->enable_write_cache = wce;
2404 /* so a reopen() will preserve wce */
2405 if (wce) {
2406 bs->open_flags |= BDRV_O_CACHE_WB;
2407 } else {
2408 bs->open_flags &= ~BDRV_O_CACHE_WB;
2412 int bdrv_is_encrypted(BlockDriverState *bs)
2414 if (bs->backing_hd && bs->backing_hd->encrypted)
2415 return 1;
2416 return bs->encrypted;
2419 int bdrv_key_required(BlockDriverState *bs)
2421 BlockDriverState *backing_hd = bs->backing_hd;
2423 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2424 return 1;
2425 return (bs->encrypted && !bs->valid_key);
2428 int bdrv_set_key(BlockDriverState *bs, const char *key)
2430 int ret;
2431 if (bs->backing_hd && bs->backing_hd->encrypted) {
2432 ret = bdrv_set_key(bs->backing_hd, key);
2433 if (ret < 0)
2434 return ret;
2435 if (!bs->encrypted)
2436 return 0;
2438 if (!bs->encrypted) {
2439 return -EINVAL;
2440 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2441 return -ENOMEDIUM;
2443 ret = bs->drv->bdrv_set_key(bs, key);
2444 if (ret < 0) {
2445 bs->valid_key = 0;
2446 } else if (!bs->valid_key) {
2447 bs->valid_key = 1;
2448 /* call the change callback now, we skipped it on open */
2449 bdrv_dev_change_media_cb(bs, true);
2451 return ret;
2454 const char *bdrv_get_format_name(BlockDriverState *bs)
2456 return bs->drv ? bs->drv->format_name : NULL;
2459 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2460 void *opaque)
2462 BlockDriver *drv;
2464 QLIST_FOREACH(drv, &bdrv_drivers, list) {
2465 it(opaque, drv->format_name);
2469 BlockDriverState *bdrv_find(const char *name)
2471 BlockDriverState *bs;
2473 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2474 if (!strcmp(name, bs->device_name)) {
2475 return bs;
2478 return NULL;
2481 BlockDriverState *bdrv_next(BlockDriverState *bs)
2483 if (!bs) {
2484 return QTAILQ_FIRST(&bdrv_states);
2486 return QTAILQ_NEXT(bs, list);
2489 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2491 BlockDriverState *bs;
2493 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2494 it(opaque, bs);
2498 const char *bdrv_get_device_name(BlockDriverState *bs)
2500 return bs->device_name;
2503 int bdrv_get_flags(BlockDriverState *bs)
2505 return bs->open_flags;
2508 void bdrv_flush_all(void)
2510 BlockDriverState *bs;
2512 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2513 bdrv_flush(bs);
2517 int bdrv_has_zero_init(BlockDriverState *bs)
2519 assert(bs->drv);
2521 if (bs->drv->bdrv_has_zero_init) {
2522 return bs->drv->bdrv_has_zero_init(bs);
2525 return 1;
2528 typedef struct BdrvCoIsAllocatedData {
2529 BlockDriverState *bs;
2530 int64_t sector_num;
2531 int nb_sectors;
2532 int *pnum;
2533 int ret;
2534 bool done;
2535 } BdrvCoIsAllocatedData;
2538 * Returns true iff the specified sector is present in the disk image. Drivers
2539 * not implementing the functionality are assumed to not support backing files,
2540 * hence all their sectors are reported as allocated.
2542 * If 'sector_num' is beyond the end of the disk image the return value is 0
2543 * and 'pnum' is set to 0.
2545 * 'pnum' is set to the number of sectors (including and immediately following
2546 * the specified sector) that are known to be in the same
2547 * allocated/unallocated state.
2549 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2550 * beyond the end of the disk image it will be clamped.
2552 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2553 int nb_sectors, int *pnum)
2555 int64_t n;
2557 if (sector_num >= bs->total_sectors) {
2558 *pnum = 0;
2559 return 0;
2562 n = bs->total_sectors - sector_num;
2563 if (n < nb_sectors) {
2564 nb_sectors = n;
2567 if (!bs->drv->bdrv_co_is_allocated) {
2568 *pnum = nb_sectors;
2569 return 1;
2572 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2575 /* Coroutine wrapper for bdrv_is_allocated() */
2576 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2578 BdrvCoIsAllocatedData *data = opaque;
2579 BlockDriverState *bs = data->bs;
2581 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2582 data->pnum);
2583 data->done = true;
2587 * Synchronous wrapper around bdrv_co_is_allocated().
2589 * See bdrv_co_is_allocated() for details.
2591 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2592 int *pnum)
2594 Coroutine *co;
2595 BdrvCoIsAllocatedData data = {
2596 .bs = bs,
2597 .sector_num = sector_num,
2598 .nb_sectors = nb_sectors,
2599 .pnum = pnum,
2600 .done = false,
2603 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2604 qemu_coroutine_enter(co, &data);
2605 while (!data.done) {
2606 qemu_aio_wait();
2608 return data.ret;
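/*
 * Example (illustrative sketch, not part of the original file): walk an
 * image with bdrv_is_allocated(), one allocated/unallocated run at a
 * time.  'example_scan_allocation' and the chunk limit are hypothetical.
 */
static void example_scan_allocation(BlockDriverState *bs)
{
    int64_t total = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    int64_t sector = 0;

    while (sector < total) {
        int n = (total - sector > 65536) ? 65536 : (int)(total - sector);
        int pnum;
        int ret = bdrv_is_allocated(bs, sector, n, &pnum);
        if (ret < 0 || pnum == 0) {
            break;  /* error, or query ran past the end of the image */
        }
        /* ret != 0: [sector, sector + pnum) is allocated in this image */
        sector += pnum;
    }
}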
2612 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2614 * Return true if the given sector is allocated in any image between
2615 * BASE and TOP (inclusive). BASE can be NULL to check if the given
2616 * sector is allocated in any image of the chain. Return false otherwise.
2618 * 'pnum' is set to the number of sectors (including and immediately following
2619 * the specified sector) that are known to be in the same
2620 * allocated/unallocated state.
2623 int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
2624 BlockDriverState *base,
2625 int64_t sector_num,
2626 int nb_sectors, int *pnum)
2628 BlockDriverState *intermediate;
2629 int ret, n = nb_sectors;
2631 intermediate = top;
2632 while (intermediate && intermediate != base) {
2633 int pnum_inter;
2634 ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
2635 &pnum_inter);
2636 if (ret < 0) {
2637 return ret;
2638 } else if (ret) {
2639 *pnum = pnum_inter;
2640 return 1;
2644 * [sector_num, nb_sectors] is unallocated on top but intermediate
2645 * might have
2647 * [sector_num+x, nb_sectors-x] allocated.
2649 if (n > pnum_inter) {
2650 n = pnum_inter;
2653 intermediate = intermediate->backing_hd;
2656 *pnum = n;
2657 return 0;
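/*
 * Example (sketch, hypothetical helper): ask whether a single sector is
 * backed anywhere in the chain below 'top'.  Must run in coroutine
 * context, like all *_co_* functions in this file.
 */
static int coroutine_fn example_sector_backed(BlockDriverState *top,
                                              int64_t sector_num)
{
    int pnum;

    /* base == NULL scans the entire backing chain */
    return bdrv_co_is_allocated_above(top, NULL, sector_num, 1, &pnum);
}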
2660 BlockInfoList *qmp_query_block(Error **errp)
2662 BlockInfoList *head = NULL, *cur_item = NULL;
2663 BlockDriverState *bs;
2665 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2666 BlockInfoList *info = g_malloc0(sizeof(*info));
2668 info->value = g_malloc0(sizeof(*info->value));
2669 info->value->device = g_strdup(bs->device_name);
2670 info->value->type = g_strdup("unknown");
2671 info->value->locked = bdrv_dev_is_medium_locked(bs);
2672 info->value->removable = bdrv_dev_has_removable_media(bs);
2674 if (bdrv_dev_has_removable_media(bs)) {
2675 info->value->has_tray_open = true;
2676 info->value->tray_open = bdrv_dev_is_tray_open(bs);
2679 if (bdrv_iostatus_is_enabled(bs)) {
2680 info->value->has_io_status = true;
2681 info->value->io_status = bs->iostatus;
2684 if (bs->drv) {
2685 info->value->has_inserted = true;
2686 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2687 info->value->inserted->file = g_strdup(bs->filename);
2688 info->value->inserted->ro = bs->read_only;
2689 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2690 info->value->inserted->encrypted = bs->encrypted;
2691 info->value->inserted->encryption_key_missing = bdrv_key_required(bs);
2692 if (bs->backing_file[0]) {
2693 info->value->inserted->has_backing_file = true;
2694 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2697 info->value->inserted->backing_file_depth =
2698 bdrv_get_backing_file_depth(bs);
2700 if (bs->io_limits_enabled) {
2701 info->value->inserted->bps =
2702 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2703 info->value->inserted->bps_rd =
2704 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2705 info->value->inserted->bps_wr =
2706 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2707 info->value->inserted->iops =
2708 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2709 info->value->inserted->iops_rd =
2710 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2711 info->value->inserted->iops_wr =
2712 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2716 /* XXX: waiting for the qapi to support GSList */
2717 if (!cur_item) {
2718 head = cur_item = info;
2719 } else {
2720 cur_item->next = info;
2721 cur_item = info;
2725 return head;
2728 /* Consider exposing this as a full-fledged QMP command */
2729 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2731 BlockStats *s;
2733 s = g_malloc0(sizeof(*s));
2735 if (bs->device_name[0]) {
2736 s->has_device = true;
2737 s->device = g_strdup(bs->device_name);
2740 s->stats = g_malloc0(sizeof(*s->stats));
2741 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2742 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2743 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2744 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2745 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2746 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2747 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2748 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2749 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2751 if (bs->file) {
2752 s->has_parent = true;
2753 s->parent = qmp_query_blockstat(bs->file, NULL);
2756 return s;
2759 BlockStatsList *qmp_query_blockstats(Error **errp)
2761 BlockStatsList *head = NULL, *cur_item = NULL;
2762 BlockDriverState *bs;
2764 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2765 BlockStatsList *info = g_malloc0(sizeof(*info));
2766 info->value = qmp_query_blockstat(bs, NULL);
2768 /* XXX: waiting for the qapi to support GSList */
2769 if (!cur_item) {
2770 head = cur_item = info;
2771 } else {
2772 cur_item->next = info;
2773 cur_item = info;
2777 return head;
2780 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2782 if (bs->backing_hd && bs->backing_hd->encrypted)
2783 return bs->backing_file;
2784 else if (bs->encrypted)
2785 return bs->filename;
2786 else
2787 return NULL;
2790 void bdrv_get_backing_filename(BlockDriverState *bs,
2791 char *filename, int filename_size)
2793 pstrcpy(filename, filename_size, bs->backing_file);
2796 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2797 const uint8_t *buf, int nb_sectors)
2799 BlockDriver *drv = bs->drv;
2800 if (!drv)
2801 return -ENOMEDIUM;
2802 if (!drv->bdrv_write_compressed)
2803 return -ENOTSUP;
2804 if (bdrv_check_request(bs, sector_num, nb_sectors))
2805 return -EIO;
2807 if (bs->dirty_bitmap) {
2808 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2811 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2814 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2816 BlockDriver *drv = bs->drv;
2817 if (!drv)
2818 return -ENOMEDIUM;
2819 if (!drv->bdrv_get_info)
2820 return -ENOTSUP;
2821 memset(bdi, 0, sizeof(*bdi));
2822 return drv->bdrv_get_info(bs, bdi);
2825 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2826 int64_t pos, int size)
2828 BlockDriver *drv = bs->drv;
2829 if (!drv)
2830 return -ENOMEDIUM;
2831 if (drv->bdrv_save_vmstate)
2832 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2833 if (bs->file)
2834 return bdrv_save_vmstate(bs->file, buf, pos, size);
2835 return -ENOTSUP;
2838 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2839 int64_t pos, int size)
2841 BlockDriver *drv = bs->drv;
2842 if (!drv)
2843 return -ENOMEDIUM;
2844 if (drv->bdrv_load_vmstate)
2845 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2846 if (bs->file)
2847 return bdrv_load_vmstate(bs->file, buf, pos, size);
2848 return -ENOTSUP;
2851 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2853 BlockDriver *drv = bs->drv;
2855 if (!drv || !drv->bdrv_debug_event) {
2856 return;
2859 drv->bdrv_debug_event(bs, event);
2863 /**************************************************************/
2864 /* handling of snapshots */
2866 int bdrv_can_snapshot(BlockDriverState *bs)
2868 BlockDriver *drv = bs->drv;
2869 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2870 return 0;
2873 if (!drv->bdrv_snapshot_create) {
2874 if (bs->file != NULL) {
2875 return bdrv_can_snapshot(bs->file);
2877 return 0;
2880 return 1;
2883 int bdrv_is_snapshot(BlockDriverState *bs)
2885 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2888 BlockDriverState *bdrv_snapshots(void)
2890 BlockDriverState *bs;
2892 if (bs_snapshots) {
2893 return bs_snapshots;
2896 bs = NULL;
2897 while ((bs = bdrv_next(bs))) {
2898 if (bdrv_can_snapshot(bs)) {
2899 bs_snapshots = bs;
2900 return bs;
2903 return NULL;
2906 int bdrv_snapshot_create(BlockDriverState *bs,
2907 QEMUSnapshotInfo *sn_info)
2909 BlockDriver *drv = bs->drv;
2910 if (!drv)
2911 return -ENOMEDIUM;
2912 if (drv->bdrv_snapshot_create)
2913 return drv->bdrv_snapshot_create(bs, sn_info);
2914 if (bs->file)
2915 return bdrv_snapshot_create(bs->file, sn_info);
2916 return -ENOTSUP;
2919 int bdrv_snapshot_goto(BlockDriverState *bs,
2920 const char *snapshot_id)
2922 BlockDriver *drv = bs->drv;
2923 int ret, open_ret;
2925 if (!drv)
2926 return -ENOMEDIUM;
2927 if (drv->bdrv_snapshot_goto)
2928 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2930 if (bs->file) {
2931 drv->bdrv_close(bs);
2932 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2933 open_ret = drv->bdrv_open(bs, bs->open_flags);
2934 if (open_ret < 0) {
2935 bdrv_delete(bs->file);
2936 bs->drv = NULL;
2937 return open_ret;
2939 return ret;
2942 return -ENOTSUP;
2945 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2947 BlockDriver *drv = bs->drv;
2948 if (!drv)
2949 return -ENOMEDIUM;
2950 if (drv->bdrv_snapshot_delete)
2951 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2952 if (bs->file)
2953 return bdrv_snapshot_delete(bs->file, snapshot_id);
2954 return -ENOTSUP;
2957 int bdrv_snapshot_list(BlockDriverState *bs,
2958 QEMUSnapshotInfo **psn_info)
2960 BlockDriver *drv = bs->drv;
2961 if (!drv)
2962 return -ENOMEDIUM;
2963 if (drv->bdrv_snapshot_list)
2964 return drv->bdrv_snapshot_list(bs, psn_info);
2965 if (bs->file)
2966 return bdrv_snapshot_list(bs->file, psn_info);
2967 return -ENOTSUP;
2970 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2971 const char *snapshot_name)
2973 BlockDriver *drv = bs->drv;
2974 if (!drv) {
2975 return -ENOMEDIUM;
2977 if (!bs->read_only) {
2978 return -EINVAL;
2980 if (drv->bdrv_snapshot_load_tmp) {
2981 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2983 return -ENOTSUP;
2986 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2987 const char *backing_file)
2989 if (!bs->drv) {
2990 return NULL;
2993 if (bs->backing_hd) {
2994 if (strcmp(bs->backing_file, backing_file) == 0) {
2995 return bs->backing_hd;
2996 } else {
2997 return bdrv_find_backing_image(bs->backing_hd, backing_file);
3001 return NULL;
3004 int bdrv_get_backing_file_depth(BlockDriverState *bs)
3006 if (!bs->drv) {
3007 return 0;
3010 if (!bs->backing_hd) {
3011 return 0;
3014 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
3017 #define NB_SUFFIXES 4
3019 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
3021 static const char suffixes[NB_SUFFIXES] = "KMGT";
3022 int64_t base;
3023 int i;
3025 if (size <= 999) {
3026 snprintf(buf, buf_size, "%" PRId64, size);
3027 } else {
3028 base = 1024;
3029 for(i = 0; i < NB_SUFFIXES; i++) {
3030 if (size < (10 * base)) {
3031 snprintf(buf, buf_size, "%0.1f%c",
3032 (double)size / base,
3033 suffixes[i]);
3034 break;
3035 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
3036 snprintf(buf, buf_size, "%" PRId64 "%c",
3037 ((size + (base >> 1)) / base),
3038 suffixes[i]);
3039 break;
3041 base = base * 1024;
3044 return buf;
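/*
 * Example (illustrative, not in the original file): what
 * get_human_readable_size() produces; each value follows directly from
 * the branches above.
 */
static void example_print_sizes(void)
{
    char buf[32];

    printf("%s\n", get_human_readable_size(buf, sizeof(buf), 999));
    /* "999": at or below 999, printed verbatim with no suffix */
    printf("%s\n", get_human_readable_size(buf, sizeof(buf), 1536));
    /* "1.5K": below 10 * base, one decimal place */
    printf("%s\n", get_human_readable_size(buf, sizeof(buf),
                                           (int64_t)200 << 20));
    /* "200M": 10 * base and above, rounded to an integer */
}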
3047 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
3049 char buf1[128], date_buf[128], clock_buf[128];
3050 #ifdef _WIN32
3051 struct tm *ptm;
3052 #else
3053 struct tm tm;
3054 #endif
3055 time_t ti;
3056 int64_t secs;
3058 if (!sn) {
3059 snprintf(buf, buf_size,
3060 "%-10s%-20s%7s%20s%15s",
3061 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
3062 } else {
3063 ti = sn->date_sec;
3064 #ifdef _WIN32
3065 ptm = localtime(&ti);
3066 strftime(date_buf, sizeof(date_buf),
3067 "%Y-%m-%d %H:%M:%S", ptm);
3068 #else
3069 localtime_r(&ti, &tm);
3070 strftime(date_buf, sizeof(date_buf),
3071 "%Y-%m-%d %H:%M:%S", &tm);
3072 #endif
3073 secs = sn->vm_clock_nsec / 1000000000;
3074 snprintf(clock_buf, sizeof(clock_buf),
3075 "%02d:%02d:%02d.%03d",
3076 (int)(secs / 3600),
3077 (int)((secs / 60) % 60),
3078 (int)(secs % 60),
3079 (int)((sn->vm_clock_nsec / 1000000) % 1000));
3080 snprintf(buf, buf_size,
3081 "%-10s%-20s%7s%20s%15s",
3082 sn->id_str, sn->name,
3083 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
3084 date_buf,
3085 clock_buf);
3087 return buf;
3090 /**************************************************************/
3091 /* async I/Os */
3093 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3094 QEMUIOVector *qiov, int nb_sectors,
3095 BlockDriverCompletionFunc *cb, void *opaque)
3097 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3099 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3100 cb, opaque, false);
3103 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3104 QEMUIOVector *qiov, int nb_sectors,
3105 BlockDriverCompletionFunc *cb, void *opaque)
3107 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3109 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3110 cb, opaque, true);
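/*
 * Example (sketch, hypothetical caller): submit an asynchronous read and
 * pump the event loop until the completion callback has run.  The qiov
 * must stay valid until completion, which the wait loop guarantees here.
 */
static void example_read_done(void *opaque, int ret)
{
    *(int *)opaque = ret;   /* 0 on success, negative errno on error */
}

static int example_aio_read_one_sector(BlockDriverState *bs, uint8_t *buf)
{
    int status = -EINPROGRESS;
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len  = BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    if (!bdrv_aio_readv(bs, 0, &qiov, 1, example_read_done, &status)) {
        return -EIO;        /* request could not be submitted */
    }
    while (status == -EINPROGRESS) {
        qemu_aio_wait();    /* process completions until our cb has run */
    }
    return status;
}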
3114 typedef struct MultiwriteCB {
3115 int error;
3116 int num_requests;
3117 int num_callbacks;
3118 struct {
3119 BlockDriverCompletionFunc *cb;
3120 void *opaque;
3121 QEMUIOVector *free_qiov;
3122 } callbacks[];
3123 } MultiwriteCB;
3125 static void multiwrite_user_cb(MultiwriteCB *mcb)
3127 int i;
3129 for (i = 0; i < mcb->num_callbacks; i++) {
3130 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
3131 if (mcb->callbacks[i].free_qiov) {
3132 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
3134 g_free(mcb->callbacks[i].free_qiov);
3138 static void multiwrite_cb(void *opaque, int ret)
3140 MultiwriteCB *mcb = opaque;
3142 trace_multiwrite_cb(mcb, ret);
3144 if (ret < 0 && !mcb->error) {
3145 mcb->error = ret;
3148 mcb->num_requests--;
3149 if (mcb->num_requests == 0) {
3150 multiwrite_user_cb(mcb);
3151 g_free(mcb);
3155 static int multiwrite_req_compare(const void *a, const void *b)
3157 const BlockRequest *req1 = a, *req2 = b;
3160 * Note that we can't simply subtract req2->sector from req1->sector
3161 * here as that could overflow the return value.
3163 if (req1->sector > req2->sector) {
3164 return 1;
3165 } else if (req1->sector < req2->sector) {
3166 return -1;
3167 } else {
3168 return 0;
3173 * Takes a bunch of requests and tries to merge them. Returns the number of
3174 * requests that remain after merging.
3176 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3177 int num_reqs, MultiwriteCB *mcb)
3179 int i, outidx;
3181 // Sort requests by start sector
3182 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3184 // Check if adjacent requests touch the same clusters. If so, combine them,
3185 // filling up gaps with zero sectors.
3186 outidx = 0;
3187 for (i = 1; i < num_reqs; i++) {
3188 int merge = 0;
3189 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3191 // Handle exactly sequential writes and overlapping writes.
3192 if (reqs[i].sector <= oldreq_last) {
3193 merge = 1;
3196 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3197 merge = 0;
3200 if (merge) {
3201 size_t size;
3202 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3203 qemu_iovec_init(qiov,
3204 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3206 // Add the first request to the merged one. If the requests are
3207 // overlapping, drop the last sectors of the first request.
3208 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3209 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
3211 // We shouldn't need to add any zeros between the two requests
3212 assert(reqs[i].sector <= oldreq_last);
3214 // Add the second request
3215 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
3217 reqs[outidx].nb_sectors = qiov->size >> 9;
3218 reqs[outidx].qiov = qiov;
3220 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3221 } else {
3222 outidx++;
3223 reqs[outidx].sector = reqs[i].sector;
3224 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3225 reqs[outidx].qiov = reqs[i].qiov;
3229 return outidx + 1;
3233 * Submit multiple AIO write requests at once.
3235 * On success, the function returns 0 and all requests in the reqs array have
3236 * been submitted. In the error case this function returns -1, and any given
3237 * request may or may not have been submitted yet. In particular, this means
3238 * that the callback will be called for some of the requests but not for
3239 * others. The caller must check the error field of each BlockRequest to know
3240 * which callbacks to wait for (if error != 0, no callback will be called).
3242 * The implementation may modify the contents of the reqs array, e.g. to merge
3243 * requests. However, the fields opaque and error are left unmodified as they
3244 * are used to signal failure for a single request to the caller.
3246 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3248 MultiwriteCB *mcb;
3249 int i;
3251 /* don't submit writes if we don't have a medium */
3252 if (bs->drv == NULL) {
3253 for (i = 0; i < num_reqs; i++) {
3254 reqs[i].error = -ENOMEDIUM;
3256 return -1;
3259 if (num_reqs == 0) {
3260 return 0;
3263 // Create MultiwriteCB structure
3264 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3265 mcb->num_requests = 0;
3266 mcb->num_callbacks = num_reqs;
3268 for (i = 0; i < num_reqs; i++) {
3269 mcb->callbacks[i].cb = reqs[i].cb;
3270 mcb->callbacks[i].opaque = reqs[i].opaque;
3273 // Check for mergeable requests
3274 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3276 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3278 /* Run the aio requests. */
3279 mcb->num_requests = num_reqs;
3280 for (i = 0; i < num_reqs; i++) {
3281 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3282 reqs[i].nb_sectors, multiwrite_cb, mcb);
3285 return 0;
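/*
 * Example (sketch, hypothetical caller): batch two writes in one
 * bdrv_aio_multiwrite() call.  The array may be reordered and merged,
 * but .cb, .opaque and .error stay attached to each original request.
 */
static int example_multiwrite(BlockDriverState *bs,
                              QEMUIOVector *qiov_a, QEMUIOVector *qiov_b,
                              BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockRequest reqs[2] = {
        { .sector = 0,  .nb_sectors = qiov_a->size >> 9, .qiov = qiov_a,
          .cb = cb, .opaque = opaque },
        { .sector = 16, .nb_sectors = qiov_b->size >> 9, .qiov = qiov_b,
          .cb = cb, .opaque = opaque },
    };

    int ret = bdrv_aio_multiwrite(bs, reqs, 2);
    /* on ret < 0, wait for callbacks only where reqs[i].error == 0 */
    return ret;
}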
3288 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3290 acb->pool->cancel(acb);
3293 /* block I/O throttling */
3294 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3295 bool is_write, double elapsed_time, uint64_t *wait)
3297 uint64_t bps_limit = 0;
3298 double bytes_limit, bytes_base, bytes_res;
3299 double slice_time, wait_time;
3301 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3302 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3303 } else if (bs->io_limits.bps[is_write]) {
3304 bps_limit = bs->io_limits.bps[is_write];
3305 } else {
3306 if (wait) {
3307 *wait = 0;
3310 return false;
3313 slice_time = bs->slice_end - bs->slice_start;
3314 slice_time /= (NANOSECONDS_PER_SECOND);
3315 bytes_limit = bps_limit * slice_time;
3316 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3317 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3318 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3321 /* bytes_base: the bytes of data which have already been read/written,
3322 * obtained from the history statistics.
3323 * bytes_res: the remaining bytes of data which need to be read/written.
3324 * (bytes_base + bytes_res) / bps_limit: used to calculate
3325 * the total time for completing the reading/writing of all data.
3327 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3329 if (bytes_base + bytes_res <= bytes_limit) {
3330 if (wait) {
3331 *wait = 0;
3334 return false;
3337 /* Calculate the approximate time to dispatch */
3338 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3340 /* When the I/O rate at runtime exceeds the limits,
3341 * bs->slice_end needs to be extended so that the current statistics
3342 * can be kept until the timer fires; the extension factor is increased
3343 * and tuned experimentally.
3345 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3346 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3347 if (wait) {
3348 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3351 return true;
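/*
 * Worked example (illustrative numbers, not from the original source):
 * with bps_limit = 10 MB/s and a 0.1 s slice, bytes_limit is 1 MB.  If
 * 0.9 MB were already transferred in this slice (bytes_base) and the
 * request adds 0.3 MB (bytes_res), then (0.9 + 0.3) MB / 10 MB/s =
 * 0.12 s of slice time is needed in total, so the request waits roughly
 * 0.12 s - elapsed_time before it may dispatch.
 */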
3354 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3355 double elapsed_time, uint64_t *wait)
3357 uint64_t iops_limit = 0;
3358 double ios_limit, ios_base;
3359 double slice_time, wait_time;
3361 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3362 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3363 } else if (bs->io_limits.iops[is_write]) {
3364 iops_limit = bs->io_limits.iops[is_write];
3365 } else {
3366 if (wait) {
3367 *wait = 0;
3370 return false;
3373 slice_time = bs->slice_end - bs->slice_start;
3374 slice_time /= (NANOSECONDS_PER_SECOND);
3375 ios_limit = iops_limit * slice_time;
3376 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3377 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3378 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3381 if (ios_base + 1 <= ios_limit) {
3382 if (wait) {
3383 *wait = 0;
3386 return false;
3389 /* Calculate the approximate time to dispatch */
3390 wait_time = (ios_base + 1) / iops_limit;
3391 if (wait_time > elapsed_time) {
3392 wait_time = wait_time - elapsed_time;
3393 } else {
3394 wait_time = 0;
3397 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3398 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3399 if (wait) {
3400 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3403 return true;
3406 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3407 bool is_write, int64_t *wait)
3409 int64_t now, max_wait;
3410 uint64_t bps_wait = 0, iops_wait = 0;
3411 double elapsed_time;
3412 int bps_ret, iops_ret;
3414 now = qemu_get_clock_ns(vm_clock);
3415 if ((bs->slice_start < now)
3416 && (bs->slice_end > now)) {
3417 bs->slice_end = now + bs->slice_time;
3418 } else {
3419 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3420 bs->slice_start = now;
3421 bs->slice_end = now + bs->slice_time;
3423 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3424 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3426 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3427 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3430 elapsed_time = now - bs->slice_start;
3431 elapsed_time /= (NANOSECONDS_PER_SECOND);
3433 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3434 is_write, elapsed_time, &bps_wait);
3435 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3436 elapsed_time, &iops_wait);
3437 if (bps_ret || iops_ret) {
3438 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3439 if (wait) {
3440 *wait = max_wait;
3443 now = qemu_get_clock_ns(vm_clock);
3444 if (bs->slice_end < now + max_wait) {
3445 bs->slice_end = now + max_wait;
3448 return true;
3451 if (wait) {
3452 *wait = 0;
3455 return false;
3458 /**************************************************************/
3459 /* async block device emulation */
3461 typedef struct BlockDriverAIOCBSync {
3462 BlockDriverAIOCB common;
3463 QEMUBH *bh;
3464 int ret;
3465 /* vector translation state */
3466 QEMUIOVector *qiov;
3467 uint8_t *bounce;
3468 int is_write;
3469 } BlockDriverAIOCBSync;
3471 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3473 BlockDriverAIOCBSync *acb =
3474 container_of(blockacb, BlockDriverAIOCBSync, common);
3475 qemu_bh_delete(acb->bh);
3476 acb->bh = NULL;
3477 qemu_aio_release(acb);
3480 static AIOPool bdrv_em_aio_pool = {
3481 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3482 .cancel = bdrv_aio_cancel_em,
3485 static void bdrv_aio_bh_cb(void *opaque)
3487 BlockDriverAIOCBSync *acb = opaque;
3489 if (!acb->is_write)
3490 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
3491 qemu_vfree(acb->bounce);
3492 acb->common.cb(acb->common.opaque, acb->ret);
3493 qemu_bh_delete(acb->bh);
3494 acb->bh = NULL;
3495 qemu_aio_release(acb);
3498 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3499 int64_t sector_num,
3500 QEMUIOVector *qiov,
3501 int nb_sectors,
3502 BlockDriverCompletionFunc *cb,
3503 void *opaque,
3504 int is_write)
3507 BlockDriverAIOCBSync *acb;
3509 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3510 acb->is_write = is_write;
3511 acb->qiov = qiov;
3512 acb->bounce = qemu_blockalign(bs, qiov->size);
3513 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3515 if (is_write) {
3516 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
3517 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3518 } else {
3519 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3522 qemu_bh_schedule(acb->bh);
3524 return &acb->common;
3527 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3528 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3529 BlockDriverCompletionFunc *cb, void *opaque)
3531 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3534 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3535 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3536 BlockDriverCompletionFunc *cb, void *opaque)
3538 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3542 typedef struct BlockDriverAIOCBCoroutine {
3543 BlockDriverAIOCB common;
3544 BlockRequest req;
3545 bool is_write;
3546 QEMUBH* bh;
3547 } BlockDriverAIOCBCoroutine;
3549 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3551 qemu_aio_flush();
3554 static AIOPool bdrv_em_co_aio_pool = {
3555 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3556 .cancel = bdrv_aio_co_cancel_em,
3559 static void bdrv_co_em_bh(void *opaque)
3561 BlockDriverAIOCBCoroutine *acb = opaque;
3563 acb->common.cb(acb->common.opaque, acb->req.error);
3564 qemu_bh_delete(acb->bh);
3565 qemu_aio_release(acb);
3568 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3569 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3571 BlockDriverAIOCBCoroutine *acb = opaque;
3572 BlockDriverState *bs = acb->common.bs;
3574 if (!acb->is_write) {
3575 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3576 acb->req.nb_sectors, acb->req.qiov, 0);
3577 } else {
3578 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3579 acb->req.nb_sectors, acb->req.qiov, 0);
3582 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3583 qemu_bh_schedule(acb->bh);
3586 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3587 int64_t sector_num,
3588 QEMUIOVector *qiov,
3589 int nb_sectors,
3590 BlockDriverCompletionFunc *cb,
3591 void *opaque,
3592 bool is_write)
3594 Coroutine *co;
3595 BlockDriverAIOCBCoroutine *acb;
3597 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3598 acb->req.sector = sector_num;
3599 acb->req.nb_sectors = nb_sectors;
3600 acb->req.qiov = qiov;
3601 acb->is_write = is_write;
3603 co = qemu_coroutine_create(bdrv_co_do_rw);
3604 qemu_coroutine_enter(co, acb);
3606 return &acb->common;
3609 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3611 BlockDriverAIOCBCoroutine *acb = opaque;
3612 BlockDriverState *bs = acb->common.bs;
3614 acb->req.error = bdrv_co_flush(bs);
3615 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3616 qemu_bh_schedule(acb->bh);
3619 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3620 BlockDriverCompletionFunc *cb, void *opaque)
3622 trace_bdrv_aio_flush(bs, opaque);
3624 Coroutine *co;
3625 BlockDriverAIOCBCoroutine *acb;
3627 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3628 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3629 qemu_coroutine_enter(co, acb);
3631 return &acb->common;
3634 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3636 BlockDriverAIOCBCoroutine *acb = opaque;
3637 BlockDriverState *bs = acb->common.bs;
3639 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3640 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3641 qemu_bh_schedule(acb->bh);
3644 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3645 int64_t sector_num, int nb_sectors,
3646 BlockDriverCompletionFunc *cb, void *opaque)
3648 Coroutine *co;
3649 BlockDriverAIOCBCoroutine *acb;
3651 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3653 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3654 acb->req.sector = sector_num;
3655 acb->req.nb_sectors = nb_sectors;
3656 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3657 qemu_coroutine_enter(co, acb);
3659 return &acb->common;
3662 void bdrv_init(void)
3664 module_call_init(MODULE_INIT_BLOCK);
3667 void bdrv_init_with_whitelist(void)
3669 use_bdrv_whitelist = 1;
3670 bdrv_init();
3673 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3674 BlockDriverCompletionFunc *cb, void *opaque)
3676 BlockDriverAIOCB *acb;
3678 if (pool->free_aiocb) {
3679 acb = pool->free_aiocb;
3680 pool->free_aiocb = acb->next;
3681 } else {
3682 acb = g_malloc0(pool->aiocb_size);
3683 acb->pool = pool;
3685 acb->bs = bs;
3686 acb->cb = cb;
3687 acb->opaque = opaque;
3688 return acb;
3691 void qemu_aio_release(void *p)
3693 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3694 AIOPool *pool = acb->pool;
3695 acb->next = pool->free_aiocb;
3696 pool->free_aiocb = acb;
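/*
 * Example (sketch): how a driver-private AIOCB type plugs into an
 * AIOPool.  'ExampleAIOCB' and friends are hypothetical names; a real
 * cancel implementation must also stop the in-flight request first.
 * Allocation then goes through qemu_aio_get(&example_aio_pool, ...).
 */
typedef struct ExampleAIOCB {
    BlockDriverAIOCB common;    /* must be the first member */
    int private_state;
} ExampleAIOCB;

static void example_aio_cancel(BlockDriverAIOCB *blockacb)
{
    ExampleAIOCB *acb = container_of(blockacb, ExampleAIOCB, common);

    /* ...cancel or wait for the underlying operation here... */
    qemu_aio_release(acb);
}

static AIOPool example_aio_pool = {
    .aiocb_size = sizeof(ExampleAIOCB),
    .cancel     = example_aio_cancel,
};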
3699 /**************************************************************/
3700 /* Coroutine block device emulation */
3702 typedef struct CoroutineIOCompletion {
3703 Coroutine *coroutine;
3704 int ret;
3705 } CoroutineIOCompletion;
3707 static void bdrv_co_io_em_complete(void *opaque, int ret)
3709 CoroutineIOCompletion *co = opaque;
3711 co->ret = ret;
3712 qemu_coroutine_enter(co->coroutine, NULL);
3715 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3716 int nb_sectors, QEMUIOVector *iov,
3717 bool is_write)
3719 CoroutineIOCompletion co = {
3720 .coroutine = qemu_coroutine_self(),
3722 BlockDriverAIOCB *acb;
3724 if (is_write) {
3725 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3726 bdrv_co_io_em_complete, &co);
3727 } else {
3728 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3729 bdrv_co_io_em_complete, &co);
3732 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3733 if (!acb) {
3734 return -EIO;
3736 qemu_coroutine_yield();
3738 return co.ret;
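/*
 * Example (sketch, hypothetical variant): the same yield/re-enter
 * pattern built on CoroutineIOCompletion works for any callback-based
 * AIO interface, e.g. a flush:
 */
static int coroutine_fn example_co_flush_via_aio(BlockDriverState *bs)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    acb = bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();     /* re-entered from bdrv_co_io_em_complete */
    return co.ret;
}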
3741 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3742 int64_t sector_num, int nb_sectors,
3743 QEMUIOVector *iov)
3745 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3748 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3749 int64_t sector_num, int nb_sectors,
3750 QEMUIOVector *iov)
3752 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3755 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3757 RwCo *rwco = opaque;
3759 rwco->ret = bdrv_co_flush(rwco->bs);
3762 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3764 int ret;
3766 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3767 return 0;
3770 /* Write back cached data to the OS even with cache=unsafe */
3771 if (bs->drv->bdrv_co_flush_to_os) {
3772 ret = bs->drv->bdrv_co_flush_to_os(bs);
3773 if (ret < 0) {
3774 return ret;
3778 /* But don't actually force it to the disk with cache=unsafe */
3779 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3780 goto flush_parent;
3783 if (bs->drv->bdrv_co_flush_to_disk) {
3784 ret = bs->drv->bdrv_co_flush_to_disk(bs);
3785 } else if (bs->drv->bdrv_aio_flush) {
3786 BlockDriverAIOCB *acb;
3787 CoroutineIOCompletion co = {
3788 .coroutine = qemu_coroutine_self(),
3791 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3792 if (acb == NULL) {
3793 ret = -EIO;
3794 } else {
3795 qemu_coroutine_yield();
3796 ret = co.ret;
3798 } else {
3800 * Some block drivers always operate in either writethrough or unsafe
3801 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3802 * know how the server works (because the behaviour is hardcoded or
3803 * depends on server-side configuration), so we can't ensure that
3804 * everything is safe on disk. Returning an error doesn't work because
3805 * that would break guests even if the server operates in writethrough
3806 * mode.
3808 * Let's hope the user knows what he's doing.
3810 ret = 0;
3812 if (ret < 0) {
3813 return ret;
3816 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3817 * in the case of cache=unsafe, so there are no useless flushes.
3819 flush_parent:
3820 return bdrv_co_flush(bs->file);
3823 void bdrv_invalidate_cache(BlockDriverState *bs)
3825 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3826 bs->drv->bdrv_invalidate_cache(bs);
3830 void bdrv_invalidate_cache_all(void)
3832 BlockDriverState *bs;
3834 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3835 bdrv_invalidate_cache(bs);
3839 void bdrv_clear_incoming_migration_all(void)
3841 BlockDriverState *bs;
3843 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3844 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3848 int bdrv_flush(BlockDriverState *bs)
3850 Coroutine *co;
3851 RwCo rwco = {
3852 .bs = bs,
3853 .ret = NOT_DONE,
3856 if (qemu_in_coroutine()) {
3857 /* Fast-path if already in coroutine context */
3858 bdrv_flush_co_entry(&rwco);
3859 } else {
3860 co = qemu_coroutine_create(bdrv_flush_co_entry);
3861 qemu_coroutine_enter(co, &rwco);
3862 while (rwco.ret == NOT_DONE) {
3863 qemu_aio_wait();
3867 return rwco.ret;
3870 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3872 RwCo *rwco = opaque;
3874 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3877 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3878 int nb_sectors)
3880 if (!bs->drv) {
3881 return -ENOMEDIUM;
3882 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3883 return -EIO;
3884 } else if (bs->read_only) {
3885 return -EROFS;
3886 } else if (bs->drv->bdrv_co_discard) {
3887 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3888 } else if (bs->drv->bdrv_aio_discard) {
3889 BlockDriverAIOCB *acb;
3890 CoroutineIOCompletion co = {
3891 .coroutine = qemu_coroutine_self(),
3894 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3895 bdrv_co_io_em_complete, &co);
3896 if (acb == NULL) {
3897 return -EIO;
3898 } else {
3899 qemu_coroutine_yield();
3900 return co.ret;
3902 } else {
3903 return 0;
3907 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3909 Coroutine *co;
3910 RwCo rwco = {
3911 .bs = bs,
3912 .sector_num = sector_num,
3913 .nb_sectors = nb_sectors,
3914 .ret = NOT_DONE,
3917 if (qemu_in_coroutine()) {
3918 /* Fast-path if already in coroutine context */
3919 bdrv_discard_co_entry(&rwco);
3920 } else {
3921 co = qemu_coroutine_create(bdrv_discard_co_entry);
3922 qemu_coroutine_enter(co, &rwco);
3923 while (rwco.ret == NOT_DONE) {
3924 qemu_aio_wait();
3928 return rwco.ret;
3931 /**************************************************************/
3932 /* removable device support */
3935 * Return TRUE if the media is present
3937 int bdrv_is_inserted(BlockDriverState *bs)
3939 BlockDriver *drv = bs->drv;
3941 if (!drv)
3942 return 0;
3943 if (!drv->bdrv_is_inserted)
3944 return 1;
3945 return drv->bdrv_is_inserted(bs);
3949 * Return whether the media changed since the last call to this
3950 * function, or -ENOTSUP if we don't know. Most drivers don't know.
3952 int bdrv_media_changed(BlockDriverState *bs)
3954 BlockDriver *drv = bs->drv;
3956 if (drv && drv->bdrv_media_changed) {
3957 return drv->bdrv_media_changed(bs);
3959 return -ENOTSUP;
3963 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
3965 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3967 BlockDriver *drv = bs->drv;
3969 if (drv && drv->bdrv_eject) {
3970 drv->bdrv_eject(bs, eject_flag);
3973 if (bs->device_name[0] != '\0') {
3974 bdrv_emit_qmp_eject_event(bs, eject_flag);
3979 * Lock or unlock the media (if it is locked, the user won't be able
3980 * to eject it manually).
3982 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3984 BlockDriver *drv = bs->drv;
3986 trace_bdrv_lock_medium(bs, locked);
3988 if (drv && drv->bdrv_lock_medium) {
3989 drv->bdrv_lock_medium(bs, locked);
3993 /* needed for generic scsi interface */
3995 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3997 BlockDriver *drv = bs->drv;
3999 if (drv && drv->bdrv_ioctl)
4000 return drv->bdrv_ioctl(bs, req, buf);
4001 return -ENOTSUP;
4004 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
4005 unsigned long int req, void *buf,
4006 BlockDriverCompletionFunc *cb, void *opaque)
4008 BlockDriver *drv = bs->drv;
4010 if (drv && drv->bdrv_aio_ioctl)
4011 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4012 return NULL;
4015 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
4017 bs->buffer_alignment = align;
4020 void *qemu_blockalign(BlockDriverState *bs, size_t size)
4022 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
4025 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
4027 int64_t bitmap_size;
4029 bs->dirty_count = 0;
4030 if (enable) {
4031 if (!bs->dirty_bitmap) {
4032 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
4033 BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
4034 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
4036 bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
4038 } else {
4039 if (bs->dirty_bitmap) {
4040 g_free(bs->dirty_bitmap);
4041 bs->dirty_bitmap = NULL;
4046 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
4048 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
4050 if (bs->dirty_bitmap &&
4051 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
4052 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
4053 (1UL << (chunk % (sizeof(unsigned long) * 8))));
4054 } else {
4055 return 0;
4059 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
4060 int nr_sectors)
4062 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
4065 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
4067 return bs->dirty_count;
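/*
 * Example (illustrative sketch): probe one sector per chunk to count
 * dirty chunks.  Real consumers (e.g. block migration) use
 * bdrv_get_dirty_count() and clear ranges with bdrv_reset_dirty().
 */
static int64_t example_count_dirty_chunks(BlockDriverState *bs)
{
    int64_t total = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    int64_t sector, dirty = 0;

    for (sector = 0; sector < total;
         sector += BDRV_SECTORS_PER_DIRTY_CHUNK) {
        if (bdrv_get_dirty(bs, sector)) {
            dirty++;
        }
    }
    return dirty;
}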
4070 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
4072 assert(bs->in_use != in_use);
4073 bs->in_use = in_use;
4076 int bdrv_in_use(BlockDriverState *bs)
4078 return bs->in_use;
4081 void bdrv_iostatus_enable(BlockDriverState *bs)
4083 bs->iostatus_enabled = true;
4084 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4087 /* The I/O status is only enabled if the drive explicitly
4088 * enables it _and_ the VM is configured to stop on errors */
4089 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4091 return (bs->iostatus_enabled &&
4092 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
4093 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
4094 bs->on_read_error == BLOCK_ERR_STOP_ANY));
4097 void bdrv_iostatus_disable(BlockDriverState *bs)
4099 bs->iostatus_enabled = false;
4102 void bdrv_iostatus_reset(BlockDriverState *bs)
4104 if (bdrv_iostatus_is_enabled(bs)) {
4105 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4109 /* XXX: Today this is set by device models because it makes the implementation
4110 quite simple. However, the block layer knows about the error, so it's
4111 possible to implement this without device models being involved */
4112 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
4114 if (bdrv_iostatus_is_enabled(bs) &&
4115 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
4116 assert(error >= 0);
4117 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
4118 BLOCK_DEVICE_IO_STATUS_FAILED;
4122 void
4123 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4124 enum BlockAcctType type)
4126 assert(type < BDRV_MAX_IOTYPE);
4128 cookie->bytes = bytes;
4129 cookie->start_time_ns = get_clock();
4130 cookie->type = type;
4133 void
4134 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4136 assert(cookie->type < BDRV_MAX_IOTYPE);
4138 bs->nr_bytes[cookie->type] += cookie->bytes;
4139 bs->nr_ops[cookie->type]++;
4140 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4143 int bdrv_img_create(const char *filename, const char *fmt,
4144 const char *base_filename, const char *base_fmt,
4145 char *options, uint64_t img_size, int flags)
4147 QEMUOptionParameter *param = NULL, *create_options = NULL;
4148 QEMUOptionParameter *backing_fmt, *backing_file, *size;
4149 BlockDriverState *bs = NULL;
4150 BlockDriver *drv, *proto_drv;
4151 BlockDriver *backing_drv = NULL;
4152 int ret = 0;
4154 /* Find driver and parse its options */
4155 drv = bdrv_find_format(fmt);
4156 if (!drv) {
4157 error_report("Unknown file format '%s'", fmt);
4158 ret = -EINVAL;
4159 goto out;
4162 proto_drv = bdrv_find_protocol(filename);
4163 if (!proto_drv) {
4164 error_report("Unknown protocol '%s'", filename);
4165 ret = -EINVAL;
4166 goto out;
4169 create_options = append_option_parameters(create_options,
4170 drv->create_options);
4171 create_options = append_option_parameters(create_options,
4172 proto_drv->create_options);
4174 /* Create parameter list with default values */
4175 param = parse_option_parameters("", create_options, param);
4177 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4179 /* Parse -o options */
4180 if (options) {
4181 param = parse_option_parameters(options, create_options, param);
4182 if (param == NULL) {
4183 error_report("Invalid options for file format '%s'.", fmt);
4184 ret = -EINVAL;
4185 goto out;
4189 if (base_filename) {
4190 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4191 base_filename)) {
4192 error_report("Backing file not supported for file format '%s'",
4193 fmt);
4194 ret = -EINVAL;
4195 goto out;
4199 if (base_fmt) {
4200 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4201 error_report("Backing file format not supported for file "
4202 "format '%s'", fmt);
4203 ret = -EINVAL;
4204 goto out;
4208 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4209 if (backing_file && backing_file->value.s) {
4210 if (!strcmp(filename, backing_file->value.s)) {
4211 error_report("Error: Trying to create an image with the "
4212 "same filename as the backing file");
4213 ret = -EINVAL;
4214 goto out;
4218 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4219 if (backing_fmt && backing_fmt->value.s) {
4220 backing_drv = bdrv_find_format(backing_fmt->value.s);
4221 if (!backing_drv) {
4222 error_report("Unknown backing file format '%s'",
4223 backing_fmt->value.s);
4224 ret = -EINVAL;
4225 goto out;
4229 // The size for the image must always be specified, with one exception:
4230 // If we are using a backing file, we can obtain the size from there
4231 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4232 if (size && size->value.n == -1) {
4233 if (backing_file && backing_file->value.s) {
4234 uint64_t size;
4235 char buf[32];
4236 int back_flags;
4238 /* backing files always opened read-only */
4239 back_flags =
4240 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4242 bs = bdrv_new("");
4244 ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
4245 if (ret < 0) {
4246 error_report("Could not open '%s'", backing_file->value.s);
4247 goto out;
4249 bdrv_get_geometry(bs, &size);
4250 size *= 512;
4252 snprintf(buf, sizeof(buf), "%" PRId64, size);
4253 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4254 } else {
4255 error_report("Image creation needs a size parameter");
4256 ret = -EINVAL;
4257 goto out;
4261 printf("Formatting '%s', fmt=%s ", filename, fmt);
4262 print_option_parameters(param);
4263 puts("");
4265 ret = bdrv_create(drv, filename, param);
4267 if (ret < 0) {
4268 if (ret == -ENOTSUP) {
4269 error_report("Formatting or formatting option not supported for "
4270 "file format '%s'", fmt);
4271 } else if (ret == -EFBIG) {
4272 error_report("The image size is too large for file format '%s'",
4273 fmt);
4274 } else {
4275 error_report("%s: error while creating %s: %s", filename, fmt,
4276 strerror(-ret));
4280 out:
4281 free_option_parameters(create_options);
4282 free_option_parameters(param);
4284 if (bs) {
4285 bdrv_delete(bs);
4288 return ret;
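/*
 * Example (sketch, hypothetical filename and values): create a 1 GiB
 * qcow2 image with default options and no backing file, the same path
 * qemu-img takes.
 */
static int example_create_image(void)
{
    return bdrv_img_create("/tmp/example.qcow2", "qcow2",
                           NULL, NULL,      /* no backing file or format */
                           NULL,            /* no -o option string */
                           (uint64_t)1 << 30, 0);
}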
4291 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4292 int64_t speed, BlockDriverCompletionFunc *cb,
4293 void *opaque, Error **errp)
4295 BlockJob *job;
4297 if (bs->job || bdrv_in_use(bs)) {
4298 error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
4299 return NULL;
4301 bdrv_set_in_use(bs, 1);
4303 job = g_malloc0(job_type->instance_size);
4304 job->job_type = job_type;
4305 job->bs = bs;
4306 job->cb = cb;
4307 job->opaque = opaque;
4308 job->busy = true;
4309 bs->job = job;
4311 /* Only set speed when necessary to avoid NotSupported error */
4312 if (speed != 0) {
4313 Error *local_err = NULL;
4315 block_job_set_speed(job, speed, &local_err);
4316 if (error_is_set(&local_err)) {
4317 bs->job = NULL;
4318 g_free(job);
4319 bdrv_set_in_use(bs, 0);
4320 error_propagate(errp, local_err);
4321 return NULL;
4324 return job;
4327 void block_job_complete(BlockJob *job, int ret)
4329 BlockDriverState *bs = job->bs;
4331 assert(bs->job == job);
4332 job->cb(job->opaque, ret);
4333 bs->job = NULL;
4334 g_free(job);
4335 bdrv_set_in_use(bs, 0);
4338 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
4340 Error *local_err = NULL;
4342 if (!job->job_type->set_speed) {
4343 error_set(errp, QERR_NOT_SUPPORTED);
4344 return;
4346 job->job_type->set_speed(job, speed, &local_err);
4347 if (error_is_set(&local_err)) {
4348 error_propagate(errp, local_err);
4349 return;
4352 job->speed = speed;
4355 void block_job_cancel(BlockJob *job)
4357 job->cancelled = true;
4358 if (job->co && !job->busy) {
4359 qemu_coroutine_enter(job->co, NULL);
4363 bool block_job_is_cancelled(BlockJob *job)
4365 return job->cancelled;
4368 struct BlockCancelData {
4369 BlockJob *job;
4370 BlockDriverCompletionFunc *cb;
4371 void *opaque;
4372 bool cancelled;
4373 int ret;
4376 static void block_job_cancel_cb(void *opaque, int ret)
4378 struct BlockCancelData *data = opaque;
4380 data->cancelled = block_job_is_cancelled(data->job);
4381 data->ret = ret;
4382 data->cb(data->opaque, ret);
4385 int block_job_cancel_sync(BlockJob *job)
4387 struct BlockCancelData data;
4388 BlockDriverState *bs = job->bs;
4390 assert(bs->job == job);
4392 /* Set up our own callback to store the result and chain to
4393 * the original callback.
4395 data.job = job;
4396 data.cb = job->cb;
4397 data.opaque = job->opaque;
4398 data.ret = -EINPROGRESS;
4399 job->cb = block_job_cancel_cb;
4400 job->opaque = &data;
4401 block_job_cancel(job);
4402 while (data.ret == -EINPROGRESS) {
4403 qemu_aio_wait();
4405 return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
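/*
 * Example (sketch): synchronously stop whatever job runs on 'bs'.
 * Returns -ECANCELED only if the job really was interrupted; a job that
 * completed before noticing the cancel reports its own result instead.
 */
static int example_stop_job(BlockDriverState *bs)
{
    if (!bs->job) {
        return 0;
    }
    return block_job_cancel_sync(bs->job);
}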
4408 void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
4410 /* Check cancellation *before* setting busy = false, too! */
4411 if (!block_job_is_cancelled(job)) {
4412 job->busy = false;
4413 co_sleep_ns(clock, ns);
4414 job->busy = true;