qemu-img: add dirty flag status
[qemu/ar7.git] / block.c
/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "module.h"
#include "qjson.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif
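
/* Illustrative note (not part of the original file): is_windows_drive()
 * accepts bare drive letters such as "c:" or "D:", and device paths in the
 * "\\.\x" or "//./x" namespace, e.g. "\\.\PhysicalDrive0".  A name like
 * "c:\foo.img" only has a drive *prefix* and is not itself a drive, which
 * is why path_has_protocol() below checks for both cases. */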
/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Throttled requests are serviced in FIFO order: the next throttled
     * request is not dequeued until the current request is allowed to
     * proceed.  If the current request still exceeds the limits, it is
     * re-inserted at the head of the queue, so all requests behind it
     * remain in throttled_reqs.
     */
    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    return strchr(path, ':') != NULL;
}

int path_is_absolute(const char *path)
{
    const char *p;
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\')
        return 1;
#endif
    p = strchr(path, ':');
    if (p)
        p++;
    else
        p = path;
#ifdef _WIN32
    return (*p == '/' || *p == '\\');
#else
    return (*p == '/');
#endif
}
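
/* Illustrative note (not part of the original file): path_is_absolute()
 * skips an optional "<protocol>:" prefix before testing for a leading
 * slash, so "file:/tmp/a.img" and "/tmp/a.img" are absolute, while
 * "images/a.img" and "nbd:localhost:10809" are not. */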
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
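
/* Usage sketch (illustrative, not part of the original file): resolving a
 * relative backing file name against the image that references it, the way
 * bdrv_open() does further below. */
#if 0
static void example_path_combine(void)
{
    char dest[PATH_MAX];

    /* yields "/vm/images/base.qcow2" */
    path_combine(dest, sizeof(dest), "/vm/images/overlay.qcow2", "base.qcow2");

    /* absolute names are copied through unchanged */
    path_combine(dest, sizeof(dest), "/vm/images/overlay.qcow2", "/tmp/base.raw");
}
#endif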
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    if (!drv->bdrv_create)
        return -ENOTSUP;

    return drv->bdrv_create(filename, options);
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}

#ifdef _WIN32
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    close(fd);
}
#endif

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
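
/* Illustrative note (not part of the original file): for a filename such as
 * "nbd:localhost:10809" the text before the first ':' ("nbd") is matched
 * against each registered driver's protocol_name; a plain path such as
 * "/tmp/a.img" has no protocol and falls through to the "file" driver. */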
static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
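
/* Usage sketch (illustrative, not part of the original file): mapping a
 * "cache=" option string onto open flags before bdrv_open(). */
#if 0
static int example_parse_cache(const char *mode)
{
    int flags = 0;

    if (bdrv_parse_cache_flags(mode, &flags) < 0) {
        return -1; /* unknown cache mode */
    }
    /* e.g. "none" sets BDRV_O_NOCACHE | BDRV_O_CACHE_WB,
     * "unsafe" sets BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH */
    return flags;
}
#endif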
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->sg = 0;
    bs->open_flags = flags;
    bs->growable = 0;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);
    bs->backing_file[0] = '\0';

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}
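
/* Usage sketch (illustrative, not part of the original file): opening a file
 * at the protocol level only, without format probing. */
#if 0
static int example_file_open(void)
{
    BlockDriverState *bs;
    int ret = bdrv_file_open(&bs, "/tmp/scratch.img", BDRV_O_RDWR);
    if (ret < 0) {
        return ret;
    }
    /* ... byte-level access via bdrv_pread()/bdrv_pwrite() ... */
    bdrv_delete(bs);
    return 0;
}
#endif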
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");

        if (path_has_protocol(bs->backing_file)) {
            pstrcpy(backing_filename, sizeof(backing_filename),
                    bs->backing_file);
        } else {
            path_combine(backing_filename, sizeof(backing_filename),
                         filename, bs->backing_file);
        }

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}
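
/* Usage sketch (illustrative, not part of the original file): the usual open
 * path with format probing and automatic backing-file resolution. */
#if 0
static int example_open_image(void)
{
    BlockDriverState *bs = bdrv_new("drive0");
    /* drv == NULL lets find_image_format() probe the image content */
    int ret = bdrv_open(bs, "/vm/overlay.qcow2", BDRV_O_RDWR, NULL);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    /* ... guest I/O ... */
    bdrv_delete(bs);
    return 0;
}
#endif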
void bdrv_close(BlockDriverState *bs)
{
    if (bs->drv) {
        if (bs->job) {
            block_job_cancel_sync(bs->job);
        }
        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;

        if (bs->file != NULL) {
            bdrv_close(bs->file);
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;

    qemu_aio_flush();

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* make a BlockDriverState anonymous by removing from bdrv_state list.
   Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    BlockDriverState tmp;

    /* the new bs must not be in bdrv_states */
    bdrv_make_anon(bs_new);

    tmp = *bs_new;

    /* there are some fields that need to stay on the top layer: */

    /* dev info */
    tmp.dev_ops           = bs_top->dev_ops;
    tmp.dev_opaque        = bs_top->dev_opaque;
    tmp.dev               = bs_top->dev;
    tmp.buffer_alignment  = bs_top->buffer_alignment;
    tmp.copy_on_read      = bs_top->copy_on_read;

    /* i/o timing parameters */
    tmp.slice_time        = bs_top->slice_time;
    tmp.slice_start       = bs_top->slice_start;
    tmp.slice_end         = bs_top->slice_end;
    tmp.io_limits         = bs_top->io_limits;
    tmp.io_base           = bs_top->io_base;
    tmp.throttled_reqs    = bs_top->throttled_reqs;
    tmp.block_timer       = bs_top->block_timer;
    tmp.io_limits_enabled = bs_top->io_limits_enabled;

    /* geometry */
    tmp.cyls              = bs_top->cyls;
    tmp.heads             = bs_top->heads;
    tmp.secs              = bs_top->secs;
    tmp.translation       = bs_top->translation;

    /* r/w error */
    tmp.on_read_error     = bs_top->on_read_error;
    tmp.on_write_error    = bs_top->on_write_error;

    /* i/o status */
    tmp.iostatus_enabled  = bs_top->iostatus_enabled;
    tmp.iostatus          = bs_top->iostatus;

    /* keep the same entry in bdrv_states */
    pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
    tmp.list = bs_top->list;

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    tmp.backing_hd = bs_new;
    pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);

    /* swap contents of the fixed new bs and the current top */
    *bs_new = *bs_top;
    *bs_top = tmp;

    /* clear the copied fields in the new backing file */
    bdrv_detach_dev(bs_new, bs_new->dev);

    qemu_co_queue_init(&bs_new->throttled_reqs);
    memset(&bs_new->io_base,   0, sizeof(bs_new->io_base));
    memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
    bdrv_iostatus_disable(bs_new);

    /* we don't use bdrv_io_limits_disable() for this, because we don't want
     * to affect or delete the block_timer, as it has been moved to bs_top */
    bs_new->io_limits_enabled = false;
    bs_new->block_timer       = NULL;
    bs_new->slice_time        = 0;
    bs_new->slice_start       = 0;
    bs_new->slice_end         = 0;
}
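
/* Illustrative note (not part of the original file): bdrv_append() is the
 * mechanism behind live snapshots.  A freshly created overlay is opened into
 * an anonymous BlockDriverState and appended on top of the active drive;
 * after the pointer swap the guest keeps using the same BlockDriverState*
 * while the old image has become the read-only backing file. */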
void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);
    if (bs->file != NULL) {
        bdrv_delete(bs->file);
    }

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               BlockQMPEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    BlockDriver *backing_drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0, rw_ret = 0;
    uint8_t *buf;
    char filename[1024];
    BlockDriverState *bs_rw, *bs_ro;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bs->backing_hd->keep_read_only) {
        return -EACCES;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    backing_drv = bs->backing_hd->drv;
    ro = bs->backing_hd->read_only;
    /* pstrcpy (rather than strncpy) guarantees NUL termination */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        /* re-open as RW */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_rw = bdrv_new("");
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
            backing_drv);
        if (rw_ret < 0) {
            bdrv_delete(bs_rw);
            /* try to re-open read-only */
            bs_ro = bdrv_new("");
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                backing_drv);
            if (ret < 0) {
                bdrv_delete(bs_ro);
                /* drive not functional anymore */
                bs->drv = NULL;
                return ret;
            }
            bs->backing_hd = bs_ro;
            return rw_ret;
        }
        bs->backing_hd = bs_rw;
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* re-open as RO */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_ro = bdrv_new("");
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
            backing_drv);
        if (ret < 0) {
            bdrv_delete(bs_ro);
            /* drive not functional anymore */
            bs->drv = NULL;
            return ret;
        }
        bs->backing_hd = bs_ro;
        bs->backing_hd->keep_read_only = 0;
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_commit(bs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
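
/* Worked example (illustrative, not part of the original file): with a qcow2
 * cluster size of 64 KiB, c = 65536 / 512 = 128 sectors.  A request covering
 * sectors [130, 134) rounds down to cluster_sector_num = 128 and up to
 * cluster_nb_sectors = 128, i.e. the whole cluster [128, 256). */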
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors)
{
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}
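
/* Worked example (illustrative, not part of the original file): a tracked
 * request covering sectors [0, 8) overlaps a query for [4, 12) but not one
 * for [8, 16); the two early returns reject exactly the disjoint
 * "query entirely after" and "query entirely before" cases. */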
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;

    if (drv->bdrv_change_backing_file != NULL) {
        return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        return -ENOTSUP;
    }
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}
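
/* Worked example (illustrative, not part of the original file; assumes
 * BDRV_SECTORS_PER_DIRTY_CHUNK is 2048, i.e. 1 MiB chunks, and 64-bit
 * longs): a write to sectors [4096, 4100) touches chunk 4096/2048 = 2;
 * that chunk lives in word idx = 2/64 = 0 at bit = 2, so bit 2 of
 * dirty_bitmap[0] is set and dirty_count is bumped if it was clear. */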
/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
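
/* Usage sketch (illustrative, not part of the original file): bdrv_pread()
 * hides sector alignment, so a header read does not need to be padded. */
#if 0
static int example_read_header(BlockDriverState *bs)
{
    uint8_t header[37]; /* deliberately not a multiple of 512 */

    /* reads sector 0 internally, copies bytes [3, 40) out of the bounce buffer */
    if (bdrv_pread(bs, 3, header, sizeof(header)) < 0) {
        return -EIO;
    }
    return 0;
}
#endif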
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that use O_DSYNC */
    if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
        bdrv_flush(bs);
    }

    return 0;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = drv->bdrv_co_write_zeroes(bs, cluster_sector_num,
                                        cluster_nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
                           nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        return drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes.  Sparse files are counted by actual
 * allocated space.  Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes.  Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

struct partition {
        uint8_t boot_ind;           /* 0x80 - active */
        uint8_t head;               /* starting head */
        uint8_t sector;             /* starting sector */
        uint8_t cyl;                /* starting cylinder */
        uint8_t sys_ind;            /* What partition type */
        uint8_t end_head;           /* end head */
        uint8_t end_sector;         /* end sector */
        uint8_t end_cyl;            /* end cylinder */
        uint32_t start_sect;        /* starting sector counting from 0 */
        uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;

/* try to guess the disk logical geometry from the MSDOS partition table.
   Return 0 if OK, -1 if could not guess */
static int guess_disk_lchs(BlockDriverState *bs,
                           int *pcylinders, int *pheads, int *psectors)
{
    uint8_t buf[BDRV_SECTOR_SIZE];
    int ret, i, heads, sectors, cylinders;
    struct partition *p;
    uint32_t nr_sects;
    uint64_t nb_sectors;
    bool enabled;

    bdrv_get_geometry(bs, &nb_sectors);

    /**
     * The function will be invoked during startup not only in sync I/O mode,
     * but also in async I/O mode. So the I/O throttling function has to
     * be disabled temporarily here, not permanently.
     */
    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, 0, buf, 1);
    bs->io_limits_enabled = enabled;
    if (ret < 0)
        return -1;
    /* test msdos magic */
    if (buf[510] != 0x55 || buf[511] != 0xaa)
        return -1;
    for(i = 0; i < 4; i++) {
        p = ((struct partition *)(buf + 0x1be)) + i;
        nr_sects = le32_to_cpu(p->nr_sects);
        if (nr_sects && p->end_head) {
            /* We make the assumption that the partition terminates on
               a cylinder boundary */
            heads = p->end_head + 1;
            sectors = p->end_sector & 63;
            if (sectors == 0)
                continue;
            cylinders = nb_sectors / (heads * sectors);
            if (cylinders < 1 || cylinders > 16383)
                continue;
            *pheads = heads;
            *psectors = sectors;
            *pcylinders = cylinders;
#if 0
            printf("guessed geometry: LCHS=%d %d %d\n",
                   cylinders, heads, sectors);
#endif
            return 0;
        }
    }
    return -1;
}

void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
{
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
        *pcyls = cylinders;
        *pheads = heads;
        *psecs = secs;
    } else {
        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
            if (heads > 16) {
                /* if heads > 16, it means that a BIOS LBA
                   translation was active, so the default
                   hardware geometry is OK */
                lba_detected = 1;
                goto default_geometry;
            } else {
                *pcyls = cylinders;
                *pheads = heads;
                *psecs = secs;
                /* disable any translation to be in sync with
                   the logical geometry */
                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_NONE);
                }
            }
        } else {
        default_geometry:
            /* if no geometry, use a standard physical disk geometry */
            cylinders = nb_sectors / (16 * 63);

            if (cylinders > 16383)
                cylinders = 16383;
            else if (cylinders < 2)
                cylinders = 2;
            *pcyls = cylinders;
            *pheads = 16;
            *psecs = 63;
            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
                if ((*pcyls * *pheads) <= 131072) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LARGE);
                } else {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LBA);
                }
            }
        }
        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
    }
}

void bdrv_set_geometry_hint(BlockDriverState *bs,
                            int cyls, int heads, int secs)
{
    bs->cyls = cyls;
    bs->heads = heads;
    bs->secs = secs;
}

void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
{
    bs->translation = translation;
}

void bdrv_get_geometry_hint(BlockDriverState *bs,
                            int *pcyls, int *pheads, int *psecs)
{
    *pcyls = bs->cyls;
    *pheads = bs->heads;
    *psecs = bs->secs;
}

/* throttling disk io limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}
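
/* Usage sketch (illustrative, not part of the original file): capping a
 * drive at 10 MB/s total bandwidth and 100 write IOPS.  Any non-zero field
 * makes bdrv_io_limits_enabled() true, which turns throttling on. */
#if 0
static void example_set_limits(BlockDriverState *bs)
{
    BlockIOLimit limits;

    memset(&limits, 0, sizeof(limits));
    limits.bps[BLOCK_IO_LIMIT_TOTAL]  = 10 * 1024 * 1024;
    limits.iops[BLOCK_IO_LIMIT_WRITE] = 100;
    bdrv_set_io_limits(bs, &limits);
}
#endif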
/* Recognize floppy formats */
typedef struct FDFormat {
    FDriveType drive;
    uint8_t last_sect;
    uint8_t max_track;
    uint8_t max_head;
    FDriveRate rate;
} FDFormat;

static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120,  9, 40, 0, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120,  8, 40, 0, FDRIVE_RATE_250K, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, FDRIVE_RATE_250K, },
    /* end */
    { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
};
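
/* Worked example (illustrative, not part of the original file): a raw image
 * of 1474560 bytes is 2880 sectors; the first table entry gives
 * (max_head + 1) * max_track * last_sect = 2 * 80 * 18 = 2880, so the image
 * is matched as a 1.44 MB 3.5" floppy at the 500 kbit/s rate. */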
2176 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2177 int *max_track, int *last_sect,
2178 FDriveType drive_in, FDriveType *drive,
2179 FDriveRate *rate)
2181 const FDFormat *parse;
2182 uint64_t nb_sectors, size;
2183 int i, first_match, match;
2185 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2186 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2187 /* User defined disk */
2188 *rate = FDRIVE_RATE_500K;
2189 } else {
2190 bdrv_get_geometry(bs, &nb_sectors);
2191 match = -1;
2192 first_match = -1;
2193 for (i = 0; ; i++) {
2194 parse = &fd_formats[i];
2195 if (parse->drive == FDRIVE_DRV_NONE) {
2196 break;
2198 if (drive_in == parse->drive ||
2199 drive_in == FDRIVE_DRV_NONE) {
2200 size = (parse->max_head + 1) * parse->max_track *
2201 parse->last_sect;
2202 if (nb_sectors == size) {
2203 match = i;
2204 break;
2206 if (first_match == -1) {
2207 first_match = i;
2211 if (match == -1) {
2212 if (first_match == -1) {
2213 match = 1;
2214 } else {
2215 match = first_match;
2217 parse = &fd_formats[match];
2219 *nb_heads = parse->max_head + 1;
2220 *max_track = parse->max_track;
2221 *last_sect = parse->last_sect;
2222 *drive = parse->drive;
2223 *rate = parse->rate;
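/* Worked example (illustrative, not in the original source): for a raw
 * 1.44 MB image, bdrv_get_geometry() reports nb_sectors == 2880.  The
 * first table entry above gives
 *     size = (max_head + 1) * max_track * last_sect = 2 * 80 * 18 = 2880,
 * so that entry matches and the reported geometry is 2 heads, 80 tracks,
 * 18 sectors per track at a 500K data rate. */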
2227 int bdrv_get_translation_hint(BlockDriverState *bs)
2229 return bs->translation;
2232 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2233 BlockErrorAction on_write_error)
2235 bs->on_read_error = on_read_error;
2236 bs->on_write_error = on_write_error;
2239 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2241 return is_read ? bs->on_read_error : bs->on_write_error;
2244 int bdrv_is_read_only(BlockDriverState *bs)
2246 return bs->read_only;
2249 int bdrv_is_sg(BlockDriverState *bs)
2251 return bs->sg;
2254 int bdrv_enable_write_cache(BlockDriverState *bs)
2256 return bs->enable_write_cache;
2259 int bdrv_is_encrypted(BlockDriverState *bs)
2261 if (bs->backing_hd && bs->backing_hd->encrypted)
2262 return 1;
2263 return bs->encrypted;
2266 int bdrv_key_required(BlockDriverState *bs)
2268 BlockDriverState *backing_hd = bs->backing_hd;
2270 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2271 return 1;
2272 return (bs->encrypted && !bs->valid_key);
2275 int bdrv_set_key(BlockDriverState *bs, const char *key)
2277 int ret;
2278 if (bs->backing_hd && bs->backing_hd->encrypted) {
2279 ret = bdrv_set_key(bs->backing_hd, key);
2280 if (ret < 0)
2281 return ret;
2282 if (!bs->encrypted)
2283 return 0;
2285 if (!bs->encrypted) {
2286 return -EINVAL;
2287 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2288 return -ENOMEDIUM;
2290 ret = bs->drv->bdrv_set_key(bs, key);
2291 if (ret < 0) {
2292 bs->valid_key = 0;
2293 } else if (!bs->valid_key) {
2294 bs->valid_key = 1;
2295 /* call the change callback now, we skipped it on open */
2296 bdrv_dev_change_media_cb(bs, true);
2298 return ret;
2301 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2303 if (!bs->drv) {
2304 buf[0] = '\0';
2305 } else {
2306 pstrcpy(buf, buf_size, bs->drv->format_name);
2310 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2311 void *opaque)
2313 BlockDriver *drv;
2315 QLIST_FOREACH(drv, &bdrv_drivers, list) {
2316 it(opaque, drv->format_name);
2320 BlockDriverState *bdrv_find(const char *name)
2322 BlockDriverState *bs;
2324 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2325 if (!strcmp(name, bs->device_name)) {
2326 return bs;
2329 return NULL;
2332 BlockDriverState *bdrv_next(BlockDriverState *bs)
2334 if (!bs) {
2335 return QTAILQ_FIRST(&bdrv_states);
2337 return QTAILQ_NEXT(bs, list);
2340 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2342 BlockDriverState *bs;
2344 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2345 it(opaque, bs);
2349 const char *bdrv_get_device_name(BlockDriverState *bs)
2351 return bs->device_name;
2354 void bdrv_flush_all(void)
2356 BlockDriverState *bs;
2358 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2359 bdrv_flush(bs);
2363 int bdrv_has_zero_init(BlockDriverState *bs)
2365 assert(bs->drv);
2367 if (bs->drv->bdrv_has_zero_init) {
2368 return bs->drv->bdrv_has_zero_init(bs);
2371 return 1;
2374 typedef struct BdrvCoIsAllocatedData {
2375 BlockDriverState *bs;
2376 int64_t sector_num;
2377 int nb_sectors;
2378 int *pnum;
2379 int ret;
2380 bool done;
2381 } BdrvCoIsAllocatedData;
2384 * Returns true iff the specified sector is present in the disk image. Drivers
2385 * not implementing the functionality are assumed to not support backing files,
2386 * hence all their sectors are reported as allocated.
2388 * If 'sector_num' is beyond the end of the disk image the return value is 0
2389 * and 'pnum' is set to 0.
2391 * 'pnum' is set to the number of sectors (including and immediately following
2392 * the specified sector) that are known to be in the same
2393 * allocated/unallocated state.
2395 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2396 * beyond the end of the disk image it will be clamped.
2398 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2399 int nb_sectors, int *pnum)
2401 int64_t n;
2403 if (sector_num >= bs->total_sectors) {
2404 *pnum = 0;
2405 return 0;
2408 n = bs->total_sectors - sector_num;
2409 if (n < nb_sectors) {
2410 nb_sectors = n;
2413 if (!bs->drv->bdrv_co_is_allocated) {
2414 *pnum = nb_sectors;
2415 return 1;
2418 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2421 /* Coroutine wrapper for bdrv_is_allocated() */
2422 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2424 BdrvCoIsAllocatedData *data = opaque;
2425 BlockDriverState *bs = data->bs;
2427 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2428 data->pnum);
2429 data->done = true;
2433 * Synchronous wrapper around bdrv_co_is_allocated().
2435 * See bdrv_co_is_allocated() for details.
2437 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2438 int *pnum)
2440 Coroutine *co;
2441 BdrvCoIsAllocatedData data = {
2442 .bs = bs,
2443 .sector_num = sector_num,
2444 .nb_sectors = nb_sectors,
2445 .pnum = pnum,
2446 .done = false,
2449 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2450 qemu_coroutine_enter(co, &data);
2451 while (!data.done) {
2452 qemu_aio_wait();
2454 return data.ret;
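/* Usage sketch (illustrative, not part of the original file): walking the
 * allocation map of an opened image with the synchronous wrapper above.
 * 'example_dump_allocation' is a hypothetical helper; 'bs' is assumed to
 * come from a successful bdrv_open(). */
#if 0
static void example_dump_allocation(BlockDriverState *bs)
{
    int64_t total = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    int64_t sector = 0;

    while (sector < total) {
        int pnum;
        int n = (total - sector > 65536) ? 65536 : (int)(total - sector);
        int allocated = bdrv_is_allocated(bs, sector, n, &pnum);

        /* pnum sectors starting at 'sector' share the same state */
        printf("%" PRId64 "+%d: %s\n", sector, pnum,
               allocated ? "allocated" : "unallocated");
        sector += pnum;
    }
}
#endif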
2457 BlockInfoList *qmp_query_block(Error **errp)
2459 BlockInfoList *head = NULL, *cur_item = NULL;
2460 BlockDriverState *bs;
2462 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2463 BlockInfoList *info = g_malloc0(sizeof(*info));
2465 info->value = g_malloc0(sizeof(*info->value));
2466 info->value->device = g_strdup(bs->device_name);
2467 info->value->type = g_strdup("unknown");
2468 info->value->locked = bdrv_dev_is_medium_locked(bs);
2469 info->value->removable = bdrv_dev_has_removable_media(bs);
2471 if (bdrv_dev_has_removable_media(bs)) {
2472 info->value->has_tray_open = true;
2473 info->value->tray_open = bdrv_dev_is_tray_open(bs);
2476 if (bdrv_iostatus_is_enabled(bs)) {
2477 info->value->has_io_status = true;
2478 info->value->io_status = bs->iostatus;
2481 if (bs->drv) {
2482 info->value->has_inserted = true;
2483 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2484 info->value->inserted->file = g_strdup(bs->filename);
2485 info->value->inserted->ro = bs->read_only;
2486 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2487 info->value->inserted->encrypted = bs->encrypted;
2488 if (bs->backing_file[0]) {
2489 info->value->inserted->has_backing_file = true;
2490 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2493 if (bs->io_limits_enabled) {
2494 info->value->inserted->bps =
2495 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2496 info->value->inserted->bps_rd =
2497 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2498 info->value->inserted->bps_wr =
2499 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2500 info->value->inserted->iops =
2501 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2502 info->value->inserted->iops_rd =
2503 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2504 info->value->inserted->iops_wr =
2505 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2509 /* XXX: waiting for the qapi to support GSList */
2510 if (!cur_item) {
2511 head = cur_item = info;
2512 } else {
2513 cur_item->next = info;
2514 cur_item = info;
2518 return head;
2521 /* Consider exposing this as a full-fledged QMP command */
2522 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2524 BlockStats *s;
2526 s = g_malloc0(sizeof(*s));
2528 if (bs->device_name[0]) {
2529 s->has_device = true;
2530 s->device = g_strdup(bs->device_name);
2533 s->stats = g_malloc0(sizeof(*s->stats));
2534 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2535 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2536 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2537 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2538 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2539 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2540 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2541 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2542 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2544 if (bs->file) {
2545 s->has_parent = true;
2546 s->parent = qmp_query_blockstat(bs->file, NULL);
2549 return s;
2552 BlockStatsList *qmp_query_blockstats(Error **errp)
2554 BlockStatsList *head = NULL, *cur_item = NULL;
2555 BlockDriverState *bs;
2557 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2558 BlockStatsList *info = g_malloc0(sizeof(*info));
2559 info->value = qmp_query_blockstat(bs, NULL);
2561 /* XXX: waiting for the qapi to support GSList */
2562 if (!cur_item) {
2563 head = cur_item = info;
2564 } else {
2565 cur_item->next = info;
2566 cur_item = info;
2570 return head;
2573 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2575 if (bs->backing_hd && bs->backing_hd->encrypted)
2576 return bs->backing_file;
2577 else if (bs->encrypted)
2578 return bs->filename;
2579 else
2580 return NULL;
2583 void bdrv_get_backing_filename(BlockDriverState *bs,
2584 char *filename, int filename_size)
2586 pstrcpy(filename, filename_size, bs->backing_file);
2589 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2590 const uint8_t *buf, int nb_sectors)
2592 BlockDriver *drv = bs->drv;
2593 if (!drv)
2594 return -ENOMEDIUM;
2595 if (!drv->bdrv_write_compressed)
2596 return -ENOTSUP;
2597 if (bdrv_check_request(bs, sector_num, nb_sectors))
2598 return -EIO;
2600 if (bs->dirty_bitmap) {
2601 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2604 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2607 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2609 BlockDriver *drv = bs->drv;
2610 if (!drv)
2611 return -ENOMEDIUM;
2612 if (!drv->bdrv_get_info)
2613 return -ENOTSUP;
2614 memset(bdi, 0, sizeof(*bdi));
2615 return drv->bdrv_get_info(bs, bdi);
2618 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2619 int64_t pos, int size)
2621 BlockDriver *drv = bs->drv;
2622 if (!drv)
2623 return -ENOMEDIUM;
2624 if (drv->bdrv_save_vmstate)
2625 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2626 if (bs->file)
2627 return bdrv_save_vmstate(bs->file, buf, pos, size);
2628 return -ENOTSUP;
2631 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2632 int64_t pos, int size)
2634 BlockDriver *drv = bs->drv;
2635 if (!drv)
2636 return -ENOMEDIUM;
2637 if (drv->bdrv_load_vmstate)
2638 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2639 if (bs->file)
2640 return bdrv_load_vmstate(bs->file, buf, pos, size);
2641 return -ENOTSUP;
2644 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2646 BlockDriver *drv = bs->drv;
2648 if (!drv || !drv->bdrv_debug_event) {
2649 return;
2652 return drv->bdrv_debug_event(bs, event);
2656 /**************************************************************/
2657 /* handling of snapshots */
2659 int bdrv_can_snapshot(BlockDriverState *bs)
2661 BlockDriver *drv = bs->drv;
2662 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2663 return 0;
2666 if (!drv->bdrv_snapshot_create) {
2667 if (bs->file != NULL) {
2668 return bdrv_can_snapshot(bs->file);
2670 return 0;
2673 return 1;
2676 int bdrv_is_snapshot(BlockDriverState *bs)
2678 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2681 BlockDriverState *bdrv_snapshots(void)
2683 BlockDriverState *bs;
2685 if (bs_snapshots) {
2686 return bs_snapshots;
2689 bs = NULL;
2690 while ((bs = bdrv_next(bs))) {
2691 if (bdrv_can_snapshot(bs)) {
2692 bs_snapshots = bs;
2693 return bs;
2696 return NULL;
2699 int bdrv_snapshot_create(BlockDriverState *bs,
2700 QEMUSnapshotInfo *sn_info)
2702 BlockDriver *drv = bs->drv;
2703 if (!drv)
2704 return -ENOMEDIUM;
2705 if (drv->bdrv_snapshot_create)
2706 return drv->bdrv_snapshot_create(bs, sn_info);
2707 if (bs->file)
2708 return bdrv_snapshot_create(bs->file, sn_info);
2709 return -ENOTSUP;
2712 int bdrv_snapshot_goto(BlockDriverState *bs,
2713 const char *snapshot_id)
2715 BlockDriver *drv = bs->drv;
2716 int ret, open_ret;
2718 if (!drv)
2719 return -ENOMEDIUM;
2720 if (drv->bdrv_snapshot_goto)
2721 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2723 if (bs->file) {
2724 drv->bdrv_close(bs);
2725 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2726 open_ret = drv->bdrv_open(bs, bs->open_flags);
2727 if (open_ret < 0) {
2728 bdrv_delete(bs->file);
2729 bs->drv = NULL;
2730 return open_ret;
2732 return ret;
2735 return -ENOTSUP;
2738 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2740 BlockDriver *drv = bs->drv;
2741 if (!drv)
2742 return -ENOMEDIUM;
2743 if (drv->bdrv_snapshot_delete)
2744 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2745 if (bs->file)
2746 return bdrv_snapshot_delete(bs->file, snapshot_id);
2747 return -ENOTSUP;
2750 int bdrv_snapshot_list(BlockDriverState *bs,
2751 QEMUSnapshotInfo **psn_info)
2753 BlockDriver *drv = bs->drv;
2754 if (!drv)
2755 return -ENOMEDIUM;
2756 if (drv->bdrv_snapshot_list)
2757 return drv->bdrv_snapshot_list(bs, psn_info);
2758 if (bs->file)
2759 return bdrv_snapshot_list(bs->file, psn_info);
2760 return -ENOTSUP;
2763 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2764 const char *snapshot_name)
2766 BlockDriver *drv = bs->drv;
2767 if (!drv) {
2768 return -ENOMEDIUM;
2770 if (!bs->read_only) {
2771 return -EINVAL;
2773 if (drv->bdrv_snapshot_load_tmp) {
2774 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2776 return -ENOTSUP;
2779 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2780 const char *backing_file)
2782 if (!bs->drv) {
2783 return NULL;
2786 if (bs->backing_hd) {
2787 if (strcmp(bs->backing_file, backing_file) == 0) {
2788 return bs->backing_hd;
2789 } else {
2790 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2794 return NULL;
2797 #define NB_SUFFIXES 4
2799 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2801 static const char suffixes[NB_SUFFIXES] = "KMGT";
2802 int64_t base;
2803 int i;
2805 if (size <= 999) {
2806 snprintf(buf, buf_size, "%" PRId64, size);
2807 } else {
2808 base = 1024;
2809 for(i = 0; i < NB_SUFFIXES; i++) {
2810 if (size < (10 * base)) {
2811 snprintf(buf, buf_size, "%0.1f%c",
2812 (double)size / base,
2813 suffixes[i]);
2814 break;
2815 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
2816 snprintf(buf, buf_size, "%" PRId64 "%c",
2817 ((size + (base >> 1)) / base),
2818 suffixes[i]);
2819 break;
2821 base = base * 1024;
2824 return buf;
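/* Examples of the formatting above (illustrative):
 *     999         -> "999"   (small sizes are printed verbatim)
 *     1536        -> "1.5K"  (size < 10 * base: one decimal place)
 *     524288      -> "512K"  (rounded: (size + base/2) / base)
 *     1073741824  -> "1.0G"                                      */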
2827 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2829 char buf1[128], date_buf[128], clock_buf[128];
2830 #ifdef _WIN32
2831 struct tm *ptm;
2832 #else
2833 struct tm tm;
2834 #endif
2835 time_t ti;
2836 int64_t secs;
2838 if (!sn) {
2839 snprintf(buf, buf_size,
2840 "%-10s%-20s%7s%20s%15s",
2841 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2842 } else {
2843 ti = sn->date_sec;
2844 #ifdef _WIN32
2845 ptm = localtime(&ti);
2846 strftime(date_buf, sizeof(date_buf),
2847 "%Y-%m-%d %H:%M:%S", ptm);
2848 #else
2849 localtime_r(&ti, &tm);
2850 strftime(date_buf, sizeof(date_buf),
2851 "%Y-%m-%d %H:%M:%S", &tm);
2852 #endif
2853 secs = sn->vm_clock_nsec / 1000000000;
2854 snprintf(clock_buf, sizeof(clock_buf),
2855 "%02d:%02d:%02d.%03d",
2856 (int)(secs / 3600),
2857 (int)((secs / 60) % 60),
2858 (int)(secs % 60),
2859 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2860 snprintf(buf, buf_size,
2861 "%-10s%-20s%7s%20s%15s",
2862 sn->id_str, sn->name,
2863 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2864 date_buf,
2865 clock_buf);
2867 return buf;
2870 /**************************************************************/
2871 /* async I/Os */
2873 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2874 QEMUIOVector *qiov, int nb_sectors,
2875 BlockDriverCompletionFunc *cb, void *opaque)
2877 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2879 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2880 cb, opaque, false);
2883 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2884 QEMUIOVector *qiov, int nb_sectors,
2885 BlockDriverCompletionFunc *cb, void *opaque)
2887 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2889 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2890 cb, opaque, true);
2894 typedef struct MultiwriteCB {
2895 int error;
2896 int num_requests;
2897 int num_callbacks;
2898 struct {
2899 BlockDriverCompletionFunc *cb;
2900 void *opaque;
2901 QEMUIOVector *free_qiov;
2902 } callbacks[];
2903 } MultiwriteCB;
2905 static void multiwrite_user_cb(MultiwriteCB *mcb)
2907 int i;
2909 for (i = 0; i < mcb->num_callbacks; i++) {
2910 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2911 if (mcb->callbacks[i].free_qiov) {
2912 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2914 g_free(mcb->callbacks[i].free_qiov);
2918 static void multiwrite_cb(void *opaque, int ret)
2920 MultiwriteCB *mcb = opaque;
2922 trace_multiwrite_cb(mcb, ret);
2924 if (ret < 0 && !mcb->error) {
2925 mcb->error = ret;
2928 mcb->num_requests--;
2929 if (mcb->num_requests == 0) {
2930 multiwrite_user_cb(mcb);
2931 g_free(mcb);
2935 static int multiwrite_req_compare(const void *a, const void *b)
2937 const BlockRequest *req1 = a, *req2 = b;
2940 * Note that we can't simply subtract req2->sector from req1->sector
2941 * here as that could overflow the return value.
2943 if (req1->sector > req2->sector) {
2944 return 1;
2945 } else if (req1->sector < req2->sector) {
2946 return -1;
2947 } else {
2948 return 0;
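/* Illustration of the overflow mentioned above: with req1->sector ==
 * 0x100000000LL and req2->sector == 0, the difference is 2^32, which
 * truncates to 0 when narrowed to the comparator's int return type and
 * would make qsort() treat two distinct requests as equal. */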
2953 * Takes a bunch of requests and tries to merge them. Returns the number of
2954 * requests that remain after merging.
2956 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2957 int num_reqs, MultiwriteCB *mcb)
2959 int i, outidx;
2961 // Sort requests by start sector
2962 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2964 // Check if adjacent requests touch the same clusters. If so, combine them;
2965 // only sequential or overlapping requests are merged, so no zero padding is needed.
2966 outidx = 0;
2967 for (i = 1; i < num_reqs; i++) {
2968 int merge = 0;
2969 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2971 // Handle exactly sequential writes and overlapping writes.
2972 if (reqs[i].sector <= oldreq_last) {
2973 merge = 1;
2976 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2977 merge = 0;
2980 if (merge) {
2981 size_t size;
2982 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
2983 qemu_iovec_init(qiov,
2984 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2986 // Add the first request to the merged one. If the requests are
2987 // overlapping, drop the last sectors of the first request.
2988 size = (reqs[i].sector - reqs[outidx].sector) << 9;
2989 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
2991 // We shouldn't need to add any zeros between the two requests
2992 assert(reqs[i].sector <= oldreq_last);
2994 // Add the second request
2995 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
2997 reqs[outidx].nb_sectors = qiov->size >> 9;
2998 reqs[outidx].qiov = qiov;
3000 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3001 } else {
3002 outidx++;
3003 reqs[outidx].sector = reqs[i].sector;
3004 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3005 reqs[outidx].qiov = reqs[i].qiov;
3009 return outidx + 1;
3013 * Submit multiple AIO write requests at once.
3015 * On success, the function returns 0 and all requests in the reqs array have
3016 * been submitted. In the error case this function returns -1, and any of the
3017 * requests may or may not have been submitted yet. In particular, the
3018 * callback will be called for some of the requests and not for others. The
3019 * caller must check the error field of the BlockRequest to wait for the right
3020 * callbacks (if error != 0, no callback will be called).
3022 * The implementation may modify the contents of the reqs array, e.g. to merge
3023 * requests. However, the fields opaque and error are left unmodified as they
3024 * are used to signal failure for a single request to the caller.
3026 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3028 MultiwriteCB *mcb;
3029 int i;
3031 /* don't submit writes if we don't have a medium */
3032 if (bs->drv == NULL) {
3033 for (i = 0; i < num_reqs; i++) {
3034 reqs[i].error = -ENOMEDIUM;
3036 return -1;
3039 if (num_reqs == 0) {
3040 return 0;
3043 // Create MultiwriteCB structure
3044 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3045 mcb->num_requests = 0;
3046 mcb->num_callbacks = num_reqs;
3048 for (i = 0; i < num_reqs; i++) {
3049 mcb->callbacks[i].cb = reqs[i].cb;
3050 mcb->callbacks[i].opaque = reqs[i].opaque;
3053 // Check for mergeable requests
3054 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3056 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3058 /* Run the aio requests. */
3059 mcb->num_requests = num_reqs;
3060 for (i = 0; i < num_reqs; i++) {
3061 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3062 reqs[i].nb_sectors, multiwrite_cb, mcb);
3065 return 0;
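/* Usage sketch (illustrative, not part of the original file): batching two
 * writes.  'qiov1'/'qiov2' are assumed to be initialized QEMUIOVectors of
 * 8 sectors each and 'write_done' a completion callback. */
#if 0
static void example_multiwrite(BlockDriverState *bs,
                               QEMUIOVector *qiov1, QEMUIOVector *qiov2,
                               BlockDriverCompletionFunc *write_done)
{
    BlockRequest reqs[2] = {
        { .sector = 0, .nb_sectors = 8, .qiov = qiov1,
          .cb = write_done, .opaque = NULL },
        { .sector = 8, .nb_sectors = 8, .qiov = qiov2,
          .cb = write_done, .opaque = NULL },
    };

    /* The two requests are exactly sequential, so multiwrite_merge()
     * combines them into a single bdrv_aio_writev() call. */
    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        /* Check reqs[i].error: requests with error != 0 get no callback. */
    }
}
#endif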
3068 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3070 acb->pool->cancel(acb);
3073 /* block I/O throttling */
3074 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3075 bool is_write, double elapsed_time, uint64_t *wait)
3077 uint64_t bps_limit = 0;
3078 double bytes_limit, bytes_base, bytes_res;
3079 double slice_time, wait_time;
3081 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3082 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3083 } else if (bs->io_limits.bps[is_write]) {
3084 bps_limit = bs->io_limits.bps[is_write];
3085 } else {
3086 if (wait) {
3087 *wait = 0;
3090 return false;
3093 slice_time = bs->slice_end - bs->slice_start;
3094 slice_time /= (NANOSECONDS_PER_SECOND);
3095 bytes_limit = bps_limit * slice_time;
3096 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3097 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3098 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3101 /* bytes_base: the number of bytes which have already been read/written,
3102  * obtained from the accounting statistics of the current slice.
3103  * bytes_res: the remaining bytes of data which need to be read/written.
3104  * (bytes_base + bytes_res) / bps_limit: used to calculate
3105  * the total time for completing reading/writing all data.
3107 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3109 if (bytes_base + bytes_res <= bytes_limit) {
3110 if (wait) {
3111 *wait = 0;
3114 return false;
3117 /* Calc approx time to dispatch */
3118 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3120 /* When the I/O rate at runtime exceeds the limits,
3121  * bs->slice_end needs to be extended so that the current statistics
3122  * are kept until the timer fires; the factor used here was tuned
3123  * experimentally.
3125 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3126 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3127 if (wait) {
3128 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3131 return true;
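/* Worked example (illustrative): with bps_limit = 1000000 bytes/s and a
 * 0.1 s slice, bytes_limit is 100000.  If 90000 bytes were already
 * transferred in this slice (bytes_base) and a 32-sector request arrives
 * (bytes_res = 16384), then 90000 + 16384 > 100000, so the request is
 * throttled and wait_time is roughly 106384/1000000 - elapsed_time. */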
3134 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3135 double elapsed_time, uint64_t *wait)
3137 uint64_t iops_limit = 0;
3138 double ios_limit, ios_base;
3139 double slice_time, wait_time;
3141 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3142 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3143 } else if (bs->io_limits.iops[is_write]) {
3144 iops_limit = bs->io_limits.iops[is_write];
3145 } else {
3146 if (wait) {
3147 *wait = 0;
3150 return false;
3153 slice_time = bs->slice_end - bs->slice_start;
3154 slice_time /= (NANOSECONDS_PER_SECOND);
3155 ios_limit = iops_limit * slice_time;
3156 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3157 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3158 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3161 if (ios_base + 1 <= ios_limit) {
3162 if (wait) {
3163 *wait = 0;
3166 return false;
3169 /* Calc approx time to dispatch */
3170 wait_time = (ios_base + 1) / iops_limit;
3171 if (wait_time > elapsed_time) {
3172 wait_time = wait_time - elapsed_time;
3173 } else {
3174 wait_time = 0;
3177 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3178 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3179 if (wait) {
3180 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3183 return true;
3186 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3187 bool is_write, int64_t *wait)
3189 int64_t now, max_wait;
3190 uint64_t bps_wait = 0, iops_wait = 0;
3191 double elapsed_time;
3192 int bps_ret, iops_ret;
3194 now = qemu_get_clock_ns(vm_clock);
3195 if ((bs->slice_start < now)
3196 && (bs->slice_end > now)) {
3197 bs->slice_end = now + bs->slice_time;
3198 } else {
3199 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3200 bs->slice_start = now;
3201 bs->slice_end = now + bs->slice_time;
3203 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3204 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3206 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3207 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3210 elapsed_time = now - bs->slice_start;
3211 elapsed_time /= (NANOSECONDS_PER_SECOND);
3213 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3214 is_write, elapsed_time, &bps_wait);
3215 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3216 elapsed_time, &iops_wait);
3217 if (bps_ret || iops_ret) {
3218 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3219 if (wait) {
3220 *wait = max_wait;
3223 now = qemu_get_clock_ns(vm_clock);
3224 if (bs->slice_end < now + max_wait) {
3225 bs->slice_end = now + max_wait;
3228 return true;
3231 if (wait) {
3232 *wait = 0;
3235 return false;
3238 /**************************************************************/
3239 /* async block device emulation */
3241 typedef struct BlockDriverAIOCBSync {
3242 BlockDriverAIOCB common;
3243 QEMUBH *bh;
3244 int ret;
3245 /* vector translation state */
3246 QEMUIOVector *qiov;
3247 uint8_t *bounce;
3248 int is_write;
3249 } BlockDriverAIOCBSync;
3251 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3253 BlockDriverAIOCBSync *acb =
3254 container_of(blockacb, BlockDriverAIOCBSync, common);
3255 qemu_bh_delete(acb->bh);
3256 acb->bh = NULL;
3257 qemu_aio_release(acb);
3260 static AIOPool bdrv_em_aio_pool = {
3261 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3262 .cancel = bdrv_aio_cancel_em,
3265 static void bdrv_aio_bh_cb(void *opaque)
3267 BlockDriverAIOCBSync *acb = opaque;
3269 if (!acb->is_write)
3270 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3271 qemu_vfree(acb->bounce);
3272 acb->common.cb(acb->common.opaque, acb->ret);
3273 qemu_bh_delete(acb->bh);
3274 acb->bh = NULL;
3275 qemu_aio_release(acb);
3278 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3279 int64_t sector_num,
3280 QEMUIOVector *qiov,
3281 int nb_sectors,
3282 BlockDriverCompletionFunc *cb,
3283 void *opaque,
3284 int is_write)
3287 BlockDriverAIOCBSync *acb;
3289 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3290 acb->is_write = is_write;
3291 acb->qiov = qiov;
3292 acb->bounce = qemu_blockalign(bs, qiov->size);
3293 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3295 if (is_write) {
3296 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3297 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3298 } else {
3299 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3302 qemu_bh_schedule(acb->bh);
3304 return &acb->common;
3307 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3308 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3309 BlockDriverCompletionFunc *cb, void *opaque)
3311 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3314 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3315 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3316 BlockDriverCompletionFunc *cb, void *opaque)
3318 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3322 typedef struct BlockDriverAIOCBCoroutine {
3323 BlockDriverAIOCB common;
3324 BlockRequest req;
3325 bool is_write;
3326 QEMUBH* bh;
3327 } BlockDriverAIOCBCoroutine;
3329 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3331 qemu_aio_flush();
3334 static AIOPool bdrv_em_co_aio_pool = {
3335 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3336 .cancel = bdrv_aio_co_cancel_em,
3339 static void bdrv_co_em_bh(void *opaque)
3341 BlockDriverAIOCBCoroutine *acb = opaque;
3343 acb->common.cb(acb->common.opaque, acb->req.error);
3344 qemu_bh_delete(acb->bh);
3345 qemu_aio_release(acb);
3348 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3349 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3351 BlockDriverAIOCBCoroutine *acb = opaque;
3352 BlockDriverState *bs = acb->common.bs;
3354 if (!acb->is_write) {
3355 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3356 acb->req.nb_sectors, acb->req.qiov, 0);
3357 } else {
3358 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3359 acb->req.nb_sectors, acb->req.qiov, 0);
3362 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3363 qemu_bh_schedule(acb->bh);
3366 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3367 int64_t sector_num,
3368 QEMUIOVector *qiov,
3369 int nb_sectors,
3370 BlockDriverCompletionFunc *cb,
3371 void *opaque,
3372 bool is_write)
3374 Coroutine *co;
3375 BlockDriverAIOCBCoroutine *acb;
3377 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3378 acb->req.sector = sector_num;
3379 acb->req.nb_sectors = nb_sectors;
3380 acb->req.qiov = qiov;
3381 acb->is_write = is_write;
3383 co = qemu_coroutine_create(bdrv_co_do_rw);
3384 qemu_coroutine_enter(co, acb);
3386 return &acb->common;
3389 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3391 BlockDriverAIOCBCoroutine *acb = opaque;
3392 BlockDriverState *bs = acb->common.bs;
3394 acb->req.error = bdrv_co_flush(bs);
3395 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3396 qemu_bh_schedule(acb->bh);
3399 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3400 BlockDriverCompletionFunc *cb, void *opaque)
3402 trace_bdrv_aio_flush(bs, opaque);
3404 Coroutine *co;
3405 BlockDriverAIOCBCoroutine *acb;
3407 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3408 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3409 qemu_coroutine_enter(co, acb);
3411 return &acb->common;
3414 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3416 BlockDriverAIOCBCoroutine *acb = opaque;
3417 BlockDriverState *bs = acb->common.bs;
3419 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3420 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3421 qemu_bh_schedule(acb->bh);
3424 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3425 int64_t sector_num, int nb_sectors,
3426 BlockDriverCompletionFunc *cb, void *opaque)
3428 Coroutine *co;
3429 BlockDriverAIOCBCoroutine *acb;
3431 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3433 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3434 acb->req.sector = sector_num;
3435 acb->req.nb_sectors = nb_sectors;
3436 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3437 qemu_coroutine_enter(co, acb);
3439 return &acb->common;
3442 void bdrv_init(void)
3444 module_call_init(MODULE_INIT_BLOCK);
3447 void bdrv_init_with_whitelist(void)
3449 use_bdrv_whitelist = 1;
3450 bdrv_init();
3453 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3454 BlockDriverCompletionFunc *cb, void *opaque)
3456 BlockDriverAIOCB *acb;
3458 if (pool->free_aiocb) {
3459 acb = pool->free_aiocb;
3460 pool->free_aiocb = acb->next;
3461 } else {
3462 acb = g_malloc0(pool->aiocb_size);
3463 acb->pool = pool;
3465 acb->bs = bs;
3466 acb->cb = cb;
3467 acb->opaque = opaque;
3468 return acb;
3471 void qemu_aio_release(void *p)
3473 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3474 AIOPool *pool = acb->pool;
3475 acb->next = pool->free_aiocb;
3476 pool->free_aiocb = acb;
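/* qemu_aio_get() and qemu_aio_release() above implement a simple per-pool
 * LIFO free list: releasing pushes the AIOCB onto pool->free_aiocb rather
 * than freeing it, and allocation pops from that list before falling back
 * to g_malloc0(), so completed control blocks are recycled cheaply. */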
3479 /**************************************************************/
3480 /* Coroutine block device emulation */
3482 typedef struct CoroutineIOCompletion {
3483 Coroutine *coroutine;
3484 int ret;
3485 } CoroutineIOCompletion;
3487 static void bdrv_co_io_em_complete(void *opaque, int ret)
3489 CoroutineIOCompletion *co = opaque;
3491 co->ret = ret;
3492 qemu_coroutine_enter(co->coroutine, NULL);
3495 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3496 int nb_sectors, QEMUIOVector *iov,
3497 bool is_write)
3499 CoroutineIOCompletion co = {
3500 .coroutine = qemu_coroutine_self(),
3502 BlockDriverAIOCB *acb;
3504 if (is_write) {
3505 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3506 bdrv_co_io_em_complete, &co);
3507 } else {
3508 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3509 bdrv_co_io_em_complete, &co);
3512 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3513 if (!acb) {
3514 return -EIO;
3516 qemu_coroutine_yield();
3518 return co.ret;
3521 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3522 int64_t sector_num, int nb_sectors,
3523 QEMUIOVector *iov)
3525 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3528 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3529 int64_t sector_num, int nb_sectors,
3530 QEMUIOVector *iov)
3532 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3535 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3537 RwCo *rwco = opaque;
3539 rwco->ret = bdrv_co_flush(rwco->bs);
3542 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3544 int ret;
3546 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3547 return 0;
3550 /* Write back cached data to the OS even with cache=unsafe */
3551 if (bs->drv->bdrv_co_flush_to_os) {
3552 ret = bs->drv->bdrv_co_flush_to_os(bs);
3553 if (ret < 0) {
3554 return ret;
3558 /* But don't actually force it to the disk with cache=unsafe */
3559 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3560 return 0;
3563 if (bs->drv->bdrv_co_flush_to_disk) {
3564 ret = bs->drv->bdrv_co_flush_to_disk(bs);
3565 } else if (bs->drv->bdrv_aio_flush) {
3566 BlockDriverAIOCB *acb;
3567 CoroutineIOCompletion co = {
3568 .coroutine = qemu_coroutine_self(),
3571 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3572 if (acb == NULL) {
3573 ret = -EIO;
3574 } else {
3575 qemu_coroutine_yield();
3576 ret = co.ret;
3578 } else {
3580 * Some block drivers always operate in either writethrough or unsafe
3581 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3582 * know how the server works (because the behaviour is hardcoded or
3583 * depends on server-side configuration), so we can't ensure that
3584 * everything is safe on disk. Returning an error doesn't work because
3585 * that would break guests even if the server operates in writethrough
3586 * mode.
3588 * Let's hope the user knows what he's doing.
3590 ret = 0;
3592 if (ret < 0) {
3593 return ret;
3596 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH set
3597 * in the case of cache=unsafe, so there are no useless flushes.
3599 return bdrv_co_flush(bs->file);
3602 void bdrv_invalidate_cache(BlockDriverState *bs)
3604 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3605 bs->drv->bdrv_invalidate_cache(bs);
3609 void bdrv_invalidate_cache_all(void)
3611 BlockDriverState *bs;
3613 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3614 bdrv_invalidate_cache(bs);
3618 int bdrv_flush(BlockDriverState *bs)
3620 Coroutine *co;
3621 RwCo rwco = {
3622 .bs = bs,
3623 .ret = NOT_DONE,
3626 if (qemu_in_coroutine()) {
3627 /* Fast-path if already in coroutine context */
3628 bdrv_flush_co_entry(&rwco);
3629 } else {
3630 co = qemu_coroutine_create(bdrv_flush_co_entry);
3631 qemu_coroutine_enter(co, &rwco);
3632 while (rwco.ret == NOT_DONE) {
3633 qemu_aio_wait();
3637 return rwco.ret;
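/* bdrv_flush() shows the synchronous-wrapper idiom used throughout this
 * file: when already in coroutine context the entry function is called
 * directly; otherwise a coroutine is created and qemu_aio_wait() pumps
 * completions until rwco.ret leaves the NOT_DONE sentinel.  bdrv_discard()
 * below follows the same shape. */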
3640 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3642 RwCo *rwco = opaque;
3644 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3647 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3648 int nb_sectors)
3650 if (!bs->drv) {
3651 return -ENOMEDIUM;
3652 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3653 return -EIO;
3654 } else if (bs->read_only) {
3655 return -EROFS;
3656 } else if (bs->drv->bdrv_co_discard) {
3657 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3658 } else if (bs->drv->bdrv_aio_discard) {
3659 BlockDriverAIOCB *acb;
3660 CoroutineIOCompletion co = {
3661 .coroutine = qemu_coroutine_self(),
3664 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3665 bdrv_co_io_em_complete, &co);
3666 if (acb == NULL) {
3667 return -EIO;
3668 } else {
3669 qemu_coroutine_yield();
3670 return co.ret;
3672 } else {
3673 return 0;
3677 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3679 Coroutine *co;
3680 RwCo rwco = {
3681 .bs = bs,
3682 .sector_num = sector_num,
3683 .nb_sectors = nb_sectors,
3684 .ret = NOT_DONE,
3687 if (qemu_in_coroutine()) {
3688 /* Fast-path if already in coroutine context */
3689 bdrv_discard_co_entry(&rwco);
3690 } else {
3691 co = qemu_coroutine_create(bdrv_discard_co_entry);
3692 qemu_coroutine_enter(co, &rwco);
3693 while (rwco.ret == NOT_DONE) {
3694 qemu_aio_wait();
3698 return rwco.ret;
3701 /**************************************************************/
3702 /* removable device support */
3705 * Return TRUE if the media is present
3707 int bdrv_is_inserted(BlockDriverState *bs)
3709 BlockDriver *drv = bs->drv;
3711 if (!drv)
3712 return 0;
3713 if (!drv->bdrv_is_inserted)
3714 return 1;
3715 return drv->bdrv_is_inserted(bs);
3719 * Return whether the media changed since the last call to this
3720 * function, or -ENOTSUP if we don't know. Most drivers don't know.
3722 int bdrv_media_changed(BlockDriverState *bs)
3724 BlockDriver *drv = bs->drv;
3726 if (drv && drv->bdrv_media_changed) {
3727 return drv->bdrv_media_changed(bs);
3729 return -ENOTSUP;
3733 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3735 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3737 BlockDriver *drv = bs->drv;
3739 if (drv && drv->bdrv_eject) {
3740 drv->bdrv_eject(bs, eject_flag);
3743 if (bs->device_name[0] != '\0') {
3744 bdrv_emit_qmp_eject_event(bs, eject_flag);
3749 * Lock or unlock the media (if it is locked, the user won't be able
3750 * to eject it manually).
3752 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3754 BlockDriver *drv = bs->drv;
3756 trace_bdrv_lock_medium(bs, locked);
3758 if (drv && drv->bdrv_lock_medium) {
3759 drv->bdrv_lock_medium(bs, locked);
3763 /* needed for generic scsi interface */
3765 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3767 BlockDriver *drv = bs->drv;
3769 if (drv && drv->bdrv_ioctl)
3770 return drv->bdrv_ioctl(bs, req, buf);
3771 return -ENOTSUP;
3774 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3775 unsigned long int req, void *buf,
3776 BlockDriverCompletionFunc *cb, void *opaque)
3778 BlockDriver *drv = bs->drv;
3780 if (drv && drv->bdrv_aio_ioctl)
3781 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3782 return NULL;
3785 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3787 bs->buffer_alignment = align;
3790 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3792 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3795 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3797 int64_t bitmap_size;
3799 bs->dirty_count = 0;
3800 if (enable) {
3801 if (!bs->dirty_bitmap) {
3802 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3803 BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3804 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3806 bs->dirty_bitmap = g_malloc0(bitmap_size);
3808 } else {
3809 if (bs->dirty_bitmap) {
3810 g_free(bs->dirty_bitmap);
3811 bs->dirty_bitmap = NULL;
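/* Sizing example for the bitmap above (illustrative): one bit covers
 * BDRV_SECTORS_PER_DIRTY_CHUNK sectors, so the allocation amounts to
 *     bitmap_size = ceil(nb_sectors / (BDRV_SECTORS_PER_DIRTY_CHUNK * 8))
 * bytes.  Assuming 2048-sector (1 MB) chunks, a 10 GB image has 20971520
 * sectors and needs 20971520 / (2048 * 8) = 1280 bytes of bitmap. */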
3816 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3818 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3820 if (bs->dirty_bitmap &&
3821 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3822 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3823 (1UL << (chunk % (sizeof(unsigned long) * 8))));
3824 } else {
3825 return 0;
3829 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3830 int nr_sectors)
3832 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3835 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3837 return bs->dirty_count;
3840 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3842 assert(bs->in_use != in_use);
3843 bs->in_use = in_use;
3846 int bdrv_in_use(BlockDriverState *bs)
3848 return bs->in_use;
3851 void bdrv_iostatus_enable(BlockDriverState *bs)
3853 bs->iostatus_enabled = true;
3854 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3857 /* The I/O status is only enabled if the drive explicitly
3858 * enables it _and_ the VM is configured to stop on errors */
3859 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3861 return (bs->iostatus_enabled &&
3862 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3863 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
3864 bs->on_read_error == BLOCK_ERR_STOP_ANY));
3867 void bdrv_iostatus_disable(BlockDriverState *bs)
3869 bs->iostatus_enabled = false;
3872 void bdrv_iostatus_reset(BlockDriverState *bs)
3874 if (bdrv_iostatus_is_enabled(bs)) {
3875 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3879 /* XXX: Today this is set by device models because it makes the implementation
3880 quite simple. However, the block layer knows about the error, so it's
3881 possible to implement this without device models being involved */
3882 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3884 if (bdrv_iostatus_is_enabled(bs) &&
3885 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3886 assert(error >= 0);
3887 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3888 BLOCK_DEVICE_IO_STATUS_FAILED;
3892 void
3893 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3894 enum BlockAcctType type)
3896 assert(type < BDRV_MAX_IOTYPE);
3898 cookie->bytes = bytes;
3899 cookie->start_time_ns = get_clock();
3900 cookie->type = type;
3903 void
3904 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3906 assert(cookie->type < BDRV_MAX_IOTYPE);
3908 bs->nr_bytes[cookie->type] += cookie->bytes;
3909 bs->nr_ops[cookie->type]++;
3910 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
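/* Usage sketch (illustrative, not part of the original file): how a device
 * model is expected to bracket an I/O operation with the accounting pair
 * above.  'example_account_read' is a hypothetical helper. */
#if 0
static void example_account_read(BlockDriverState *bs)
{
    BlockAcctCookie cookie;
    uint8_t buf[512];

    bdrv_acct_start(bs, &cookie, sizeof(buf), BDRV_ACCT_READ);
    if (bdrv_read(bs, 0, buf, 1) >= 0) {
        /* updates nr_bytes, nr_ops and total_time_ns for BDRV_ACCT_READ */
        bdrv_acct_done(bs, &cookie);
    }
}
#endif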
3913 int bdrv_img_create(const char *filename, const char *fmt,
3914 const char *base_filename, const char *base_fmt,
3915 char *options, uint64_t img_size, int flags)
3917 QEMUOptionParameter *param = NULL, *create_options = NULL;
3918 QEMUOptionParameter *backing_fmt, *backing_file, *size;
3919 BlockDriverState *bs = NULL;
3920 BlockDriver *drv, *proto_drv;
3921 BlockDriver *backing_drv = NULL;
3922 int ret = 0;
3924 /* Find driver and parse its options */
3925 drv = bdrv_find_format(fmt);
3926 if (!drv) {
3927 error_report("Unknown file format '%s'", fmt);
3928 ret = -EINVAL;
3929 goto out;
3932 proto_drv = bdrv_find_protocol(filename);
3933 if (!proto_drv) {
3934 error_report("Unknown protocol '%s'", filename);
3935 ret = -EINVAL;
3936 goto out;
3939 create_options = append_option_parameters(create_options,
3940 drv->create_options);
3941 create_options = append_option_parameters(create_options,
3942 proto_drv->create_options);
3944 /* Create parameter list with default values */
3945 param = parse_option_parameters("", create_options, param);
3947 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3949 /* Parse -o options */
3950 if (options) {
3951 param = parse_option_parameters(options, create_options, param);
3952 if (param == NULL) {
3953 error_report("Invalid options for file format '%s'.", fmt);
3954 ret = -EINVAL;
3955 goto out;
3959 if (base_filename) {
3960 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3961 base_filename)) {
3962 error_report("Backing file not supported for file format '%s'",
3963 fmt);
3964 ret = -EINVAL;
3965 goto out;
3969 if (base_fmt) {
3970 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3971 error_report("Backing file format not supported for file "
3972 "format '%s'", fmt);
3973 ret = -EINVAL;
3974 goto out;
3978 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3979 if (backing_file && backing_file->value.s) {
3980 if (!strcmp(filename, backing_file->value.s)) {
3981 error_report("Error: Trying to create an image with the "
3982 "same filename as the backing file");
3983 ret = -EINVAL;
3984 goto out;
3988 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
3989 if (backing_fmt && backing_fmt->value.s) {
3990 backing_drv = bdrv_find_format(backing_fmt->value.s);
3991 if (!backing_drv) {
3992 error_report("Unknown backing file format '%s'",
3993 backing_fmt->value.s);
3994 ret = -EINVAL;
3995 goto out;
3999 // The size for the image must always be specified, with one exception:
4000 // If we are using a backing file, we can obtain the size from there
4001 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4002 if (size && size->value.n == -1) {
4003 if (backing_file && backing_file->value.s) {
4004 uint64_t size;
4005 char buf[32];
4007 bs = bdrv_new("");
4009 ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
4010 if (ret < 0) {
4011 error_report("Could not open '%s'", backing_file->value.s);
4012 goto out;
4014 bdrv_get_geometry(bs, &size);
4015 size *= 512;
4017 snprintf(buf, sizeof(buf), "%" PRId64, size);
4018 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4019 } else {
4020 error_report("Image creation needs a size parameter");
4021 ret = -EINVAL;
4022 goto out;
4026 printf("Formatting '%s', fmt=%s ", filename, fmt);
4027 print_option_parameters(param);
4028 puts("");
4030 ret = bdrv_create(drv, filename, param);
4032 if (ret < 0) {
4033 if (ret == -ENOTSUP) {
4034 error_report("Formatting or formatting option not supported for "
4035 "file format '%s'", fmt);
4036 } else if (ret == -EFBIG) {
4037 error_report("The image size is too large for file format '%s'",
4038 fmt);
4039 } else {
4040 error_report("%s: error while creating %s: %s", filename, fmt,
4041 strerror(-ret));
4045 out:
4046 free_option_parameters(create_options);
4047 free_option_parameters(param);
4049 if (bs) {
4050 bdrv_delete(bs);
4053 return ret;
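/* Usage sketch (illustrative, not part of the original file): creating a
 * 1 GB qcow2 image with default options and no backing file, as qemu-img
 * would. */
#if 0
static void example_img_create(void)
{
    int ret = bdrv_img_create("test.qcow2", "qcow2",
                              NULL, NULL, NULL,
                              (uint64_t)1 << 30, 0);
    if (ret < 0) {
        /* the cause has already been printed via error_report() */
    }
}
#endif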
4056 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4057 BlockDriverCompletionFunc *cb, void *opaque)
4059 BlockJob *job;
4061 if (bs->job || bdrv_in_use(bs)) {
4062 return NULL;
4064 bdrv_set_in_use(bs, 1);
4066 job = g_malloc0(job_type->instance_size);
4067 job->job_type = job_type;
4068 job->bs = bs;
4069 job->cb = cb;
4070 job->opaque = opaque;
4071 bs->job = job;
4072 return job;
4075 void block_job_complete(BlockJob *job, int ret)
4077 BlockDriverState *bs = job->bs;
4079 assert(bs->job == job);
4080 job->cb(job->opaque, ret);
4081 bs->job = NULL;
4082 g_free(job);
4083 bdrv_set_in_use(bs, 0);
4086 int block_job_set_speed(BlockJob *job, int64_t value)
4088 int rc;
4090 if (!job->job_type->set_speed) {
4091 return -ENOTSUP;
4093 rc = job->job_type->set_speed(job, value);
4094 if (rc == 0) {
4095 job->speed = value;
4097 return rc;
4100 void block_job_cancel(BlockJob *job)
4102 job->cancelled = true;
4105 bool block_job_is_cancelled(BlockJob *job)
4107 return job->cancelled;
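/* Usage sketch (illustrative, not part of the original file): the generic
 * lifecycle of the block job API above.  'job_type' and 'cb' stand in for
 * a concrete job implementation such as image streaming. */
#if 0
static void example_start_job(BlockDriverState *bs,
                              const BlockJobType *job_type,
                              BlockDriverCompletionFunc *cb)
{
    BlockJob *job = block_job_create(job_type, bs, cb, NULL);
    if (!job) {
        return;  /* bs already has a job or is otherwise in use */
    }
    block_job_set_speed(job, 1024 * 1024);  /* may return -ENOTSUP */
    /* The job's coroutine now runs until it calls block_job_complete(),
     * which invokes 'cb' and releases bs with bdrv_set_in_use(bs, 0). */
}
#endif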
4110 void block_job_cancel_sync(BlockJob *job)
4112 BlockDriverState *bs = job->bs;
4114 assert(bs->job == job);
4115 block_job_cancel(job);
4116 while (bs->job != NULL && bs->job->busy) {
4117 qemu_aio_wait();