/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "module.h"
#include "qjson.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests are handled in FIFO order: the next throttled request is not
     * dequeued until the current request has been allowed to proceed. So if
     * the current request still exceeds the limits, it is re-inserted at the
     * head of the queue, and all requests that arrived after it remain queued
     * behind it in throttled_reqs.
     */
    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}
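
/* Illustrative flow, added as a comment for clarity (not in the original
 * source): a guest read entering bdrv_co_do_readv() below calls
 *
 *     bdrv_io_limits_intercept(bs, false, nb_sectors);
 *
 * which first waits behind any already-throttled requests, then sleeps on
 * bs->block_timer for the computed wait_time whenever the request would
 * exceed the configured bps/iops limits, and finally wakes up the next
 * queued request once it is allowed through.
 */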
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    return strchr(path, ':') != NULL;
}

int path_is_absolute(const char *path)
{
    const char *p;
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\')
        return 1;
#endif
    p = strchr(path, ':');
    if (p)
        p++;
    else
        p = path;
#ifdef _WIN32
    return (*p == '/' || *p == '\\');
#else
    return (*p == '/');
#endif
}
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
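
/* Illustrative example, added for clarity (not in the original source):
 * combining a relative backing file name with the path of its image,
 *
 *     path_combine(dest, sizeof(dest), "/images/vm.qcow2", "base.qcow2");
 *
 * yields "/images/base.qcow2", while an absolute filename such as
 * "/other/base.qcow2" would be copied into dest unchanged.
 */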
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QEMUOptionParameter *options)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
    };

    if (!drv->bdrv_create) {
        return -ENOTSUP;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    g_free(cco.filename);

    return ret;
}
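
/* Usage sketch, added for illustration (not part of the original file):
 * creating a 1 MiB qcow2 image from non-coroutine context, using the option
 * helpers that also appear in bdrv_open() below. The image name is an
 * arbitrary example.
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *opts =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1024 * 1024);
 *     int ret = bdrv_create(drv, "example.qcow2", opts);
 *     free_option_parameters(opts);
 */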
int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}

#ifdef _WIN32
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    close(fd);
}
#endif

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}
/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
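
/* Summary of the mapping above, added for clarity:
 *
 *   mode           flags set
 *   off/none       BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *   directsync     BDRV_O_NOCACHE
 *   writeback      BDRV_O_CACHE_WB
 *   unsafe         BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 *   writethrough   (none; this is the default)
 */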
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
                            int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->sg = 0;
    bs->open_flags = flags;
    bs->growable = 0;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);
    bs->backing_file[0] = '\0';

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");

        if (path_has_protocol(bs->backing_file)) {
            pstrcpy(backing_filename, sizeof(backing_filename),
                    bs->backing_file);
        } else {
            path_combine(backing_filename, sizeof(backing_filename),
                         filename, bs->backing_file);
        }

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}
void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->drv) {
        if (bs->job) {
            block_job_cancel_sync(bs->job);
        }
        bdrv_drain_all();

        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;

        if (bs->file != NULL) {
            bdrv_close(bs->file);
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;

    qemu_aio_flush();

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* make a BlockDriverState anonymous by removing from bdrv_state list.
   Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous */
    assert(bs_new->device_name[0] == '\0');

    tmp = *bs_new;

    /* there are some fields that need to stay on the top layer: */

    /* dev info */
    tmp.dev_ops           = bs_top->dev_ops;
    tmp.dev_opaque        = bs_top->dev_opaque;
    tmp.dev               = bs_top->dev;
    tmp.buffer_alignment  = bs_top->buffer_alignment;
    tmp.copy_on_read      = bs_top->copy_on_read;

    /* i/o timing parameters */
    tmp.slice_time        = bs_top->slice_time;
    tmp.slice_start       = bs_top->slice_start;
    tmp.slice_end         = bs_top->slice_end;
    tmp.io_limits         = bs_top->io_limits;
    tmp.io_base           = bs_top->io_base;
    tmp.throttled_reqs    = bs_top->throttled_reqs;
    tmp.block_timer       = bs_top->block_timer;
    tmp.io_limits_enabled = bs_top->io_limits_enabled;

    /* geometry */
    tmp.cyls              = bs_top->cyls;
    tmp.heads             = bs_top->heads;
    tmp.secs              = bs_top->secs;
    tmp.translation       = bs_top->translation;

    /* r/w error */
    tmp.on_read_error     = bs_top->on_read_error;
    tmp.on_write_error    = bs_top->on_write_error;

    /* i/o status */
    tmp.iostatus_enabled  = bs_top->iostatus_enabled;
    tmp.iostatus          = bs_top->iostatus;

    /* keep the same entry in bdrv_states */
    pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
    tmp.list = bs_top->list;

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    tmp.backing_hd = bs_new;
    pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
    bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));

    /* swap contents of the fixed new bs and the current top */
    *bs_new = *bs_top;
    *bs_top = tmp;

    /* device_name[] was carried over from the old bs_top.  bs_new
     * shouldn't be in bdrv_states, so we need to make device_name[]
     * reflect the anonymity of bs_new
     */
    bs_new->device_name[0] = '\0';

    /* clear the copied fields in the new backing file */
    bdrv_detach_dev(bs_new, bs_new->dev);

    qemu_co_queue_init(&bs_new->throttled_reqs);
    memset(&bs_new->io_base,   0, sizeof(bs_new->io_base));
    memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
    bdrv_iostatus_disable(bs_new);

    /* we don't use bdrv_io_limits_disable() for this, because we don't want
     * to affect or delete the block_timer, as it has been moved to bs_top */
    bs_new->io_limits_enabled = false;
    bs_new->block_timer       = NULL;
    bs_new->slice_time        = 0;
    bs_new->slice_start       = 0;
    bs_new->slice_end         = 0;
}
void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);
    if (bs->file != NULL) {
        bdrv_delete(bs->file);
    }

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               BlockQMPEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res);
}
#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    BlockDriver *backing_drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0, rw_ret = 0;
    uint8_t *buf;
    char filename[1024];
    BlockDriverState *bs_rw, *bs_ro;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bs->backing_hd->keep_read_only) {
        return -EACCES;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    backing_drv = bs->backing_hd->drv;
    ro = bs->backing_hd->read_only;
    strncpy(filename, bs->backing_hd->filename, sizeof(filename));
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        /* re-open as RW */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_rw = bdrv_new("");
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
            backing_drv);
        if (rw_ret < 0) {
            bdrv_delete(bs_rw);
            /* try to re-open read-only */
            bs_ro = bdrv_new("");
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                backing_drv);
            if (ret < 0) {
                bdrv_delete(bs_ro);
                /* drive not functional anymore */
                bs->drv = NULL;
                return ret;
            }
            bs->backing_hd = bs_ro;
            return rw_ret;
        }
        bs->backing_hd = bs_rw;
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* re-open as RO */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_ro = bdrv_new("");
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
            backing_drv);
        if (ret < 0) {
            bdrv_delete(bs_ro);
            /* drive not functional anymore */
            bs->drv = NULL;
            return ret;
        }
        bs->backing_hd = bs_ro;
        bs->backing_hd->keep_read_only = 0;
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_commit(bs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}
struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
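
/* Worked example, added for illustration (not in the original source): with
 * a 64 KiB cluster size, c = 65536 / 512 = 128 sectors. A request covering
 * sectors [130, 140) is rounded down to cluster_sector_num = 128 and up to
 * cluster_nb_sectors = 128, i.e. the containing cluster [128, 256).
 */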
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                          cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;

    if (drv->bdrv_change_backing_file != NULL) {
        return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        return -ENOTSUP;
    }
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}
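
/* Illustrative note, added for clarity (not in the original source): each
 * bit in the dirty bitmap covers BDRV_SECTORS_PER_DIRTY_CHUNK sectors, so a
 * request spanning [sector_num, sector_num + nb_sectors) touches every chunk
 * the range overlaps, and bs->dirty_count always equals the number of bits
 * currently set.
 */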
/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
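
/* Worked example, added for illustration (not in the original source):
 * bdrv_pread(bs, 1000, buf, 2000) first reads sector 1 and copies the 24
 * bytes from offset 1000 up to the 1024-byte sector boundary, then reads
 * sectors 2-4 (1536 bytes) directly into the caller's buffer, and finally
 * reads sector 5 to copy the remaining 440 bytes.
 */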
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that use O_DSYNC */
    if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
        bdrv_flush(bs);
    }

    return 0;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
                           nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}
/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}
/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

struct partition {
        uint8_t boot_ind;           /* 0x80 - active */
        uint8_t head;               /* starting head */
        uint8_t sector;             /* starting sector */
        uint8_t cyl;                /* starting cylinder */
        uint8_t sys_ind;            /* What partition type */
        uint8_t end_head;           /* end head */
        uint8_t end_sector;         /* end sector */
        uint8_t end_cyl;            /* end cylinder */
        uint32_t start_sect;        /* starting sector counting from 0 */
        uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;
/* try to guess the disk logical geometry from the MSDOS partition table.
   Return 0 if OK, -1 if could not guess */
static int guess_disk_lchs(BlockDriverState *bs,
                           int *pcylinders, int *pheads, int *psectors)
{
    uint8_t buf[BDRV_SECTOR_SIZE];
    int ret, i, heads, sectors, cylinders;
    struct partition *p;
    uint32_t nr_sects;
    uint64_t nb_sectors;
    bool enabled;

    bdrv_get_geometry(bs, &nb_sectors);

    /**
     * The function will be invoked during startup not only in sync I/O mode,
     * but also in async I/O mode. So the I/O throttling function has to
     * be disabled temporarily here, not permanently.
     */
    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, 0, buf, 1);
    bs->io_limits_enabled = enabled;
    if (ret < 0)
        return -1;
    /* test msdos magic */
    if (buf[510] != 0x55 || buf[511] != 0xaa)
        return -1;
    for (i = 0; i < 4; i++) {
        p = ((struct partition *)(buf + 0x1be)) + i;
        nr_sects = le32_to_cpu(p->nr_sects);
        if (nr_sects && p->end_head) {
            /* We make the assumption that the partition terminates on
               a cylinder boundary */
            heads = p->end_head + 1;
            sectors = p->end_sector & 63;
            if (sectors == 0)
                continue;
            cylinders = nb_sectors / (heads * sectors);
            if (cylinders < 1 || cylinders > 16383)
                continue;
            *pheads = heads;
            *psectors = sectors;
            *pcylinders = cylinders;
#if 0
            printf("guessed geometry: LCHS=%d %d %d\n",
                   cylinders, heads, sectors);
#endif
            return 0;
        }
    }
    return -1;
}
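
/* Illustrative example, added for clarity (not in the original source): a
 * partition entry ending at head 15, sector 63 on a 1032192-sector disk
 * yields heads = 16, sectors = 63 and cylinders = 1032192 / (16 * 63) =
 * 1024, i.e. the classic 1024/16/63 logical geometry.
 */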
void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
{
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
        *pcyls = cylinders;
        *pheads = heads;
        *psecs = secs;
    } else {
        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
            if (heads > 16) {
                /* if heads > 16, it means that a BIOS LBA
                   translation was active, so the default
                   hardware geometry is OK */
                lba_detected = 1;
                goto default_geometry;
            } else {
                *pcyls = cylinders;
                *pheads = heads;
                *psecs = secs;
                /* disable any translation to be in sync with
                   the logical geometry */
                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_NONE);
                }
            }
        } else {
        default_geometry:
            /* if no geometry, use a standard physical disk geometry */
            cylinders = nb_sectors / (16 * 63);

            if (cylinders > 16383)
                cylinders = 16383;
            else if (cylinders < 2)
                cylinders = 2;
            *pcyls = cylinders;
            *pheads = 16;
            *psecs = 63;
            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
                if ((*pcyls * *pheads) <= 131072) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LARGE);
                } else {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LBA);
                }
            }
        }
        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
    }
}
void bdrv_set_geometry_hint(BlockDriverState *bs,
                            int cyls, int heads, int secs)
{
    bs->cyls = cyls;
    bs->heads = heads;
    bs->secs = secs;
}

void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
{
    bs->translation = translation;
}

void bdrv_get_geometry_hint(BlockDriverState *bs,
                            int *pcyls, int *pheads, int *psecs)
{
    *pcyls = bs->cyls;
    *pheads = bs->heads;
    *psecs = bs->secs;
}

/* throttling disk io limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}

/* Recognize floppy formats */
typedef struct FDFormat {
    FDriveType drive;
    uint8_t last_sect;
    uint8_t max_track;
    uint8_t max_head;
    FDriveRate rate;
} FDFormat;

static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120,  9, 40, 0, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120,  8, 40, 0, FDRIVE_RATE_250K, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, FDRIVE_RATE_250K, },
    /* end */
    { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
};
2236 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2237 int *max_track, int *last_sect,
2238 FDriveType drive_in, FDriveType *drive,
2239 FDriveRate *rate)
2241 const FDFormat *parse;
2242 uint64_t nb_sectors, size;
2243 int i, first_match, match;
2245 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2246 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2247 /* User defined disk */
2248 *rate = FDRIVE_RATE_500K;
2249 } else {
2250 bdrv_get_geometry(bs, &nb_sectors);
2251 match = -1;
2252 first_match = -1;
2253 for (i = 0; ; i++) {
2254 parse = &fd_formats[i];
2255 if (parse->drive == FDRIVE_DRV_NONE) {
2256 break;
2258 if (drive_in == parse->drive ||
2259 drive_in == FDRIVE_DRV_NONE) {
2260 size = (parse->max_head + 1) * parse->max_track *
2261 parse->last_sect;
2262 if (nb_sectors == size) {
2263 match = i;
2264 break;
2266 if (first_match == -1) {
2267 first_match = i;
2271 if (match == -1) {
2272 if (first_match == -1) {
2273                 match = 0; /* first entry is the default format */
2274 } else {
2275 match = first_match;
2277 parse = &fd_formats[match];
2279 *nb_heads = parse->max_head + 1;
2280 *max_track = parse->max_track;
2281 *last_sect = parse->last_sect;
2282 *drive = parse->drive;
2283 *rate = parse->rate;
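/*
 * Worked example: a raw 1.44 MB image is 2880 sectors. Scanning the table
 * above, the first entry whose (max_head + 1) * max_track * last_sect
 * equals 2 * 80 * 18 = 2880 is the first FDRIVE_DRV_144 entry, so the
 * image is reported as a 3"1/2 high-density disk at a 500 kbit/s rate.
 * If no entry matches the size exactly, the first entry compatible with
 * 'drive_in' is used as a fallback.
 */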
2287 int bdrv_get_translation_hint(BlockDriverState *bs)
2289 return bs->translation;
2292 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2293 BlockErrorAction on_write_error)
2295 bs->on_read_error = on_read_error;
2296 bs->on_write_error = on_write_error;
2299 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2301 return is_read ? bs->on_read_error : bs->on_write_error;
2304 int bdrv_is_read_only(BlockDriverState *bs)
2306 return bs->read_only;
2309 int bdrv_is_sg(BlockDriverState *bs)
2311 return bs->sg;
2314 int bdrv_enable_write_cache(BlockDriverState *bs)
2316 return bs->enable_write_cache;
2319 int bdrv_is_encrypted(BlockDriverState *bs)
2321 if (bs->backing_hd && bs->backing_hd->encrypted)
2322 return 1;
2323 return bs->encrypted;
2326 int bdrv_key_required(BlockDriverState *bs)
2328 BlockDriverState *backing_hd = bs->backing_hd;
2330 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2331 return 1;
2332 return (bs->encrypted && !bs->valid_key);
2335 int bdrv_set_key(BlockDriverState *bs, const char *key)
2337 int ret;
2338 if (bs->backing_hd && bs->backing_hd->encrypted) {
2339 ret = bdrv_set_key(bs->backing_hd, key);
2340 if (ret < 0)
2341 return ret;
2342 if (!bs->encrypted)
2343 return 0;
2345 if (!bs->encrypted) {
2346 return -EINVAL;
2347 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2348 return -ENOMEDIUM;
2350 ret = bs->drv->bdrv_set_key(bs, key);
2351 if (ret < 0) {
2352 bs->valid_key = 0;
2353 } else if (!bs->valid_key) {
2354 bs->valid_key = 1;
2355 /* call the change callback now, we skipped it on open */
2356 bdrv_dev_change_media_cb(bs, true);
2358 return ret;
2361 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2363 if (!bs->drv) {
2364 buf[0] = '\0';
2365 } else {
2366 pstrcpy(buf, buf_size, bs->drv->format_name);
2370 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2371 void *opaque)
2373 BlockDriver *drv;
2375 QLIST_FOREACH(drv, &bdrv_drivers, list) {
2376 it(opaque, drv->format_name);
2380 BlockDriverState *bdrv_find(const char *name)
2382 BlockDriverState *bs;
2384 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2385 if (!strcmp(name, bs->device_name)) {
2386 return bs;
2389 return NULL;
2392 BlockDriverState *bdrv_next(BlockDriverState *bs)
2394 if (!bs) {
2395 return QTAILQ_FIRST(&bdrv_states);
2397 return QTAILQ_NEXT(bs, list);
2400 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2402 BlockDriverState *bs;
2404 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2405 it(opaque, bs);
2409 const char *bdrv_get_device_name(BlockDriverState *bs)
2411 return bs->device_name;
2414 void bdrv_flush_all(void)
2416 BlockDriverState *bs;
2418 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2419 bdrv_flush(bs);
2423 int bdrv_has_zero_init(BlockDriverState *bs)
2425 assert(bs->drv);
2427 if (bs->drv->bdrv_has_zero_init) {
2428 return bs->drv->bdrv_has_zero_init(bs);
2431 return 1;
2434 typedef struct BdrvCoIsAllocatedData {
2435 BlockDriverState *bs;
2436 int64_t sector_num;
2437 int nb_sectors;
2438 int *pnum;
2439 int ret;
2440 bool done;
2441 } BdrvCoIsAllocatedData;
2444 * Returns true iff the specified sector is present in the disk image. Drivers
2445 * not implementing the functionality are assumed to not support backing files,
2446 * hence all their sectors are reported as allocated.
2448 * If 'sector_num' is beyond the end of the disk image the return value is 0
2449 * and 'pnum' is set to 0.
2451 * 'pnum' is set to the number of sectors (including and immediately following
2452 * the specified sector) that are known to be in the same
2453 * allocated/unallocated state.
2455 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2456 * beyond the end of the disk image it will be clamped.
2458 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2459 int nb_sectors, int *pnum)
2461 int64_t n;
2463 if (sector_num >= bs->total_sectors) {
2464 *pnum = 0;
2465 return 0;
2468 n = bs->total_sectors - sector_num;
2469 if (n < nb_sectors) {
2470 nb_sectors = n;
2473 if (!bs->drv->bdrv_co_is_allocated) {
2474 *pnum = nb_sectors;
2475 return 1;
2478 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2481 /* Coroutine wrapper for bdrv_is_allocated() */
2482 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2484 BdrvCoIsAllocatedData *data = opaque;
2485 BlockDriverState *bs = data->bs;
2487 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2488 data->pnum);
2489 data->done = true;
2493 * Synchronous wrapper around bdrv_co_is_allocated().
2495 * See bdrv_co_is_allocated() for details.
2497 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2498 int *pnum)
2500 Coroutine *co;
2501 BdrvCoIsAllocatedData data = {
2502 .bs = bs,
2503 .sector_num = sector_num,
2504 .nb_sectors = nb_sectors,
2505 .pnum = pnum,
2506 .done = false,
2509 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2510 qemu_coroutine_enter(co, &data);
2511 while (!data.done) {
2512 qemu_aio_wait();
2514 return data.ret;
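/*
 * Usage sketch (hypothetical): walking an image's allocation map with the
 * synchronous wrapper. 'pnum' always advances the loop while we stay
 * inside the image, since bdrv_co_is_allocated() clamps the request.
 *
 *     int64_t sector = 0;
 *     int64_t total = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
 *     while (sector < total) {
 *         int num;
 *         int allocated = bdrv_is_allocated(bs, sector, INT_MAX, &num);
 *         // handle [sector, sector + num) as allocated/unallocated
 *         sector += num;
 *     }
 */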
2517 BlockInfoList *qmp_query_block(Error **errp)
2519 BlockInfoList *head = NULL, *cur_item = NULL;
2520 BlockDriverState *bs;
2522 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2523 BlockInfoList *info = g_malloc0(sizeof(*info));
2525 info->value = g_malloc0(sizeof(*info->value));
2526 info->value->device = g_strdup(bs->device_name);
2527 info->value->type = g_strdup("unknown");
2528 info->value->locked = bdrv_dev_is_medium_locked(bs);
2529 info->value->removable = bdrv_dev_has_removable_media(bs);
2531 if (bdrv_dev_has_removable_media(bs)) {
2532 info->value->has_tray_open = true;
2533 info->value->tray_open = bdrv_dev_is_tray_open(bs);
2536 if (bdrv_iostatus_is_enabled(bs)) {
2537 info->value->has_io_status = true;
2538 info->value->io_status = bs->iostatus;
2541 if (bs->drv) {
2542 info->value->has_inserted = true;
2543 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2544 info->value->inserted->file = g_strdup(bs->filename);
2545 info->value->inserted->ro = bs->read_only;
2546 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2547 info->value->inserted->encrypted = bs->encrypted;
2548 if (bs->backing_file[0]) {
2549 info->value->inserted->has_backing_file = true;
2550 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2553 if (bs->io_limits_enabled) {
2554 info->value->inserted->bps =
2555 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2556 info->value->inserted->bps_rd =
2557 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2558 info->value->inserted->bps_wr =
2559 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2560 info->value->inserted->iops =
2561 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2562 info->value->inserted->iops_rd =
2563 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2564 info->value->inserted->iops_wr =
2565 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2569 /* XXX: waiting for the qapi to support GSList */
2570 if (!cur_item) {
2571 head = cur_item = info;
2572 } else {
2573 cur_item->next = info;
2574 cur_item = info;
2578 return head;
2581 /* Consider exposing this as a full-fledged QMP command */
2582 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2584 BlockStats *s;
2586 s = g_malloc0(sizeof(*s));
2588 if (bs->device_name[0]) {
2589 s->has_device = true;
2590 s->device = g_strdup(bs->device_name);
2593 s->stats = g_malloc0(sizeof(*s->stats));
2594 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2595 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2596 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2597 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2598 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2599 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2600 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2601 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2602 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2604 if (bs->file) {
2605 s->has_parent = true;
2606 s->parent = qmp_query_blockstat(bs->file, NULL);
2609 return s;
2612 BlockStatsList *qmp_query_blockstats(Error **errp)
2614 BlockStatsList *head = NULL, *cur_item = NULL;
2615 BlockDriverState *bs;
2617 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2618 BlockStatsList *info = g_malloc0(sizeof(*info));
2619 info->value = qmp_query_blockstat(bs, NULL);
2621 /* XXX: waiting for the qapi to support GSList */
2622 if (!cur_item) {
2623 head = cur_item = info;
2624 } else {
2625 cur_item->next = info;
2626 cur_item = info;
2630 return head;
2633 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2635 if (bs->backing_hd && bs->backing_hd->encrypted)
2636 return bs->backing_file;
2637 else if (bs->encrypted)
2638 return bs->filename;
2639 else
2640 return NULL;
2643 void bdrv_get_backing_filename(BlockDriverState *bs,
2644 char *filename, int filename_size)
2646 pstrcpy(filename, filename_size, bs->backing_file);
2649 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2650 const uint8_t *buf, int nb_sectors)
2652 BlockDriver *drv = bs->drv;
2653 if (!drv)
2654 return -ENOMEDIUM;
2655 if (!drv->bdrv_write_compressed)
2656 return -ENOTSUP;
2657 if (bdrv_check_request(bs, sector_num, nb_sectors))
2658 return -EIO;
2660 if (bs->dirty_bitmap) {
2661 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2664 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2667 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2669 BlockDriver *drv = bs->drv;
2670 if (!drv)
2671 return -ENOMEDIUM;
2672 if (!drv->bdrv_get_info)
2673 return -ENOTSUP;
2674 memset(bdi, 0, sizeof(*bdi));
2675 return drv->bdrv_get_info(bs, bdi);
2678 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2679 int64_t pos, int size)
2681 BlockDriver *drv = bs->drv;
2682 if (!drv)
2683 return -ENOMEDIUM;
2684 if (drv->bdrv_save_vmstate)
2685 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2686 if (bs->file)
2687 return bdrv_save_vmstate(bs->file, buf, pos, size);
2688 return -ENOTSUP;
2691 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2692 int64_t pos, int size)
2694 BlockDriver *drv = bs->drv;
2695 if (!drv)
2696 return -ENOMEDIUM;
2697 if (drv->bdrv_load_vmstate)
2698 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2699 if (bs->file)
2700 return bdrv_load_vmstate(bs->file, buf, pos, size);
2701 return -ENOTSUP;
2704 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2706 BlockDriver *drv = bs->drv;
2708 if (!drv || !drv->bdrv_debug_event) {
2709 return;
2712 return drv->bdrv_debug_event(bs, event);
2716 /**************************************************************/
2717 /* handling of snapshots */
2719 int bdrv_can_snapshot(BlockDriverState *bs)
2721 BlockDriver *drv = bs->drv;
2722 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2723 return 0;
2726 if (!drv->bdrv_snapshot_create) {
2727 if (bs->file != NULL) {
2728 return bdrv_can_snapshot(bs->file);
2730 return 0;
2733 return 1;
2736 int bdrv_is_snapshot(BlockDriverState *bs)
2738 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2741 BlockDriverState *bdrv_snapshots(void)
2743 BlockDriverState *bs;
2745 if (bs_snapshots) {
2746 return bs_snapshots;
2749 bs = NULL;
2750 while ((bs = bdrv_next(bs))) {
2751 if (bdrv_can_snapshot(bs)) {
2752 bs_snapshots = bs;
2753 return bs;
2756 return NULL;
2759 int bdrv_snapshot_create(BlockDriverState *bs,
2760 QEMUSnapshotInfo *sn_info)
2762 BlockDriver *drv = bs->drv;
2763 if (!drv)
2764 return -ENOMEDIUM;
2765 if (drv->bdrv_snapshot_create)
2766 return drv->bdrv_snapshot_create(bs, sn_info);
2767 if (bs->file)
2768 return bdrv_snapshot_create(bs->file, sn_info);
2769 return -ENOTSUP;
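/*
 * Design note on the fallback in bdrv_snapshot_goto() below: when the
 * format driver has no snapshot support of its own but the underlying
 * protocol (bs->file) does, the format layer is closed, the snapshot
 * switch happens on bs->file, and the format layer is reopened on the
 * new contents. If the reopen fails, the BDS is left without a driver.
 */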
2772 int bdrv_snapshot_goto(BlockDriverState *bs,
2773 const char *snapshot_id)
2775 BlockDriver *drv = bs->drv;
2776 int ret, open_ret;
2778 if (!drv)
2779 return -ENOMEDIUM;
2780 if (drv->bdrv_snapshot_goto)
2781 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2783 if (bs->file) {
2784 drv->bdrv_close(bs);
2785 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2786 open_ret = drv->bdrv_open(bs, bs->open_flags);
2787 if (open_ret < 0) {
2788 bdrv_delete(bs->file);
2789 bs->drv = NULL;
2790 return open_ret;
2792 return ret;
2795 return -ENOTSUP;
2798 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2800 BlockDriver *drv = bs->drv;
2801 if (!drv)
2802 return -ENOMEDIUM;
2803 if (drv->bdrv_snapshot_delete)
2804 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2805 if (bs->file)
2806 return bdrv_snapshot_delete(bs->file, snapshot_id);
2807 return -ENOTSUP;
2810 int bdrv_snapshot_list(BlockDriverState *bs,
2811 QEMUSnapshotInfo **psn_info)
2813 BlockDriver *drv = bs->drv;
2814 if (!drv)
2815 return -ENOMEDIUM;
2816 if (drv->bdrv_snapshot_list)
2817 return drv->bdrv_snapshot_list(bs, psn_info);
2818 if (bs->file)
2819 return bdrv_snapshot_list(bs->file, psn_info);
2820 return -ENOTSUP;
2823 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2824 const char *snapshot_name)
2826 BlockDriver *drv = bs->drv;
2827 if (!drv) {
2828 return -ENOMEDIUM;
2830 if (!bs->read_only) {
2831 return -EINVAL;
2833 if (drv->bdrv_snapshot_load_tmp) {
2834 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2836 return -ENOTSUP;
2839 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2840 const char *backing_file)
2842 if (!bs->drv) {
2843 return NULL;
2846 if (bs->backing_hd) {
2847 if (strcmp(bs->backing_file, backing_file) == 0) {
2848 return bs->backing_hd;
2849 } else {
2850 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2854 return NULL;
2857 #define NB_SUFFIXES 4
2859 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2861 static const char suffixes[NB_SUFFIXES] = "KMGT";
2862 int64_t base;
2863 int i;
2865 if (size <= 999) {
2866 snprintf(buf, buf_size, "%" PRId64, size);
2867 } else {
2868 base = 1024;
2869         for (i = 0; i < NB_SUFFIXES; i++) {
2870 if (size < (10 * base)) {
2871 snprintf(buf, buf_size, "%0.1f%c",
2872 (double)size / base,
2873 suffixes[i]);
2874 break;
2875 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
2876 snprintf(buf, buf_size, "%" PRId64 "%c",
2877 ((size + (base >> 1)) / base),
2878 suffixes[i]);
2879 break;
2881 base = base * 1024;
2884 return buf;
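/*
 * Examples: 999 -> "999"; 1536 -> "1.5K"; 500000 -> "488K" (rounded to
 * the nearest KiB); 1048576 -> "1.0M". Sizes below ten units keep one
 * decimal place, larger ones are rounded to whole units.
 */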
2887 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2889 char buf1[128], date_buf[128], clock_buf[128];
2890 #ifdef _WIN32
2891 struct tm *ptm;
2892 #else
2893 struct tm tm;
2894 #endif
2895 time_t ti;
2896 int64_t secs;
2898 if (!sn) {
2899 snprintf(buf, buf_size,
2900 "%-10s%-20s%7s%20s%15s",
2901 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2902 } else {
2903 ti = sn->date_sec;
2904 #ifdef _WIN32
2905 ptm = localtime(&ti);
2906 strftime(date_buf, sizeof(date_buf),
2907 "%Y-%m-%d %H:%M:%S", ptm);
2908 #else
2909 localtime_r(&ti, &tm);
2910 strftime(date_buf, sizeof(date_buf),
2911 "%Y-%m-%d %H:%M:%S", &tm);
2912 #endif
2913 secs = sn->vm_clock_nsec / 1000000000;
2914 snprintf(clock_buf, sizeof(clock_buf),
2915 "%02d:%02d:%02d.%03d",
2916 (int)(secs / 3600),
2917 (int)((secs / 60) % 60),
2918 (int)(secs % 60),
2919 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2920 snprintf(buf, buf_size,
2921 "%-10s%-20s%7s%20s%15s",
2922 sn->id_str, sn->name,
2923 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2924 date_buf,
2925 clock_buf);
2927 return buf;
2930 /**************************************************************/
2931 /* async I/Os */
2933 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2934 QEMUIOVector *qiov, int nb_sectors,
2935 BlockDriverCompletionFunc *cb, void *opaque)
2937 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2939 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2940 cb, opaque, false);
2943 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2944 QEMUIOVector *qiov, int nb_sectors,
2945 BlockDriverCompletionFunc *cb, void *opaque)
2947 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2949 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2950 cb, opaque, true);
2954 typedef struct MultiwriteCB {
2955 int error;
2956 int num_requests;
2957 int num_callbacks;
2958 struct {
2959 BlockDriverCompletionFunc *cb;
2960 void *opaque;
2961 QEMUIOVector *free_qiov;
2962 } callbacks[];
2963 } MultiwriteCB;
2965 static void multiwrite_user_cb(MultiwriteCB *mcb)
2967 int i;
2969 for (i = 0; i < mcb->num_callbacks; i++) {
2970 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2971 if (mcb->callbacks[i].free_qiov) {
2972 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2974 g_free(mcb->callbacks[i].free_qiov);
2978 static void multiwrite_cb(void *opaque, int ret)
2980 MultiwriteCB *mcb = opaque;
2982 trace_multiwrite_cb(mcb, ret);
2984 if (ret < 0 && !mcb->error) {
2985 mcb->error = ret;
2988 mcb->num_requests--;
2989 if (mcb->num_requests == 0) {
2990 multiwrite_user_cb(mcb);
2991 g_free(mcb);
2995 static int multiwrite_req_compare(const void *a, const void *b)
2997 const BlockRequest *req1 = a, *req2 = b;
3000 * Note that we can't simply subtract req2->sector from req1->sector
3001 * here as that could overflow the return value.
3003 if (req1->sector > req2->sector) {
3004 return 1;
3005 } else if (req1->sector < req2->sector) {
3006 return -1;
3007 } else {
3008 return 0;
3013 * Takes a bunch of requests and tries to merge them. Returns the number of
3014 * requests that remain after merging.
3016 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3017 int num_reqs, MultiwriteCB *mcb)
3019 int i, outidx;
3021 // Sort requests by start sector
3022 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3024     // Check if adjacent requests are exactly sequential or overlapping.
3025     // If so, combine them into a single request.
3026 outidx = 0;
3027 for (i = 1; i < num_reqs; i++) {
3028 int merge = 0;
3029 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3031 // Handle exactly sequential writes and overlapping writes.
3032 if (reqs[i].sector <= oldreq_last) {
3033 merge = 1;
3036 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3037 merge = 0;
3040 if (merge) {
3041 size_t size;
3042 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3043 qemu_iovec_init(qiov,
3044 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3046 // Add the first request to the merged one. If the requests are
3047 // overlapping, drop the last sectors of the first request.
3048 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3049 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3051             // We should not need to add any zeros between the two requests
3052             assert(reqs[i].sector <= oldreq_last);
3054 // Add the second request
3055 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3057 reqs[outidx].nb_sectors = qiov->size >> 9;
3058 reqs[outidx].qiov = qiov;
3060 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3061 } else {
3062 outidx++;
3063 reqs[outidx].sector = reqs[i].sector;
3064 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3065 reqs[outidx].qiov = reqs[i].qiov;
3069 return outidx + 1;
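/*
 * Worked example: two 8-sector writes at sectors 0 and 8 are exactly
 * sequential (oldreq_last == 8 == reqs[1].sector), so they merge into one
 * 16-sector write whose qiov chains both original vectors. A second write
 * starting at sector 6 would also merge; only the first 6 sectors of the
 * earlier request are kept and the new request's data follows.
 */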
3073 * Submit multiple AIO write requests at once.
3075 * On success, the function returns 0 and all requests in the reqs array have
3076  * been submitted. On error, this function returns -1, and the individual
3077  * requests may or may not have been submitted. In particular, this means the
3078 * callback will be called for some of the requests, for others it won't. The
3079 * caller must check the error field of the BlockRequest to wait for the right
3080 * callbacks (if error != 0, no callback will be called).
3082 * The implementation may modify the contents of the reqs array, e.g. to merge
3083 * requests. However, the fields opaque and error are left unmodified as they
3084 * are used to signal failure for a single request to the caller.
3086 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3088 MultiwriteCB *mcb;
3089 int i;
3091 /* don't submit writes if we don't have a medium */
3092 if (bs->drv == NULL) {
3093 for (i = 0; i < num_reqs; i++) {
3094 reqs[i].error = -ENOMEDIUM;
3096 return -1;
3099 if (num_reqs == 0) {
3100 return 0;
3103 // Create MultiwriteCB structure
3104 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3105 mcb->num_requests = 0;
3106 mcb->num_callbacks = num_reqs;
3108 for (i = 0; i < num_reqs; i++) {
3109 mcb->callbacks[i].cb = reqs[i].cb;
3110 mcb->callbacks[i].opaque = reqs[i].opaque;
3113     // Check for mergeable requests
3114 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3116 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3118 /* Run the aio requests. */
3119 mcb->num_requests = num_reqs;
3120 for (i = 0; i < num_reqs; i++) {
3121 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3122 reqs[i].nb_sectors, multiwrite_cb, mcb);
3125 return 0;
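/*
 * Usage sketch (hypothetical caller; field names as used above):
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = request_cb, .opaque = req0 },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = request_cb, .opaque = req1 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // requests with reqs[i].error != 0 will never see a callback
 *     }
 */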
3128 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3130 acb->pool->cancel(acb);
3133 /* block I/O throttling */
3134 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3135 bool is_write, double elapsed_time, uint64_t *wait)
3137 uint64_t bps_limit = 0;
3138 double bytes_limit, bytes_base, bytes_res;
3139 double slice_time, wait_time;
3141 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3142 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3143 } else if (bs->io_limits.bps[is_write]) {
3144 bps_limit = bs->io_limits.bps[is_write];
3145 } else {
3146 if (wait) {
3147 *wait = 0;
3150 return false;
3153 slice_time = bs->slice_end - bs->slice_start;
3154 slice_time /= (NANOSECONDS_PER_SECOND);
3155 bytes_limit = bps_limit * slice_time;
3156 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3157 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3158 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3161     /* bytes_base: the number of bytes already read/written in this slice,
3162      * obtained from the accounting statistics.
3163      * bytes_res: the remaining bytes of data which need to be read/written.
3164      * (bytes_base + bytes_res) / bps_limit: the total time needed to
3165      * complete reading/writing all of the data.
3167 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3169 if (bytes_base + bytes_res <= bytes_limit) {
3170 if (wait) {
3171 *wait = 0;
3174 return false;
3177 /* Calc approx time to dispatch */
3178 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3180 /* When the I/O rate at runtime exceeds the limits,
3181      * bs->slice_end needs to be extended so that the current statistics
3182      * are kept until the timer fires; the extension factor below was
3183      * tuned experimentally.
3185 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3186 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3187 if (wait) {
3188 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3191 return true;
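/*
 * Worked example (assuming bps_limit = 1 MB/s and a 0.5 s slice, giving
 * bytes_limit = 512 KB): if 480 KB are already accounted in this slice
 * and a 64 KB request arrives, bytes_base + bytes_res = 544 KB exceeds
 * the limit, so the request waits until roughly 544 KB / (1 MB/s), about
 * 0.53 s into the slice, before it may be dispatched.
 */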
3194 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3195 double elapsed_time, uint64_t *wait)
3197 uint64_t iops_limit = 0;
3198 double ios_limit, ios_base;
3199 double slice_time, wait_time;
3201 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3202 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3203 } else if (bs->io_limits.iops[is_write]) {
3204 iops_limit = bs->io_limits.iops[is_write];
3205 } else {
3206 if (wait) {
3207 *wait = 0;
3210 return false;
3213 slice_time = bs->slice_end - bs->slice_start;
3214 slice_time /= (NANOSECONDS_PER_SECOND);
3215 ios_limit = iops_limit * slice_time;
3216 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3217 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3218 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3221 if (ios_base + 1 <= ios_limit) {
3222 if (wait) {
3223 *wait = 0;
3226 return false;
3229 /* Calc approx time to dispatch */
3230 wait_time = (ios_base + 1) / iops_limit;
3231 if (wait_time > elapsed_time) {
3232 wait_time = wait_time - elapsed_time;
3233 } else {
3234 wait_time = 0;
3237 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3238 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3239 if (wait) {
3240 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3243 return true;
3246 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3247 bool is_write, int64_t *wait)
3249 int64_t now, max_wait;
3250 uint64_t bps_wait = 0, iops_wait = 0;
3251 double elapsed_time;
3252 int bps_ret, iops_ret;
3254 now = qemu_get_clock_ns(vm_clock);
3255 if ((bs->slice_start < now)
3256 && (bs->slice_end > now)) {
3257 bs->slice_end = now + bs->slice_time;
3258 } else {
3259 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3260 bs->slice_start = now;
3261 bs->slice_end = now + bs->slice_time;
3263 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3264 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3266 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3267 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3270 elapsed_time = now - bs->slice_start;
3271 elapsed_time /= (NANOSECONDS_PER_SECOND);
3273 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3274 is_write, elapsed_time, &bps_wait);
3275 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3276 elapsed_time, &iops_wait);
3277 if (bps_ret || iops_ret) {
3278 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3279 if (wait) {
3280 *wait = max_wait;
3283 now = qemu_get_clock_ns(vm_clock);
3284 if (bs->slice_end < now + max_wait) {
3285 bs->slice_end = now + max_wait;
3288 return true;
3291 if (wait) {
3292 *wait = 0;
3295 return false;
3298 /**************************************************************/
3299 /* async block device emulation */
3301 typedef struct BlockDriverAIOCBSync {
3302 BlockDriverAIOCB common;
3303 QEMUBH *bh;
3304 int ret;
3305 /* vector translation state */
3306 QEMUIOVector *qiov;
3307 uint8_t *bounce;
3308 int is_write;
3309 } BlockDriverAIOCBSync;
3311 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3313 BlockDriverAIOCBSync *acb =
3314 container_of(blockacb, BlockDriverAIOCBSync, common);
3315 qemu_bh_delete(acb->bh);
3316 acb->bh = NULL;
3317 qemu_aio_release(acb);
3320 static AIOPool bdrv_em_aio_pool = {
3321 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3322 .cancel = bdrv_aio_cancel_em,
3325 static void bdrv_aio_bh_cb(void *opaque)
3327 BlockDriverAIOCBSync *acb = opaque;
3329 if (!acb->is_write)
3330 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3331 qemu_vfree(acb->bounce);
3332 acb->common.cb(acb->common.opaque, acb->ret);
3333 qemu_bh_delete(acb->bh);
3334 acb->bh = NULL;
3335 qemu_aio_release(acb);
3338 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3339 int64_t sector_num,
3340 QEMUIOVector *qiov,
3341 int nb_sectors,
3342 BlockDriverCompletionFunc *cb,
3343 void *opaque,
3344 int is_write)
3347 BlockDriverAIOCBSync *acb;
3349 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3350 acb->is_write = is_write;
3351 acb->qiov = qiov;
3352 acb->bounce = qemu_blockalign(bs, qiov->size);
3353 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3355 if (is_write) {
3356 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3357 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3358 } else {
3359 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3362 qemu_bh_schedule(acb->bh);
3364 return &acb->common;
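/*
 * Design note: this emulates AIO on top of a driver's synchronous
 * bdrv_read/bdrv_write. The I/O itself happens immediately through a
 * linear bounce buffer; completion is deferred to a bottom half so the
 * callback never fires before the caller has received the ACB.
 */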
3367 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3368 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3369 BlockDriverCompletionFunc *cb, void *opaque)
3371 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3374 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3375 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3376 BlockDriverCompletionFunc *cb, void *opaque)
3378 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3382 typedef struct BlockDriverAIOCBCoroutine {
3383 BlockDriverAIOCB common;
3384 BlockRequest req;
3385 bool is_write;
3386 QEMUBH* bh;
3387 } BlockDriverAIOCBCoroutine;
3389 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3391 qemu_aio_flush();
3394 static AIOPool bdrv_em_co_aio_pool = {
3395 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3396 .cancel = bdrv_aio_co_cancel_em,
3399 static void bdrv_co_em_bh(void *opaque)
3401 BlockDriverAIOCBCoroutine *acb = opaque;
3403 acb->common.cb(acb->common.opaque, acb->req.error);
3404 qemu_bh_delete(acb->bh);
3405 qemu_aio_release(acb);
3408 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3409 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3411 BlockDriverAIOCBCoroutine *acb = opaque;
3412 BlockDriverState *bs = acb->common.bs;
3414 if (!acb->is_write) {
3415 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3416 acb->req.nb_sectors, acb->req.qiov, 0);
3417 } else {
3418 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3419 acb->req.nb_sectors, acb->req.qiov, 0);
3422 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3423 qemu_bh_schedule(acb->bh);
3426 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3427 int64_t sector_num,
3428 QEMUIOVector *qiov,
3429 int nb_sectors,
3430 BlockDriverCompletionFunc *cb,
3431 void *opaque,
3432 bool is_write)
3434 Coroutine *co;
3435 BlockDriverAIOCBCoroutine *acb;
3437 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3438 acb->req.sector = sector_num;
3439 acb->req.nb_sectors = nb_sectors;
3440 acb->req.qiov = qiov;
3441 acb->is_write = is_write;
3443 co = qemu_coroutine_create(bdrv_co_do_rw);
3444 qemu_coroutine_enter(co, acb);
3446 return &acb->common;
3449 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3451 BlockDriverAIOCBCoroutine *acb = opaque;
3452 BlockDriverState *bs = acb->common.bs;
3454 acb->req.error = bdrv_co_flush(bs);
3455 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3456 qemu_bh_schedule(acb->bh);
3459 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3460 BlockDriverCompletionFunc *cb, void *opaque)
3462 trace_bdrv_aio_flush(bs, opaque);
3464 Coroutine *co;
3465 BlockDriverAIOCBCoroutine *acb;
3467 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3468 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3469 qemu_coroutine_enter(co, acb);
3471 return &acb->common;
3474 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3476 BlockDriverAIOCBCoroutine *acb = opaque;
3477 BlockDriverState *bs = acb->common.bs;
3479 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3480 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3481 qemu_bh_schedule(acb->bh);
3484 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3485 int64_t sector_num, int nb_sectors,
3486 BlockDriverCompletionFunc *cb, void *opaque)
3488 Coroutine *co;
3489 BlockDriverAIOCBCoroutine *acb;
3491 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3493 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3494 acb->req.sector = sector_num;
3495 acb->req.nb_sectors = nb_sectors;
3496 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3497 qemu_coroutine_enter(co, acb);
3499 return &acb->common;
3502 void bdrv_init(void)
3504 module_call_init(MODULE_INIT_BLOCK);
3507 void bdrv_init_with_whitelist(void)
3509 use_bdrv_whitelist = 1;
3510 bdrv_init();
3513 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3514 BlockDriverCompletionFunc *cb, void *opaque)
3516 BlockDriverAIOCB *acb;
3518 if (pool->free_aiocb) {
3519 acb = pool->free_aiocb;
3520 pool->free_aiocb = acb->next;
3521 } else {
3522 acb = g_malloc0(pool->aiocb_size);
3523 acb->pool = pool;
3525 acb->bs = bs;
3526 acb->cb = cb;
3527 acb->opaque = opaque;
3528 return acb;
3531 void qemu_aio_release(void *p)
3533 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3534 AIOPool *pool = acb->pool;
3535 acb->next = pool->free_aiocb;
3536 pool->free_aiocb = acb;
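/*
 * Design note: completed ACBs are recycled on a per-pool singly linked
 * free list instead of being freed, avoiding a malloc/free pair on every
 * request in the I/O fast path. Each ACB must be released exactly once.
 */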
3539 /**************************************************************/
3540 /* Coroutine block device emulation */
3542 typedef struct CoroutineIOCompletion {
3543 Coroutine *coroutine;
3544 int ret;
3545 } CoroutineIOCompletion;
3547 static void bdrv_co_io_em_complete(void *opaque, int ret)
3549 CoroutineIOCompletion *co = opaque;
3551 co->ret = ret;
3552 qemu_coroutine_enter(co->coroutine, NULL);
3555 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3556 int nb_sectors, QEMUIOVector *iov,
3557 bool is_write)
3559 CoroutineIOCompletion co = {
3560 .coroutine = qemu_coroutine_self(),
3562 BlockDriverAIOCB *acb;
3564 if (is_write) {
3565 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3566 bdrv_co_io_em_complete, &co);
3567 } else {
3568 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3569 bdrv_co_io_em_complete, &co);
3572 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3573 if (!acb) {
3574 return -EIO;
3576 qemu_coroutine_yield();
3578 return co.ret;
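/*
 * This is the generic coroutine-to-AIO bridge: the coroutine issues the
 * driver's AIO request, yields, and is re-entered by
 * bdrv_co_io_em_complete() once the completion callback fires, with the
 * result stored in co.ret.
 */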
3581 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3582 int64_t sector_num, int nb_sectors,
3583 QEMUIOVector *iov)
3585 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3588 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3589 int64_t sector_num, int nb_sectors,
3590 QEMUIOVector *iov)
3592 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3595 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3597 RwCo *rwco = opaque;
3599 rwco->ret = bdrv_co_flush(rwco->bs);
3602 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3604 int ret;
3606 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3607 return 0;
3610 /* Write back cached data to the OS even with cache=unsafe */
3611 if (bs->drv->bdrv_co_flush_to_os) {
3612 ret = bs->drv->bdrv_co_flush_to_os(bs);
3613 if (ret < 0) {
3614 return ret;
3618 /* But don't actually force it to the disk with cache=unsafe */
3619 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3620 return 0;
3623 if (bs->drv->bdrv_co_flush_to_disk) {
3624 ret = bs->drv->bdrv_co_flush_to_disk(bs);
3625 } else if (bs->drv->bdrv_aio_flush) {
3626 BlockDriverAIOCB *acb;
3627 CoroutineIOCompletion co = {
3628 .coroutine = qemu_coroutine_self(),
3631 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3632 if (acb == NULL) {
3633 ret = -EIO;
3634 } else {
3635 qemu_coroutine_yield();
3636 ret = co.ret;
3638 } else {
3640 * Some block drivers always operate in either writethrough or unsafe
3641          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3642 * know how the server works (because the behaviour is hardcoded or
3643 * depends on server-side configuration), so we can't ensure that
3644 * everything is safe on disk. Returning an error doesn't work because
3645 * that would break guests even if the server operates in writethrough
3646 * mode.
3648 * Let's hope the user knows what he's doing.
3650 ret = 0;
3652 if (ret < 0) {
3653 return ret;
3656 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3657 * in the case of cache=unsafe, so there are no useless flushes.
3659 return bdrv_co_flush(bs->file);
3662 void bdrv_invalidate_cache(BlockDriverState *bs)
3664 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3665 bs->drv->bdrv_invalidate_cache(bs);
3669 void bdrv_invalidate_cache_all(void)
3671 BlockDriverState *bs;
3673 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3674 bdrv_invalidate_cache(bs);
3678 void bdrv_clear_incoming_migration_all(void)
3680 BlockDriverState *bs;
3682 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3683 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3687 int bdrv_flush(BlockDriverState *bs)
3689 Coroutine *co;
3690 RwCo rwco = {
3691 .bs = bs,
3692 .ret = NOT_DONE,
3695 if (qemu_in_coroutine()) {
3696 /* Fast-path if already in coroutine context */
3697 bdrv_flush_co_entry(&rwco);
3698 } else {
3699 co = qemu_coroutine_create(bdrv_flush_co_entry);
3700 qemu_coroutine_enter(co, &rwco);
3701 while (rwco.ret == NOT_DONE) {
3702 qemu_aio_wait();
3706 return rwco.ret;
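/*
 * The synchronous wrappers (bdrv_flush() above, bdrv_discard() below)
 * share a pattern: if already running in a coroutine, call the coroutine
 * entry directly; otherwise spawn a coroutine and pump the AIO event
 * loop with qemu_aio_wait() until the request signals completion.
 */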
3709 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3711 RwCo *rwco = opaque;
3713 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3716 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3717 int nb_sectors)
3719 if (!bs->drv) {
3720 return -ENOMEDIUM;
3721 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3722 return -EIO;
3723 } else if (bs->read_only) {
3724 return -EROFS;
3725 } else if (bs->drv->bdrv_co_discard) {
3726 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3727 } else if (bs->drv->bdrv_aio_discard) {
3728 BlockDriverAIOCB *acb;
3729 CoroutineIOCompletion co = {
3730 .coroutine = qemu_coroutine_self(),
3733 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3734 bdrv_co_io_em_complete, &co);
3735 if (acb == NULL) {
3736 return -EIO;
3737 } else {
3738 qemu_coroutine_yield();
3739 return co.ret;
3741 } else {
3742 return 0;
3746 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3748 Coroutine *co;
3749 RwCo rwco = {
3750 .bs = bs,
3751 .sector_num = sector_num,
3752 .nb_sectors = nb_sectors,
3753 .ret = NOT_DONE,
3756 if (qemu_in_coroutine()) {
3757 /* Fast-path if already in coroutine context */
3758 bdrv_discard_co_entry(&rwco);
3759 } else {
3760 co = qemu_coroutine_create(bdrv_discard_co_entry);
3761 qemu_coroutine_enter(co, &rwco);
3762 while (rwco.ret == NOT_DONE) {
3763 qemu_aio_wait();
3767 return rwco.ret;
3770 /**************************************************************/
3771 /* removable device support */
3774 * Return TRUE if the media is present
3776 int bdrv_is_inserted(BlockDriverState *bs)
3778 BlockDriver *drv = bs->drv;
3780 if (!drv)
3781 return 0;
3782 if (!drv->bdrv_is_inserted)
3783 return 1;
3784 return drv->bdrv_is_inserted(bs);
3788 * Return whether the media changed since the last call to this
3789 * function, or -ENOTSUP if we don't know. Most drivers don't know.
3791 int bdrv_media_changed(BlockDriverState *bs)
3793 BlockDriver *drv = bs->drv;
3795 if (drv && drv->bdrv_media_changed) {
3796 return drv->bdrv_media_changed(bs);
3798 return -ENOTSUP;
3802 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3804 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3806 BlockDriver *drv = bs->drv;
3808 if (drv && drv->bdrv_eject) {
3809 drv->bdrv_eject(bs, eject_flag);
3812 if (bs->device_name[0] != '\0') {
3813 bdrv_emit_qmp_eject_event(bs, eject_flag);
3818 * Lock or unlock the media (if it is locked, the user won't be able
3819 * to eject it manually).
3821 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3823 BlockDriver *drv = bs->drv;
3825 trace_bdrv_lock_medium(bs, locked);
3827 if (drv && drv->bdrv_lock_medium) {
3828 drv->bdrv_lock_medium(bs, locked);
3832 /* needed for generic scsi interface */
3834 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3836 BlockDriver *drv = bs->drv;
3838 if (drv && drv->bdrv_ioctl)
3839 return drv->bdrv_ioctl(bs, req, buf);
3840 return -ENOTSUP;
3843 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3844 unsigned long int req, void *buf,
3845 BlockDriverCompletionFunc *cb, void *opaque)
3847 BlockDriver *drv = bs->drv;
3849 if (drv && drv->bdrv_aio_ioctl)
3850 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3851 return NULL;
3854 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3856 bs->buffer_alignment = align;
3859 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3861 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3864 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3866 int64_t bitmap_size;
3868 bs->dirty_count = 0;
3869 if (enable) {
3870 if (!bs->dirty_bitmap) {
3871 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3872 BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3873 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3875 bs->dirty_bitmap = g_malloc0(bitmap_size);
3877 } else {
3878 if (bs->dirty_bitmap) {
3879 g_free(bs->dirty_bitmap);
3880 bs->dirty_bitmap = NULL;
3885 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3887 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3889 if (bs->dirty_bitmap &&
3890 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3891 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3892 (1UL << (chunk % (sizeof(unsigned long) * 8))));
3893 } else {
3894 return 0;
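/*
 * Worked example (assuming BDRV_SECTORS_PER_DIRTY_CHUNK == 2048, i.e.
 * 1 MB chunks with 512-byte sectors, and 64-bit longs): sector 5000 lies
 * in chunk 2, tracked by bit (2 % 64) of word (2 / 64) of the bitmap.
 */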
3898 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3899 int nr_sectors)
3901 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3904 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3906 return bs->dirty_count;
3909 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3911 assert(bs->in_use != in_use);
3912 bs->in_use = in_use;
3915 int bdrv_in_use(BlockDriverState *bs)
3917 return bs->in_use;
3920 void bdrv_iostatus_enable(BlockDriverState *bs)
3922 bs->iostatus_enabled = true;
3923 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3926 /* The I/O status is only enabled if the drive explicitly
3927 * enables it _and_ the VM is configured to stop on errors */
3928 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3930 return (bs->iostatus_enabled &&
3931 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3932 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
3933 bs->on_read_error == BLOCK_ERR_STOP_ANY));
3936 void bdrv_iostatus_disable(BlockDriverState *bs)
3938 bs->iostatus_enabled = false;
3941 void bdrv_iostatus_reset(BlockDriverState *bs)
3943 if (bdrv_iostatus_is_enabled(bs)) {
3944 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3948 /* XXX: Today this is set by device models because it makes the implementation
3949 quite simple. However, the block layer knows about the error, so it's
3950 possible to implement this without device models being involved */
3951 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3953 if (bdrv_iostatus_is_enabled(bs) &&
3954 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3955 assert(error >= 0);
3956 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3957 BLOCK_DEVICE_IO_STATUS_FAILED;
3961 void
3962 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3963 enum BlockAcctType type)
3965 assert(type < BDRV_MAX_IOTYPE);
3967 cookie->bytes = bytes;
3968 cookie->start_time_ns = get_clock();
3969 cookie->type = type;
3972 void
3973 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3975 assert(cookie->type < BDRV_MAX_IOTYPE);
3977 bs->nr_bytes[cookie->type] += cookie->bytes;
3978 bs->nr_ops[cookie->type]++;
3979 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
3982 int bdrv_img_create(const char *filename, const char *fmt,
3983 const char *base_filename, const char *base_fmt,
3984 char *options, uint64_t img_size, int flags)
3986 QEMUOptionParameter *param = NULL, *create_options = NULL;
3987 QEMUOptionParameter *backing_fmt, *backing_file, *size;
3988 BlockDriverState *bs = NULL;
3989 BlockDriver *drv, *proto_drv;
3990 BlockDriver *backing_drv = NULL;
3991 int ret = 0;
3993 /* Find driver and parse its options */
3994 drv = bdrv_find_format(fmt);
3995 if (!drv) {
3996 error_report("Unknown file format '%s'", fmt);
3997 ret = -EINVAL;
3998 goto out;
4001 proto_drv = bdrv_find_protocol(filename);
4002 if (!proto_drv) {
4003 error_report("Unknown protocol '%s'", filename);
4004 ret = -EINVAL;
4005 goto out;
4008 create_options = append_option_parameters(create_options,
4009 drv->create_options);
4010 create_options = append_option_parameters(create_options,
4011 proto_drv->create_options);
4013 /* Create parameter list with default values */
4014 param = parse_option_parameters("", create_options, param);
4016 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4018 /* Parse -o options */
4019 if (options) {
4020 param = parse_option_parameters(options, create_options, param);
4021 if (param == NULL) {
4022 error_report("Invalid options for file format '%s'.", fmt);
4023 ret = -EINVAL;
4024 goto out;
4028 if (base_filename) {
4029 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4030 base_filename)) {
4031 error_report("Backing file not supported for file format '%s'",
4032 fmt);
4033 ret = -EINVAL;
4034 goto out;
4038 if (base_fmt) {
4039 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4040 error_report("Backing file format not supported for file "
4041 "format '%s'", fmt);
4042 ret = -EINVAL;
4043 goto out;
4047 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4048 if (backing_file && backing_file->value.s) {
4049 if (!strcmp(filename, backing_file->value.s)) {
4050 error_report("Error: Trying to create an image with the "
4051 "same filename as the backing file");
4052 ret = -EINVAL;
4053 goto out;
4057 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4058 if (backing_fmt && backing_fmt->value.s) {
4059 backing_drv = bdrv_find_format(backing_fmt->value.s);
4060 if (!backing_drv) {
4061 error_report("Unknown backing file format '%s'",
4062 backing_fmt->value.s);
4063 ret = -EINVAL;
4064 goto out;
4068 // The size for the image must always be specified, with one exception:
4069 // If we are using a backing file, we can obtain the size from there
4070 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4071 if (size && size->value.n == -1) {
4072 if (backing_file && backing_file->value.s) {
4073 uint64_t size;
4074 char buf[32];
4076 bs = bdrv_new("");
4078 ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
4079 if (ret < 0) {
4080 error_report("Could not open '%s'", backing_file->value.s);
4081 goto out;
4083 bdrv_get_geometry(bs, &size);
4084 size *= 512;
4086 snprintf(buf, sizeof(buf), "%" PRId64, size);
4087 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4088 } else {
4089 error_report("Image creation needs a size parameter");
4090 ret = -EINVAL;
4091 goto out;
4095 printf("Formatting '%s', fmt=%s ", filename, fmt);
4096 print_option_parameters(param);
4097 puts("");
4099 ret = bdrv_create(drv, filename, param);
4101 if (ret < 0) {
4102 if (ret == -ENOTSUP) {
4103 error_report("Formatting or formatting option not supported for "
4104 "file format '%s'", fmt);
4105 } else if (ret == -EFBIG) {
4106 error_report("The image size is too large for file format '%s'",
4107 fmt);
4108 } else {
4109 error_report("%s: error while creating %s: %s", filename, fmt,
4110 strerror(-ret));
4114 out:
4115 free_option_parameters(create_options);
4116 free_option_parameters(param);
4118 if (bs) {
4119 bdrv_delete(bs);
4122 return ret;
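/*
 * Usage sketch (hypothetical, mirroring what "qemu-img create" ends up
 * doing for a 10 GiB qcow2 image with no backing file or extra options):
 *
 *     uint64_t size = 10ULL * 1024 * 1024 * 1024;
 *     int ret = bdrv_img_create("disk.qcow2", "qcow2",
 *                               NULL, NULL, NULL, size, 0);
 */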
4125 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4126 int64_t speed, BlockDriverCompletionFunc *cb,
4127 void *opaque, Error **errp)
4129 BlockJob *job;
4131 if (bs->job || bdrv_in_use(bs)) {
4132 error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
4133 return NULL;
4135 bdrv_set_in_use(bs, 1);
4137 job = g_malloc0(job_type->instance_size);
4138 job->job_type = job_type;
4139 job->bs = bs;
4140 job->cb = cb;
4141 job->opaque = opaque;
4142 bs->job = job;
4144 /* Only set speed when necessary to avoid NotSupported error */
4145 if (speed != 0) {
4146 Error *local_err = NULL;
4148 block_job_set_speed(job, speed, &local_err);
4149 if (error_is_set(&local_err)) {
4150 bs->job = NULL;
4151 g_free(job);
4152 bdrv_set_in_use(bs, 0);
4153 error_propagate(errp, local_err);
4154 return NULL;
4157 return job;
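/*
 * Usage sketch (hypothetical job type; the names below are illustrative):
 * a concrete job embeds BlockJob as its first member and passes its type
 * descriptor here:
 *
 *     typedef struct MyCopyJob {
 *         BlockJob common;
 *         // job-specific state...
 *     } MyCopyJob;
 *
 *     MyCopyJob *s = block_job_create(&my_copy_job_type, bs, speed,
 *                                     completion_cb, opaque, errp);
 *     // NULL is returned if the device already has a job or is in use
 */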
4160 void block_job_complete(BlockJob *job, int ret)
4162 BlockDriverState *bs = job->bs;
4164 assert(bs->job == job);
4165 job->cb(job->opaque, ret);
4166 bs->job = NULL;
4167 g_free(job);
4168 bdrv_set_in_use(bs, 0);
4171 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
4173 Error *local_err = NULL;
4175 if (!job->job_type->set_speed) {
4176 error_set(errp, QERR_NOT_SUPPORTED);
4177 return;
4179 job->job_type->set_speed(job, speed, &local_err);
4180 if (error_is_set(&local_err)) {
4181 error_propagate(errp, local_err);
4182 return;
4185 job->speed = speed;
4188 void block_job_cancel(BlockJob *job)
4190 job->cancelled = true;
4193 bool block_job_is_cancelled(BlockJob *job)
4195 return job->cancelled;
4198 void block_job_cancel_sync(BlockJob *job)
4200 BlockDriverState *bs = job->bs;
4202 assert(bs->job == job);
4203 block_job_cancel(job);
4204 while (bs->job != NULL && bs->job->busy) {
4205 qemu_aio_wait();