block: copy enable_write_cache in bdrv_append
[qemu-kvm.git] / block.c
1 /*
2 * QEMU System Emulator block driver
4 * Copyright (c) 2003 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor.h"
28 #include "block_int.h"
29 #include "module.h"
30 #include "qjson.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 #include "qemu-timer.h"
35 #ifdef CONFIG_BSD
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/ioctl.h>
39 #include <sys/queue.h>
40 #ifndef __DragonFly__
41 #include <sys/disk.h>
42 #endif
43 #endif
45 #ifdef _WIN32
46 #include <windows.h>
47 #endif
49 #define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */
51 typedef enum {
52 BDRV_REQ_COPY_ON_READ = 0x1,
53 BDRV_REQ_ZERO_WRITE = 0x2,
54 } BdrvRequestFlags;
56 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
57 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
59 BlockDriverCompletionFunc *cb, void *opaque);
60 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62 BlockDriverCompletionFunc *cb, void *opaque);
63 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64 int64_t sector_num, int nb_sectors,
65 QEMUIOVector *iov);
66 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors,
68 QEMUIOVector *iov);
69 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
70 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71 BdrvRequestFlags flags);
72 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
73 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74 BdrvRequestFlags flags);
75 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
76 int64_t sector_num,
77 QEMUIOVector *qiov,
78 int nb_sectors,
79 BlockDriverCompletionFunc *cb,
80 void *opaque,
81 bool is_write);
82 static void coroutine_fn bdrv_co_do_rw(void *opaque);
83 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
84 int64_t sector_num, int nb_sectors);
86 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
87 bool is_write, double elapsed_time, uint64_t *wait);
88 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
89 double elapsed_time, uint64_t *wait);
90 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
91 bool is_write, int64_t *wait);
93 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
94 QTAILQ_HEAD_INITIALIZER(bdrv_states);
96 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
97 QLIST_HEAD_INITIALIZER(bdrv_drivers);
99 /* The device to use for VM snapshots */
100 static BlockDriverState *bs_snapshots;
102 /* If non-zero, use only whitelisted block drivers */
103 static int use_bdrv_whitelist;
105 #ifdef _WIN32
106 static int is_windows_drive_prefix(const char *filename)
108 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
109 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
110 filename[1] == ':');
113 int is_windows_drive(const char *filename)
115 if (is_windows_drive_prefix(filename) &&
116 filename[2] == '\0')
117 return 1;
118 if (strstart(filename, "\\\\.\\", NULL) ||
119 strstart(filename, "//./", NULL))
120 return 1;
121 return 0;
123 #endif
125 /* throttling disk I/O limits */
126 void bdrv_io_limits_disable(BlockDriverState *bs)
128 bs->io_limits_enabled = false;
130 while (qemu_co_queue_next(&bs->throttled_reqs));
132 if (bs->block_timer) {
133 qemu_del_timer(bs->block_timer);
134 qemu_free_timer(bs->block_timer);
135 bs->block_timer = NULL;
138 bs->slice_start = 0;
139 bs->slice_end = 0;
140 bs->slice_time = 0;
141 memset(&bs->io_base, 0, sizeof(bs->io_base));
144 static void bdrv_block_timer(void *opaque)
146 BlockDriverState *bs = opaque;
148 qemu_co_queue_next(&bs->throttled_reqs);
151 void bdrv_io_limits_enable(BlockDriverState *bs)
153 qemu_co_queue_init(&bs->throttled_reqs);
154 bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
155 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
156 bs->slice_start = qemu_get_clock_ns(vm_clock);
157 bs->slice_end = bs->slice_start + bs->slice_time;
158 memset(&bs->io_base, 0, sizeof(bs->io_base));
159 bs->io_limits_enabled = true;
162 bool bdrv_io_limits_enabled(BlockDriverState *bs)
164 BlockIOLimit *io_limits = &bs->io_limits;
165 return io_limits->bps[BLOCK_IO_LIMIT_READ]
166 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
167 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
168 || io_limits->iops[BLOCK_IO_LIMIT_READ]
169 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
170 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
173 static void bdrv_io_limits_intercept(BlockDriverState *bs,
174 bool is_write, int nb_sectors)
176 int64_t wait_time = -1;
178 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
179 qemu_co_queue_wait(&bs->throttled_reqs);
182 /* We aim to preserve each request's timing, in FIFO order. The next
183 * throttled request is not dequeued until the current request has been
184 * allowed to be serviced. So if the current request still exceeds the
185 * limits, it is re-inserted at the head of the queue, and all requests
186 * behind it stay in the throttled_reqs queue.
189 while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
190 qemu_mod_timer(bs->block_timer,
191 wait_time + qemu_get_clock_ns(vm_clock));
192 qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
195 qemu_co_queue_next(&bs->throttled_reqs);
198 /* check if the path starts with "<protocol>:" */
199 static int path_has_protocol(const char *path)
201 const char *p;
203 #ifdef _WIN32
204 if (is_windows_drive(path) ||
205 is_windows_drive_prefix(path)) {
206 return 0;
208 p = path + strcspn(path, ":/\\");
209 #else
210 p = path + strcspn(path, ":/");
211 #endif
213 return *p == ':';
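#if 0   /* Illustrative sketch, not compiled: expected results of
         * path_has_protocol() on a POSIX build, for hypothetical inputs. */
assert(path_has_protocol("nbd:127.0.0.1:1234") == 1); /* ':' before any '/' */
assert(path_has_protocol("/var/lib/disk.img") == 0);  /* '/' comes first    */
assert(path_has_protocol("disk.img") == 0);           /* no ':' at all      */
#endif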
216 int path_is_absolute(const char *path)
218 #ifdef _WIN32
219 /* specific case for names like: "\\.\d:" */
220 if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
221 return 1;
223 return (*path == '/' || *path == '\\');
224 #else
225 return (*path == '/');
226 #endif
229 /* if filename is absolute, just copy it to dest. Otherwise, build a
230 path to it by considering it relative to base_path. URLs are
231 supported. */
232 void path_combine(char *dest, int dest_size,
233 const char *base_path,
234 const char *filename)
236 const char *p, *p1;
237 int len;
239 if (dest_size <= 0)
240 return;
241 if (path_is_absolute(filename)) {
242 pstrcpy(dest, dest_size, filename);
243 } else {
244 p = strchr(base_path, ':');
245 if (p)
246 p++;
247 else
248 p = base_path;
249 p1 = strrchr(base_path, '/');
250 #ifdef _WIN32
252 const char *p2;
253 p2 = strrchr(base_path, '\\');
254 if (!p1 || p2 > p1)
255 p1 = p2;
257 #endif
258 if (p1)
259 p1++;
260 else
261 p1 = base_path;
262 if (p1 > p)
263 p = p1;
264 len = p - base_path;
265 if (len > dest_size - 1)
266 len = dest_size - 1;
267 memcpy(dest, base_path, len);
268 dest[len] = '\0';
269 pstrcat(dest, dest_size, filename);
273 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
275 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
276 pstrcpy(dest, sz, bs->backing_file);
277 } else {
278 path_combine(dest, sz, bs->filename, bs->backing_file);
282 void bdrv_register(BlockDriver *bdrv)
284 /* Block drivers without coroutine functions need emulation */
285 if (!bdrv->bdrv_co_readv) {
286 bdrv->bdrv_co_readv = bdrv_co_readv_em;
287 bdrv->bdrv_co_writev = bdrv_co_writev_em;
289 /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
290 * the block driver lacks aio we need to emulate that too.
292 if (!bdrv->bdrv_aio_readv) {
293 /* add AIO emulation layer */
294 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
295 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
299 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
302 /* create a new block device (by default it is empty) */
303 BlockDriverState *bdrv_new(const char *device_name)
305 BlockDriverState *bs;
307 bs = g_malloc0(sizeof(BlockDriverState));
308 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
309 if (device_name[0] != '\0') {
310 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
312 bdrv_iostatus_disable(bs);
313 return bs;
316 BlockDriver *bdrv_find_format(const char *format_name)
318 BlockDriver *drv1;
319 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
320 if (!strcmp(drv1->format_name, format_name)) {
321 return drv1;
324 return NULL;
327 static int bdrv_is_whitelisted(BlockDriver *drv)
329 static const char *whitelist[] = {
330 CONFIG_BDRV_WHITELIST
332 const char **p;
334 if (!whitelist[0])
335 return 1; /* no whitelist, anything goes */
337 for (p = whitelist; *p; p++) {
338 if (!strcmp(drv->format_name, *p)) {
339 return 1;
342 return 0;
345 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
347 BlockDriver *drv = bdrv_find_format(format_name);
348 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
351 typedef struct CreateCo {
352 BlockDriver *drv;
353 char *filename;
354 QEMUOptionParameter *options;
355 int ret;
356 } CreateCo;
358 static void coroutine_fn bdrv_create_co_entry(void *opaque)
360 CreateCo *cco = opaque;
361 assert(cco->drv);
363 cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
366 int bdrv_create(BlockDriver *drv, const char* filename,
367 QEMUOptionParameter *options)
369 int ret;
371 Coroutine *co;
372 CreateCo cco = {
373 .drv = drv,
374 .filename = g_strdup(filename),
375 .options = options,
376 .ret = NOT_DONE,
379 if (!drv->bdrv_create) {
380 return -ENOTSUP;
383 if (qemu_in_coroutine()) {
384 /* Fast-path if already in coroutine context */
385 bdrv_create_co_entry(&cco);
386 } else {
387 co = qemu_coroutine_create(bdrv_create_co_entry);
388 qemu_coroutine_enter(co, &cco);
389 while (cco.ret == NOT_DONE) {
390 qemu_aio_wait();
394 ret = cco.ret;
395 g_free(cco.filename);
397 return ret;
400 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
402 BlockDriver *drv;
404 drv = bdrv_find_protocol(filename);
405 if (drv == NULL) {
406 return -ENOENT;
409 return bdrv_create(drv, filename, options);
413 * Create a uniquely-named empty temporary file.
414 * Return 0 upon success, otherwise a negative errno value.
416 int get_tmp_filename(char *filename, int size)
418 #ifdef _WIN32
419 char temp_dir[MAX_PATH];
420 /* GetTempFileName requires that its output buffer (4th param)
421 have length MAX_PATH or greater. */
422 assert(size >= MAX_PATH);
423 return (GetTempPath(MAX_PATH, temp_dir)
424 && GetTempFileName(temp_dir, "qem", 0, filename)
425 ? 0 : -GetLastError());
426 #else
427 int fd;
428 const char *tmpdir;
429 tmpdir = getenv("TMPDIR");
430 if (!tmpdir)
431 tmpdir = "/tmp";
432 if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
433 return -EOVERFLOW;
435 fd = mkstemp(filename);
436 if (fd < 0 || close(fd)) {
437 return -errno;
439 return 0;
440 #endif
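#if 0   /* Illustrative sketch, not compiled: typical use when a scratch
         * image is needed, e.g. for -snapshot. */
char tmp_name[PATH_MAX];
int ret = get_tmp_filename(tmp_name, sizeof(tmp_name));
if (ret < 0) {
    return ret;   /* a negative errno, e.g. -EOVERFLOW or -ENOENT */
}
/* On POSIX hosts tmp_name now names an empty mkstemp() file such as
 * "/tmp/vl.a1B2c3"; the caller creates its image over it. */
#endif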
444 * Detect host devices. By convention, /dev/cdrom[N] is always
445 * recognized as a host CDROM.
447 static BlockDriver *find_hdev_driver(const char *filename)
449 int score_max = 0, score;
450 BlockDriver *drv = NULL, *d;
452 QLIST_FOREACH(d, &bdrv_drivers, list) {
453 if (d->bdrv_probe_device) {
454 score = d->bdrv_probe_device(filename);
455 if (score > score_max) {
456 score_max = score;
457 drv = d;
462 return drv;
465 BlockDriver *bdrv_find_protocol(const char *filename)
467 BlockDriver *drv1;
468 char protocol[128];
469 int len;
470 const char *p;
472 /* TODO Drivers without bdrv_file_open must be specified explicitly */
475 * XXX(hch): we really should not let host device detection
476 * override an explicit protocol specification, but moving this
477 * later breaks access to device names with colons in them.
478 * Thanks to the brain-dead persistent naming schemes on udev-
479 * based Linux systems those actually are quite common.
481 drv1 = find_hdev_driver(filename);
482 if (drv1) {
483 return drv1;
486 if (!path_has_protocol(filename)) {
487 return bdrv_find_format("file");
489 p = strchr(filename, ':');
490 assert(p != NULL);
491 len = p - filename;
492 if (len > sizeof(protocol) - 1)
493 len = sizeof(protocol) - 1;
494 memcpy(protocol, filename, len);
495 protocol[len] = '\0';
496 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
497 if (drv1->protocol_name &&
498 !strcmp(drv1->protocol_name, protocol)) {
499 return drv1;
502 return NULL;
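#if 0   /* Illustrative sketch, not compiled: protocol resolution for a few
         * hypothetical filenames. */
bdrv_find_protocol("nbd:localhost:10809"); /* -> the "nbd" protocol driver  */
bdrv_find_protocol("disk.img");            /* -> the "file" driver (no ':') */
bdrv_find_protocol("/dev/cdrom");          /* -> a host CD-ROM driver, via
                                            *    find_hdev_driver()         */
#endif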
505 static int find_image_format(const char *filename, BlockDriver **pdrv)
507 int ret, score, score_max;
508 BlockDriver *drv1, *drv;
509 uint8_t buf[2048];
510 BlockDriverState *bs;
512 ret = bdrv_file_open(&bs, filename, 0);
513 if (ret < 0) {
514 *pdrv = NULL;
515 return ret;
518 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
519 if (bs->sg || !bdrv_is_inserted(bs)) {
520 bdrv_delete(bs);
521 drv = bdrv_find_format("raw");
522 if (!drv) {
523 ret = -ENOENT;
525 *pdrv = drv;
526 return ret;
529 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
530 bdrv_delete(bs);
531 if (ret < 0) {
532 *pdrv = NULL;
533 return ret;
536 score_max = 0;
537 drv = NULL;
538 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
539 if (drv1->bdrv_probe) {
540 score = drv1->bdrv_probe(buf, ret, filename);
541 if (score > score_max) {
542 score_max = score;
543 drv = drv1;
547 if (!drv) {
548 ret = -ENOENT;
550 *pdrv = drv;
551 return ret;
555 * Set the current 'total_sectors' value
557 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
559 BlockDriver *drv = bs->drv;
561 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
562 if (bs->sg)
563 return 0;
565 /* query actual device if possible, otherwise just trust the hint */
566 if (drv->bdrv_getlength) {
567 int64_t length = drv->bdrv_getlength(bs);
568 if (length < 0) {
569 return length;
571 hint = length >> BDRV_SECTOR_BITS;
574 bs->total_sectors = hint;
575 return 0;
579 * Set open flags for a given cache mode
581 * Return 0 on success, -1 if the cache mode was invalid.
583 int bdrv_parse_cache_flags(const char *mode, int *flags)
585 *flags &= ~BDRV_O_CACHE_MASK;
587 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
588 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
589 } else if (!strcmp(mode, "directsync")) {
590 *flags |= BDRV_O_NOCACHE;
591 } else if (!strcmp(mode, "writeback")) {
592 *flags |= BDRV_O_CACHE_WB;
593 } else if (!strcmp(mode, "unsafe")) {
594 *flags |= BDRV_O_CACHE_WB;
595 *flags |= BDRV_O_NO_FLUSH;
596 } else if (!strcmp(mode, "writethrough")) {
597 /* this is the default */
598 } else {
599 return -1;
602 return 0;
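#if 0   /* Illustrative sketch, not compiled: the mode -> flags mapping
         * implemented above. */
int flags = 0;
bdrv_parse_cache_flags("none", &flags);
/* flags == BDRV_O_NOCACHE | BDRV_O_CACHE_WB: bypass the host page cache,
 * complete writes once they reach the device cache. */
flags = 0;
bdrv_parse_cache_flags("unsafe", &flags);
/* flags == BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH: writeback, flushes ignored. */
assert(bdrv_parse_cache_flags("bogus", &flags) == -1);
#endif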
606 * The copy-on-read flag is actually a reference count, so multiple users may
607 * use the feature without worrying about clobbering its previous state.
608 * Copy-on-read stays enabled until every user has disabled it again.
610 void bdrv_enable_copy_on_read(BlockDriverState *bs)
612 bs->copy_on_read++;
615 void bdrv_disable_copy_on_read(BlockDriverState *bs)
617 assert(bs->copy_on_read > 0);
618 bs->copy_on_read--;
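#if 0   /* Illustrative sketch, not compiled: because copy_on_read is a
         * reference count, nested users compose safely. */
bdrv_enable_copy_on_read(bs);    /* e.g. opened with BDRV_O_COPY_ON_READ  */
bdrv_enable_copy_on_read(bs);    /* e.g. a temporary image streaming job  */
bdrv_disable_copy_on_read(bs);   /* job done: copy-on-read stays enabled  */
bdrv_disable_copy_on_read(bs);   /* last user gone: copy-on-read is off   */
#endif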
622 * Common part for opening disk images and files
624 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
625 int flags, BlockDriver *drv)
627 int ret, open_flags;
629 assert(drv != NULL);
630 assert(bs->file == NULL);
632 trace_bdrv_open_common(bs, filename, flags, drv->format_name);
634 bs->open_flags = flags;
635 bs->buffer_alignment = 512;
637 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
638 if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
639 bdrv_enable_copy_on_read(bs);
642 pstrcpy(bs->filename, sizeof(bs->filename), filename);
644 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
645 return -ENOTSUP;
648 bs->drv = drv;
649 bs->opaque = g_malloc0(drv->instance_size);
651 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
654 * Clear flags that are internal to the block layer before opening the
655 * image.
657 open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
660 * Snapshots should be writable.
662 if (bs->is_temporary) {
663 open_flags |= BDRV_O_RDWR;
666 bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
668 /* Open the image, either directly or using a protocol */
669 if (drv->bdrv_file_open) {
670 ret = drv->bdrv_file_open(bs, filename, open_flags);
671 } else {
672 ret = bdrv_file_open(&bs->file, filename, open_flags);
673 if (ret >= 0) {
674 ret = drv->bdrv_open(bs, open_flags);
678 if (ret < 0) {
679 goto free_and_fail;
682 ret = refresh_total_sectors(bs, bs->total_sectors);
683 if (ret < 0) {
684 goto free_and_fail;
687 #ifndef _WIN32
688 if (bs->is_temporary) {
689 unlink(filename);
691 #endif
692 return 0;
694 free_and_fail:
695 if (bs->file) {
696 bdrv_delete(bs->file);
697 bs->file = NULL;
699 g_free(bs->opaque);
700 bs->opaque = NULL;
701 bs->drv = NULL;
702 return ret;
706 * Opens a file using a protocol (file, host_device, nbd, ...)
708 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
710 BlockDriverState *bs;
711 BlockDriver *drv;
712 int ret;
714 drv = bdrv_find_protocol(filename);
715 if (!drv) {
716 return -ENOENT;
719 bs = bdrv_new("");
720 ret = bdrv_open_common(bs, filename, flags, drv);
721 if (ret < 0) {
722 bdrv_delete(bs);
723 return ret;
725 bs->growable = 1;
726 *pbs = bs;
727 return 0;
731 * Opens a disk image (raw, qcow2, vmdk, ...)
733 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
734 BlockDriver *drv)
736 int ret;
737 char tmp_filename[PATH_MAX];
739 if (flags & BDRV_O_SNAPSHOT) {
740 BlockDriverState *bs1;
741 int64_t total_size;
742 int is_protocol = 0;
743 BlockDriver *bdrv_qcow2;
744 QEMUOptionParameter *options;
745 char backing_filename[PATH_MAX];
747 /* if snapshot, we create a temporary backing file and open it
748 instead of opening 'filename' directly */
750 /* if there is a backing file, use it */
751 bs1 = bdrv_new("");
752 ret = bdrv_open(bs1, filename, 0, drv);
753 if (ret < 0) {
754 bdrv_delete(bs1);
755 return ret;
757 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
759 if (bs1->drv && bs1->drv->protocol_name)
760 is_protocol = 1;
762 bdrv_delete(bs1);
764 ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
765 if (ret < 0) {
766 return ret;
769 /* Real path is meaningless for protocols */
770 if (is_protocol)
771 snprintf(backing_filename, sizeof(backing_filename),
772 "%s", filename);
773 else if (!realpath(filename, backing_filename))
774 return -errno;
776 bdrv_qcow2 = bdrv_find_format("qcow2");
777 options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
779 set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
780 set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
781 if (drv) {
782 set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
783 drv->format_name);
786 ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
787 free_option_parameters(options);
788 if (ret < 0) {
789 return ret;
792 filename = tmp_filename;
793 drv = bdrv_qcow2;
794 bs->is_temporary = 1;
797 /* Find the right image format driver */
798 if (!drv) {
799 ret = find_image_format(filename, &drv);
802 if (!drv) {
803 goto unlink_and_fail;
806 /* Open the image */
807 ret = bdrv_open_common(bs, filename, flags, drv);
808 if (ret < 0) {
809 goto unlink_and_fail;
812 /* If there is a backing file, use it */
813 if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
814 char backing_filename[PATH_MAX];
815 int back_flags;
816 BlockDriver *back_drv = NULL;
818 bs->backing_hd = bdrv_new("");
819 bdrv_get_full_backing_filename(bs, backing_filename,
820 sizeof(backing_filename));
822 if (bs->backing_format[0] != '\0') {
823 back_drv = bdrv_find_format(bs->backing_format);
826 /* backing files always opened read-only */
827 back_flags =
828 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
830 ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
831 if (ret < 0) {
832 bdrv_close(bs);
833 return ret;
835 if (bs->is_temporary) {
836 bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
837 } else {
838 /* base image inherits from "parent" */
839 bs->backing_hd->keep_read_only = bs->keep_read_only;
843 if (!bdrv_key_required(bs)) {
844 bdrv_dev_change_media_cb(bs, true);
847 /* throttling disk I/O limits */
848 if (bs->io_limits_enabled) {
849 bdrv_io_limits_enable(bs);
852 return 0;
854 unlink_and_fail:
855 if (bs->is_temporary) {
856 unlink(filename);
858 return ret;
861 void bdrv_close(BlockDriverState *bs)
863 bdrv_flush(bs);
864 if (bs->drv) {
865 if (bs->job) {
866 block_job_cancel_sync(bs->job);
868 bdrv_drain_all();
870 if (bs == bs_snapshots) {
871 bs_snapshots = NULL;
873 if (bs->backing_hd) {
874 bdrv_delete(bs->backing_hd);
875 bs->backing_hd = NULL;
877 bs->drv->bdrv_close(bs);
878 g_free(bs->opaque);
879 #ifdef _WIN32
880 if (bs->is_temporary) {
881 unlink(bs->filename);
883 #endif
884 bs->opaque = NULL;
885 bs->drv = NULL;
886 bs->copy_on_read = 0;
887 bs->backing_file[0] = '\0';
888 bs->backing_format[0] = '\0';
889 bs->total_sectors = 0;
890 bs->encrypted = 0;
891 bs->valid_key = 0;
892 bs->sg = 0;
893 bs->growable = 0;
895 if (bs->file != NULL) {
896 bdrv_delete(bs->file);
897 bs->file = NULL;
900 bdrv_dev_change_media_cb(bs, false);
903 /* throttling disk I/O limits */
904 if (bs->io_limits_enabled) {
905 bdrv_io_limits_disable(bs);
909 void bdrv_close_all(void)
911 BlockDriverState *bs;
913 QTAILQ_FOREACH(bs, &bdrv_states, list) {
914 bdrv_close(bs);
919 * Wait for pending requests to complete across all BlockDriverStates
921 * This function does not flush data to disk, use bdrv_flush_all() for that
922 * after calling this function.
924 * Note that completion of an asynchronous I/O operation can trigger any
925 * number of other I/O operations on other devices. For example, a coroutine
926 * can be arbitrarily complex, and a constant flow of I/O can arrive until the
927 * coroutine is complete. Because of this, it is not possible to have a
928 * function to drain a single device's I/O queue.
930 void bdrv_drain_all(void)
932 BlockDriverState *bs;
933 bool busy;
935 do {
936 busy = qemu_aio_wait();
938 /* FIXME: We do not have timer support here, so this is effectively
939 * a busy wait.
941 QTAILQ_FOREACH(bs, &bdrv_states, list) {
942 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
943 qemu_co_queue_restart_all(&bs->throttled_reqs);
944 busy = true;
947 } while (busy);
949 /* If requests are still pending there is a bug somewhere */
950 QTAILQ_FOREACH(bs, &bdrv_states, list) {
951 assert(QLIST_EMPTY(&bs->tracked_requests));
952 assert(qemu_co_queue_empty(&bs->throttled_reqs));
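#if 0   /* Illustrative sketch, not compiled: the ordering the comment above
         * prescribes for a clean shutdown. */
bdrv_drain_all();   /* wait for every in-flight request on every device  */
bdrv_flush_all();   /* then flush the completed writes to stable storage */
#endif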
956 /* make a BlockDriverState anonymous by removing it from the bdrv_states
957 list. Also, NUL-terminate device_name to prevent a double remove */
958 void bdrv_make_anon(BlockDriverState *bs)
960 if (bs->device_name[0] != '\0') {
961 QTAILQ_REMOVE(&bdrv_states, bs, list);
963 bs->device_name[0] = '\0';
966 static void bdrv_rebind(BlockDriverState *bs)
968 if (bs->drv && bs->drv->bdrv_rebind) {
969 bs->drv->bdrv_rebind(bs);
974 * Add new bs contents at the top of an image chain while the chain is
975 * live, while keeping required fields on the top layer.
977 * This will modify the BlockDriverState fields, and swap contents
978 * between bs_new and bs_top. Both bs_new and bs_top are modified.
980 * bs_new is required to be anonymous.
982 * This function does not create any image files.
984 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
986 BlockDriverState tmp;
988 /* bs_new must be anonymous */
989 assert(bs_new->device_name[0] == '\0');
991 tmp = *bs_new;
993 /* there are some fields that need to stay on the top layer: */
994 tmp.open_flags = bs_top->open_flags;
996 /* dev info */
997 tmp.dev_ops = bs_top->dev_ops;
998 tmp.dev_opaque = bs_top->dev_opaque;
999 tmp.dev = bs_top->dev;
1000 tmp.buffer_alignment = bs_top->buffer_alignment;
1001 tmp.copy_on_read = bs_top->copy_on_read;
1003 tmp.enable_write_cache = bs_top->enable_write_cache;
1005 /* i/o timing parameters */
1006 tmp.slice_time = bs_top->slice_time;
1007 tmp.slice_start = bs_top->slice_start;
1008 tmp.slice_end = bs_top->slice_end;
1009 tmp.io_limits = bs_top->io_limits;
1010 tmp.io_base = bs_top->io_base;
1011 tmp.throttled_reqs = bs_top->throttled_reqs;
1012 tmp.block_timer = bs_top->block_timer;
1013 tmp.io_limits_enabled = bs_top->io_limits_enabled;
1015 /* geometry */
1016 tmp.cyls = bs_top->cyls;
1017 tmp.heads = bs_top->heads;
1018 tmp.secs = bs_top->secs;
1019 tmp.translation = bs_top->translation;
1021 /* r/w error */
1022 tmp.on_read_error = bs_top->on_read_error;
1023 tmp.on_write_error = bs_top->on_write_error;
1025 /* i/o status */
1026 tmp.iostatus_enabled = bs_top->iostatus_enabled;
1027 tmp.iostatus = bs_top->iostatus;
1029 /* keep the same entry in bdrv_states */
1030 pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
1031 tmp.list = bs_top->list;
1033 /* The contents of 'tmp' will become bs_top, as we are
1034 * swapping bs_new and bs_top contents. */
1035 tmp.backing_hd = bs_new;
1036 pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
1037 bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));
1039 /* swap contents of the fixed new bs and the current top */
1040 *bs_new = *bs_top;
1041 *bs_top = tmp;
1043 /* device_name[] was carried over from the old bs_top. bs_new
1044 * shouldn't be in bdrv_states, so we need to make device_name[]
1045 * reflect the anonymity of bs_new
1047 bs_new->device_name[0] = '\0';
1049 /* clear the copied fields in the new backing file */
1050 bdrv_detach_dev(bs_new, bs_new->dev);
1052 qemu_co_queue_init(&bs_new->throttled_reqs);
1053 memset(&bs_new->io_base, 0, sizeof(bs_new->io_base));
1054 memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
1055 bdrv_iostatus_disable(bs_new);
1057 /* we don't use bdrv_io_limits_disable() for this, because we don't want
1058 * to affect or delete the block_timer, as it has been moved to bs_top */
1059 bs_new->io_limits_enabled = false;
1060 bs_new->block_timer = NULL;
1061 bs_new->slice_time = 0;
1062 bs_new->slice_start = 0;
1063 bs_new->slice_end = 0;
1065 bdrv_rebind(bs_new);
1066 bdrv_rebind(bs_top);
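#if 0   /* Illustrative sketch, not compiled: a live-snapshot style use of
         * bdrv_append(); the overlay path and flags are hypothetical. */
BlockDriverState *overlay = bdrv_new("");   /* anonymous, as required */
bdrv_open(overlay, "/images/snap.qcow2", BDRV_O_RDWR | BDRV_O_CACHE_WB, NULL);
bdrv_append(overlay, bs);
/* The guest-visible state (device name, dev ops, geometry, throttling and,
 * per this commit, enable_write_cache) stays on the top layer, while the
 * old top becomes the overlay's backing file. */
#endif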
1069 void bdrv_delete(BlockDriverState *bs)
1071 assert(!bs->dev);
1072 assert(!bs->job);
1073 assert(!bs->in_use);
1075 /* remove from list, if necessary */
1076 bdrv_make_anon(bs);
1078 bdrv_close(bs);
1080 assert(bs != bs_snapshots);
1081 g_free(bs);
1084 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1085 /* TODO change to DeviceState *dev when all users are qdevified */
1087 if (bs->dev) {
1088 return -EBUSY;
1090 bs->dev = dev;
1091 bdrv_iostatus_reset(bs);
1092 return 0;
1095 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1096 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1098 if (bdrv_attach_dev(bs, dev) < 0) {
1099 abort();
1103 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1104 /* TODO change to DeviceState *dev when all users are qdevified */
1106 assert(bs->dev == dev);
1107 bs->dev = NULL;
1108 bs->dev_ops = NULL;
1109 bs->dev_opaque = NULL;
1110 bs->buffer_alignment = 512;
1113 /* TODO change to return DeviceState * when all users are qdevified */
1114 void *bdrv_get_attached_dev(BlockDriverState *bs)
1116 return bs->dev;
1119 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1120 void *opaque)
1122 bs->dev_ops = ops;
1123 bs->dev_opaque = opaque;
1124 if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1125 bs_snapshots = NULL;
1129 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1130 BlockQMPEventAction action, int is_read)
1132 QObject *data;
1133 const char *action_str;
1135 switch (action) {
1136 case BDRV_ACTION_REPORT:
1137 action_str = "report";
1138 break;
1139 case BDRV_ACTION_IGNORE:
1140 action_str = "ignore";
1141 break;
1142 case BDRV_ACTION_STOP:
1143 action_str = "stop";
1144 break;
1145 default:
1146 abort();
1149 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1150 bdrv->device_name,
1151 action_str,
1152 is_read ? "read" : "write");
1153 monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1155 qobject_decref(data);
1158 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1160 QObject *data;
1162 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1163 bdrv_get_device_name(bs), ejected);
1164 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1166 qobject_decref(data);
1169 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1171 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1172 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1173 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1174 if (tray_was_closed) {
1175 /* tray open */
1176 bdrv_emit_qmp_eject_event(bs, true);
1178 if (load) {
1179 /* tray close */
1180 bdrv_emit_qmp_eject_event(bs, false);
1185 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1187 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1190 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1192 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1193 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1197 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1199 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1200 return bs->dev_ops->is_tray_open(bs->dev_opaque);
1202 return false;
1205 static void bdrv_dev_resize_cb(BlockDriverState *bs)
1207 if (bs->dev_ops && bs->dev_ops->resize_cb) {
1208 bs->dev_ops->resize_cb(bs->dev_opaque);
1212 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1214 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1215 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1217 return false;
1221 * Run consistency checks on an image
1223 * Returns 0 if the check could be completed (it doesn't mean that the image is
1224 * free of errors) or -errno when an internal error occurred. The results of the
1225 * check are stored in res.
1227 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1229 if (bs->drv->bdrv_check == NULL) {
1230 return -ENOTSUP;
1233 memset(res, 0, sizeof(*res));
1234 return bs->drv->bdrv_check(bs, res, fix);
1237 #define COMMIT_BUF_SECTORS 2048
1239 /* commit COW file into the raw image */
1240 int bdrv_commit(BlockDriverState *bs)
1242 BlockDriver *drv = bs->drv;
1243 BlockDriver *backing_drv;
1244 int64_t sector, total_sectors;
1245 int n, ro, open_flags;
1246 int ret = 0, rw_ret = 0;
1247 uint8_t *buf;
1248 char filename[1024];
1249 BlockDriverState *bs_rw, *bs_ro;
1251 if (!drv)
1252 return -ENOMEDIUM;
1254 if (!bs->backing_hd) {
1255 return -ENOTSUP;
1258 if (bs->backing_hd->keep_read_only) {
1259 return -EACCES;
1262 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1263 return -EBUSY;
1266 backing_drv = bs->backing_hd->drv;
1267 ro = bs->backing_hd->read_only;
1268 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename); /* NUL-terminates, unlike strncpy */
1269 open_flags = bs->backing_hd->open_flags;
1271 if (ro) {
1272 /* re-open as RW */
1273 bdrv_delete(bs->backing_hd);
1274 bs->backing_hd = NULL;
1275 bs_rw = bdrv_new("");
1276 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1277 backing_drv);
1278 if (rw_ret < 0) {
1279 bdrv_delete(bs_rw);
1280 /* try to re-open read-only */
1281 bs_ro = bdrv_new("");
1282 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1283 backing_drv);
1284 if (ret < 0) {
1285 bdrv_delete(bs_ro);
1286 /* drive not functional anymore */
1287 bs->drv = NULL;
1288 return ret;
1290 bs->backing_hd = bs_ro;
1291 return rw_ret;
1293 bs->backing_hd = bs_rw;
1296 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1297 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1299 for (sector = 0; sector < total_sectors; sector += n) {
1300 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1302 if (bdrv_read(bs, sector, buf, n) != 0) {
1303 ret = -EIO;
1304 goto ro_cleanup;
1307 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1308 ret = -EIO;
1309 goto ro_cleanup;
1314 if (drv->bdrv_make_empty) {
1315 ret = drv->bdrv_make_empty(bs);
1316 bdrv_flush(bs);
1320 * Make sure all data we wrote to the backing device is actually
1321 * stable on disk.
1323 if (bs->backing_hd)
1324 bdrv_flush(bs->backing_hd);
1326 ro_cleanup:
1327 g_free(buf);
1329 if (ro) {
1330 /* re-open as RO */
1331 bdrv_delete(bs->backing_hd);
1332 bs->backing_hd = NULL;
1333 bs_ro = bdrv_new("");
1334 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1335 backing_drv);
1336 if (ret < 0) {
1337 bdrv_delete(bs_ro);
1338 /* drive not functional anymore */
1339 bs->drv = NULL;
1340 return ret;
1342 bs->backing_hd = bs_ro;
1343 bs->backing_hd->keep_read_only = 0;
1346 return ret;
1349 int bdrv_commit_all(void)
1351 BlockDriverState *bs;
1353 QTAILQ_FOREACH(bs, &bdrv_states, list) {
1354 int ret = bdrv_commit(bs);
1355 if (ret < 0) {
1356 return ret;
1359 return 0;
1362 struct BdrvTrackedRequest {
1363 BlockDriverState *bs;
1364 int64_t sector_num;
1365 int nb_sectors;
1366 bool is_write;
1367 QLIST_ENTRY(BdrvTrackedRequest) list;
1368 Coroutine *co; /* owner, used for deadlock detection */
1369 CoQueue wait_queue; /* coroutines blocked on this request */
1373 * Remove an active request from the tracked requests list
1375 * This function should be called when a tracked request is completing.
1377 static void tracked_request_end(BdrvTrackedRequest *req)
1379 QLIST_REMOVE(req, list);
1380 qemu_co_queue_restart_all(&req->wait_queue);
1384 * Add an active request to the tracked requests list
1386 static void tracked_request_begin(BdrvTrackedRequest *req,
1387 BlockDriverState *bs,
1388 int64_t sector_num,
1389 int nb_sectors, bool is_write)
1391 *req = (BdrvTrackedRequest){
1392 .bs = bs,
1393 .sector_num = sector_num,
1394 .nb_sectors = nb_sectors,
1395 .is_write = is_write,
1396 .co = qemu_coroutine_self(),
1399 qemu_co_queue_init(&req->wait_queue);
1401 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1405 * Round a region to cluster boundaries
1407 static void round_to_clusters(BlockDriverState *bs,
1408 int64_t sector_num, int nb_sectors,
1409 int64_t *cluster_sector_num,
1410 int *cluster_nb_sectors)
1412 BlockDriverInfo bdi;
1414 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1415 *cluster_sector_num = sector_num;
1416 *cluster_nb_sectors = nb_sectors;
1417 } else {
1418 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1419 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1420 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1421 nb_sectors, c);
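#if 0   /* Illustrative sketch, not compiled: the alignment math, assuming a
         * 64 KB cluster size (c = 128 sectors of 512 bytes). */
int64_t cluster_sector_num;
int cluster_nb_sectors;
round_to_clusters(bs, 130, 10, &cluster_sector_num, &cluster_nb_sectors);
/* QEMU_ALIGN_DOWN(130, 128) == 128  -> cluster_sector_num == 128;
 * QEMU_ALIGN_UP(130 - 128 + 10, 128) == 128 -> cluster_nb_sectors == 128,
 * i.e. the request is widened to exactly one whole cluster. */
#endif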
1425 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1426 int64_t sector_num, int nb_sectors) {
1427 /* aaaa bbbb */
1428 if (sector_num >= req->sector_num + req->nb_sectors) {
1429 return false;
1431 /* bbbb aaaa */
1432 if (req->sector_num >= sector_num + nb_sectors) {
1433 return false;
1435 return true;
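#if 0   /* Illustrative sketch, not compiled: requests are half-open sector
         * intervals [sector_num, sector_num + nb_sectors). */
BdrvTrackedRequest req = { .sector_num = 100, .nb_sectors = 20 };
assert(tracked_request_overlaps(&req, 119, 1));   /* shares sector 119    */
assert(!tracked_request_overlaps(&req, 120, 10)); /* adjacent, no overlap */
#endif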
1438 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1439 int64_t sector_num, int nb_sectors)
1441 BdrvTrackedRequest *req;
1442 int64_t cluster_sector_num;
1443 int cluster_nb_sectors;
1444 bool retry;
1446 /* If we touch the same cluster it counts as an overlap. This guarantees
1447 * that allocating writes will be serialized and not race with each other
1448 * for the same cluster. For example, in copy-on-read it ensures that the
1449 * CoR read and write operations are atomic and guest writes cannot
1450 * interleave between them.
1452 round_to_clusters(bs, sector_num, nb_sectors,
1453 &cluster_sector_num, &cluster_nb_sectors);
1455 do {
1456 retry = false;
1457 QLIST_FOREACH(req, &bs->tracked_requests, list) {
1458 if (tracked_request_overlaps(req, cluster_sector_num,
1459 cluster_nb_sectors)) {
1460 /* Hitting this means there was a reentrant request, for
1461 * example, a block driver issuing nested requests. This must
1462 * never happen since it means deadlock.
1464 assert(qemu_coroutine_self() != req->co);
1466 qemu_co_queue_wait(&req->wait_queue);
1467 retry = true;
1468 break;
1471 } while (retry);
1475 * Return values:
1476 * 0 - success
1477 * -EINVAL - backing format specified, but no file
1478 * -ENOSPC - can't update the backing file because no space is left in the
1479 * image file header
1480 * -ENOTSUP - format driver doesn't support changing the backing file
1482 int bdrv_change_backing_file(BlockDriverState *bs,
1483 const char *backing_file, const char *backing_fmt)
1485 BlockDriver *drv = bs->drv;
1486 int ret;
1488 /* Backing file format doesn't make sense without a backing file */
1489 if (backing_fmt && !backing_file) {
1490 return -EINVAL;
1493 if (drv->bdrv_change_backing_file != NULL) {
1494 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1495 } else {
1496 ret = -ENOTSUP;
1499 if (ret == 0) {
1500 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1501 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1503 return ret;
1506 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1507 size_t size)
1509 int64_t len;
1511 if (!bdrv_is_inserted(bs))
1512 return -ENOMEDIUM;
1514 if (bs->growable)
1515 return 0;
1517 len = bdrv_getlength(bs);
1519 if (offset < 0)
1520 return -EIO;
1522 if ((offset > len) || (len - offset < size))
1523 return -EIO;
1525 return 0;
1528 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1529 int nb_sectors)
1531 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1532 nb_sectors * BDRV_SECTOR_SIZE);
1535 typedef struct RwCo {
1536 BlockDriverState *bs;
1537 int64_t sector_num;
1538 int nb_sectors;
1539 QEMUIOVector *qiov;
1540 bool is_write;
1541 int ret;
1542 } RwCo;
1544 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1546 RwCo *rwco = opaque;
1548 if (!rwco->is_write) {
1549 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1550 rwco->nb_sectors, rwco->qiov, 0);
1551 } else {
1552 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1553 rwco->nb_sectors, rwco->qiov, 0);
1558 * Process a synchronous request using coroutines
1560 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1561 int nb_sectors, bool is_write)
1563 QEMUIOVector qiov;
1564 struct iovec iov = {
1565 .iov_base = (void *)buf,
1566 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1568 Coroutine *co;
1569 RwCo rwco = {
1570 .bs = bs,
1571 .sector_num = sector_num,
1572 .nb_sectors = nb_sectors,
1573 .qiov = &qiov,
1574 .is_write = is_write,
1575 .ret = NOT_DONE,
1578 qemu_iovec_init_external(&qiov, &iov, 1);
1581 * In synchronous call context the vcpu is blocked, so this throttling
1582 * timer will never fire; the I/O throttling function therefore has to be
1583 * disabled here if it has been enabled.
1585 if (bs->io_limits_enabled) {
1586 fprintf(stderr, "Disabling I/O throttling on '%s' due "
1587 "to synchronous I/O.\n", bdrv_get_device_name(bs));
1588 bdrv_io_limits_disable(bs);
1591 if (qemu_in_coroutine()) {
1592 /* Fast-path if already in coroutine context */
1593 bdrv_rw_co_entry(&rwco);
1594 } else {
1595 co = qemu_coroutine_create(bdrv_rw_co_entry);
1596 qemu_coroutine_enter(co, &rwco);
1597 while (rwco.ret == NOT_DONE) {
1598 qemu_aio_wait();
1601 return rwco.ret;
1604 /* return < 0 if error. See bdrv_write() for the return codes */
1605 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1606 uint8_t *buf, int nb_sectors)
1608 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1611 #define BITS_PER_LONG (sizeof(unsigned long) * 8)
1613 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1614 int nb_sectors, int dirty)
1616 int64_t start, end;
1617 unsigned long val, idx, bit;
1619 start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1620 end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1622 for (; start <= end; start++) {
1623 idx = start / BITS_PER_LONG;
1624 bit = start % BITS_PER_LONG;
1625 val = bs->dirty_bitmap[idx];
1626 if (dirty) {
1627 if (!(val & (1UL << bit))) {
1628 bs->dirty_count++;
1629 val |= 1UL << bit;
1631 } else {
1632 if (val & (1UL << bit)) {
1633 bs->dirty_count--;
1634 val &= ~(1UL << bit);
1637 bs->dirty_bitmap[idx] = val;
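#if 0   /* Illustrative sketch, not compiled: where a sector lands in the
         * dirty bitmap, assuming BDRV_SECTORS_PER_DIRTY_CHUNK == 2048 and
         * 64-bit longs. */
int64_t chunk = 1000000 / BDRV_SECTORS_PER_DIRTY_CHUNK; /* == 488 */
unsigned long idx = chunk / BITS_PER_LONG;              /* == 7   */
unsigned long bit = chunk % BITS_PER_LONG;              /* == 40  */
/* sector 1000000 is tracked by bit 40 of bs->dirty_bitmap[7] */
#endif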
1641 /* Return < 0 if error. Important errors are:
1642 -EIO generic I/O error (may happen for all errors)
1643 -ENOMEDIUM No media inserted.
1644 -EINVAL Invalid sector number or nb_sectors
1645 -EACCES Trying to write a read-only device
1647 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1648 const uint8_t *buf, int nb_sectors)
1650 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1653 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1654 void *buf, int count1)
1656 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1657 int len, nb_sectors, count;
1658 int64_t sector_num;
1659 int ret;
1661 count = count1;
1662 /* first read to align to sector start */
1663 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1664 if (len > count)
1665 len = count;
1666 sector_num = offset >> BDRV_SECTOR_BITS;
1667 if (len > 0) {
1668 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1669 return ret;
1670 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1671 count -= len;
1672 if (count == 0)
1673 return count1;
1674 sector_num++;
1675 buf += len;
1678 /* read the sectors "in place" */
1679 nb_sectors = count >> BDRV_SECTOR_BITS;
1680 if (nb_sectors > 0) {
1681 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1682 return ret;
1683 sector_num += nb_sectors;
1684 len = nb_sectors << BDRV_SECTOR_BITS;
1685 buf += len;
1686 count -= len;
1689 /* add data from the last sector */
1690 if (count > 0) {
1691 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1692 return ret;
1693 memcpy(buf, tmp_buf, count);
1695 return count1;
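#if 0   /* Illustrative sketch, not compiled: how bdrv_pread() splits an
         * unaligned request; the offsets are hypothetical. */
int64_t offset = 1000, count = 2000;
int head = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);     /* 24   */
int body = ((count - head) >> BDRV_SECTOR_BITS) << BDRV_SECTOR_BITS; /* 1536 */
int tail = count - head - body;                                      /* 440  */
/* head: tail end of sector 1, read via the bounce buffer;
 * body: sectors 2..4, read "in place" into the caller's buffer;
 * tail: first 440 bytes of sector 5, again via the bounce buffer. */
#endif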
1698 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1699 const void *buf, int count1)
1701 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1702 int len, nb_sectors, count;
1703 int64_t sector_num;
1704 int ret;
1706 count = count1;
1707 /* first write to align to sector start */
1708 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1709 if (len > count)
1710 len = count;
1711 sector_num = offset >> BDRV_SECTOR_BITS;
1712 if (len > 0) {
1713 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1714 return ret;
1715 memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1716 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1717 return ret;
1718 count -= len;
1719 if (count == 0)
1720 return count1;
1721 sector_num++;
1722 buf += len;
1725 /* write the sectors "in place" */
1726 nb_sectors = count >> BDRV_SECTOR_BITS;
1727 if (nb_sectors > 0) {
1728 if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1729 return ret;
1730 sector_num += nb_sectors;
1731 len = nb_sectors << BDRV_SECTOR_BITS;
1732 buf += len;
1733 count -= len;
1736 /* add data from the last sector */
1737 if (count > 0) {
1738 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1739 return ret;
1740 memcpy(tmp_buf, buf, count);
1741 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1742 return ret;
1744 return count1;
1748 * Writes to the file and ensures that no writes are reordered across this
1749 * request (acts as a barrier)
1751 * Returns 0 on success, -errno in error cases.
1753 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1754 const void *buf, int count)
1756 int ret;
1758 ret = bdrv_pwrite(bs, offset, buf, count);
1759 if (ret < 0) {
1760 return ret;
1763 /* No flush needed for cache modes that already do it */
1764 if (bs->enable_write_cache) {
1765 bdrv_flush(bs);
1768 return 0;
1771 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
1772 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1774 /* Perform I/O through a temporary buffer so that users who scribble over
1775 * their read buffer while the operation is in progress do not end up
1776 * modifying the image file. This is critical for zero-copy guest I/O
1777 * where anything might happen inside guest memory.
1779 void *bounce_buffer;
1781 BlockDriver *drv = bs->drv;
1782 struct iovec iov;
1783 QEMUIOVector bounce_qiov;
1784 int64_t cluster_sector_num;
1785 int cluster_nb_sectors;
1786 size_t skip_bytes;
1787 int ret;
1789 /* Cover the entire cluster so no additional backing file I/O is required
1790 * when allocating the cluster in the image file.
1792 round_to_clusters(bs, sector_num, nb_sectors,
1793 &cluster_sector_num, &cluster_nb_sectors);
1795 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
1796 cluster_sector_num, cluster_nb_sectors);
1798 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1799 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1800 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1802 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
1803 &bounce_qiov);
1804 if (ret < 0) {
1805 goto err;
1808 if (drv->bdrv_co_write_zeroes &&
1809 buffer_is_zero(bounce_buffer, iov.iov_len)) {
1810 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
1811 cluster_nb_sectors);
1812 } else {
1813 /* This does not change the data on the disk, it is not necessary
1814 * to flush even in cache=writethrough mode.
1816 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
1817 &bounce_qiov);
1820 if (ret < 0) {
1821 /* It might be okay to ignore write errors for guest requests. If this
1822 * is a deliberate copy-on-read then we don't want to ignore the error.
1823 * Simply report it in all cases.
1825 goto err;
1828 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1829 qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
1830 nb_sectors * BDRV_SECTOR_SIZE);
1832 err:
1833 qemu_vfree(bounce_buffer);
1834 return ret;
1838 * Handle a read request in coroutine context
1840 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1841 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1842 BdrvRequestFlags flags)
1844 BlockDriver *drv = bs->drv;
1845 BdrvTrackedRequest req;
1846 int ret;
1848 if (!drv) {
1849 return -ENOMEDIUM;
1851 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1852 return -EIO;
1855 /* throttling disk read I/O */
1856 if (bs->io_limits_enabled) {
1857 bdrv_io_limits_intercept(bs, false, nb_sectors);
1860 if (bs->copy_on_read) {
1861 flags |= BDRV_REQ_COPY_ON_READ;
1863 if (flags & BDRV_REQ_COPY_ON_READ) {
1864 bs->copy_on_read_in_flight++;
1867 if (bs->copy_on_read_in_flight) {
1868 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1871 tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1873 if (flags & BDRV_REQ_COPY_ON_READ) {
1874 int pnum;
1876 ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1877 if (ret < 0) {
1878 goto out;
1881 if (!ret || pnum != nb_sectors) {
1882 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
1883 goto out;
1887 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1889 out:
1890 tracked_request_end(&req);
1892 if (flags & BDRV_REQ_COPY_ON_READ) {
1893 bs->copy_on_read_in_flight--;
1896 return ret;
1899 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1900 int nb_sectors, QEMUIOVector *qiov)
1902 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1904 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1907 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1908 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1910 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1912 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1913 BDRV_REQ_COPY_ON_READ);
1916 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1917 int64_t sector_num, int nb_sectors)
1919 BlockDriver *drv = bs->drv;
1920 QEMUIOVector qiov;
1921 struct iovec iov;
1922 int ret;
1924 /* TODO Emulate only part of misaligned requests instead of letting block
1925 * drivers return -ENOTSUP and emulate everything */
1927 /* First try the efficient write zeroes operation */
1928 if (drv->bdrv_co_write_zeroes) {
1929 ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1930 if (ret != -ENOTSUP) {
1931 return ret;
1935 /* Fall back to bounce buffer if write zeroes is unsupported */
1936 iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
1937 iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1938 memset(iov.iov_base, 0, iov.iov_len);
1939 qemu_iovec_init_external(&qiov, &iov, 1);
1941 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1943 qemu_vfree(iov.iov_base);
1944 return ret;
1948 * Handle a write request in coroutine context
1950 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1951 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1952 BdrvRequestFlags flags)
1954 BlockDriver *drv = bs->drv;
1955 BdrvTrackedRequest req;
1956 int ret;
1958 if (!bs->drv) {
1959 return -ENOMEDIUM;
1961 if (bs->read_only) {
1962 return -EACCES;
1964 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1965 return -EIO;
1968 /* throttling disk write I/O */
1969 if (bs->io_limits_enabled) {
1970 bdrv_io_limits_intercept(bs, true, nb_sectors);
1973 if (bs->copy_on_read_in_flight) {
1974 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1977 tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1979 if (flags & BDRV_REQ_ZERO_WRITE) {
1980 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
1981 } else {
1982 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1985 if (ret == 0 && !bs->enable_write_cache) {
1986 ret = bdrv_co_flush(bs);
1989 if (bs->dirty_bitmap) {
1990 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1993 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1994 bs->wr_highest_sector = sector_num + nb_sectors - 1;
1997 tracked_request_end(&req);
1999 return ret;
2002 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
2003 int nb_sectors, QEMUIOVector *qiov)
2005 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
2007 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
2010 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
2011 int64_t sector_num, int nb_sectors)
2013 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
2015 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
2016 BDRV_REQ_ZERO_WRITE);
2020 * Truncate file to 'offset' bytes (needed only for file protocols)
2022 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2024 BlockDriver *drv = bs->drv;
2025 int ret;
2026 if (!drv)
2027 return -ENOMEDIUM;
2028 if (!drv->bdrv_truncate)
2029 return -ENOTSUP;
2030 if (bs->read_only)
2031 return -EACCES;
2032 if (bdrv_in_use(bs))
2033 return -EBUSY;
2034 ret = drv->bdrv_truncate(bs, offset);
2035 if (ret == 0) {
2036 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2037 bdrv_dev_resize_cb(bs);
2039 return ret;
2043 * Length of an allocated file in bytes. Sparse files are counted by their
2044 * actual allocated space. Return < 0 on error or if unknown.
2046 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2048 BlockDriver *drv = bs->drv;
2049 if (!drv) {
2050 return -ENOMEDIUM;
2052 if (drv->bdrv_get_allocated_file_size) {
2053 return drv->bdrv_get_allocated_file_size(bs);
2055 if (bs->file) {
2056 return bdrv_get_allocated_file_size(bs->file);
2058 return -ENOTSUP;
2062 * Length of a file in bytes. Return < 0 if error or unknown.
2064 int64_t bdrv_getlength(BlockDriverState *bs)
2066 BlockDriver *drv = bs->drv;
2067 if (!drv)
2068 return -ENOMEDIUM;
2070 if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2071 if (drv->bdrv_getlength) {
2072 return drv->bdrv_getlength(bs);
2075 return bs->total_sectors * BDRV_SECTOR_SIZE;
2078 /* return 0 as number of sectors if no device present or error */
2079 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2081 int64_t length;
2082 length = bdrv_getlength(bs);
2083 if (length < 0)
2084 length = 0;
2085 else
2086 length = length >> BDRV_SECTOR_BITS;
2087 *nb_sectors_ptr = length;
2090 struct partition {
2091 uint8_t boot_ind; /* 0x80 - active */
2092 uint8_t head; /* starting head */
2093 uint8_t sector; /* starting sector */
2094 uint8_t cyl; /* starting cylinder */
2095 uint8_t sys_ind; /* What partition type */
2096 uint8_t end_head; /* end head */
2097 uint8_t end_sector; /* end sector */
2098 uint8_t end_cyl; /* end cylinder */
2099 uint32_t start_sect; /* starting sector counting from 0 */
2100 uint32_t nr_sects; /* nr of sectors in partition */
2101 } QEMU_PACKED;
2103 /* Try to guess the disk logical geometry from the MS-DOS partition table. Return 0 if OK, -1 if it could not be guessed */
2104 static int guess_disk_lchs(BlockDriverState *bs,
2105 int *pcylinders, int *pheads, int *psectors)
2107 uint8_t buf[BDRV_SECTOR_SIZE];
2108 int ret, i, heads, sectors, cylinders;
2109 struct partition *p;
2110 uint32_t nr_sects;
2111 uint64_t nb_sectors;
2112 bool enabled;
2114 bdrv_get_geometry(bs, &nb_sectors);
2117 * This function may be invoked during startup not only in sync I/O mode,
2118 * but also in async I/O mode, so the I/O throttling function has to be
2119 * disabled temporarily here, not permanently.
2121 enabled = bs->io_limits_enabled;
2122 bs->io_limits_enabled = false;
2123 ret = bdrv_read(bs, 0, buf, 1);
2124 bs->io_limits_enabled = enabled;
2125 if (ret < 0)
2126 return -1;
2127 /* test msdos magic */
2128 if (buf[510] != 0x55 || buf[511] != 0xaa)
2129 return -1;
2130 for(i = 0; i < 4; i++) {
2131 p = ((struct partition *)(buf + 0x1be)) + i;
2132 nr_sects = le32_to_cpu(p->nr_sects);
2133 if (nr_sects && p->end_head) {
2134 /* We make the assumption that the partition terminates on
2135 a cylinder boundary */
2136 heads = p->end_head + 1;
2137 sectors = p->end_sector & 63;
2138 if (sectors == 0)
2139 continue;
2140 cylinders = nb_sectors / (heads * sectors);
2141 if (cylinders < 1 || cylinders > 16383)
2142 continue;
2143 *pheads = heads;
2144 *psectors = sectors;
2145 *pcylinders = cylinders;
2146 #if 0
2147 printf("guessed geometry: LCHS=%d %d %d\n",
2148 cylinders, heads, sectors);
2149 #endif
2150 return 0;
2153 return -1;
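#if 0   /* Illustrative sketch, not compiled: the LCHS guess for a
         * hypothetical 1 GB image whose first partition entry ends at
         * head 15, sector 63. */
uint64_t nb_sectors = 2097152;                  /* 1 GB / 512 bytes      */
int heads = 15 + 1;                             /* p->end_head + 1       */
int sectors = 63;                               /* p->end_sector & 63    */
int cylinders = nb_sectors / (heads * sectors); /* == 2080, within range */
#endif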
2156 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
2158 int translation, lba_detected = 0;
2159 int cylinders, heads, secs;
2160 uint64_t nb_sectors;
2162 /* if a geometry hint is available, use it */
2163 bdrv_get_geometry(bs, &nb_sectors);
2164 bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
2165 translation = bdrv_get_translation_hint(bs);
2166 if (cylinders != 0) {
2167 *pcyls = cylinders;
2168 *pheads = heads;
2169 *psecs = secs;
2170 } else {
2171 if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
2172 if (heads > 16) {
2173 /* if heads > 16, it means that a BIOS LBA
2174 translation was active, so the default
2175 hardware geometry is OK */
2176 lba_detected = 1;
2177 goto default_geometry;
2178 } else {
2179 *pcyls = cylinders;
2180 *pheads = heads;
2181 *psecs = secs;
2182 /* disable any translation to be in sync with
2183 the logical geometry */
2184 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
2185 bdrv_set_translation_hint(bs,
2186 BIOS_ATA_TRANSLATION_NONE);
2189 } else {
2190 default_geometry:
2191 /* if no geometry, use a standard physical disk geometry */
2192 cylinders = nb_sectors / (16 * 63);
2194 if (cylinders > 16383)
2195 cylinders = 16383;
2196 else if (cylinders < 2)
2197 cylinders = 2;
2198 *pcyls = cylinders;
2199 *pheads = 16;
2200 *psecs = 63;
2201 if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
2202 if ((*pcyls * *pheads) <= 131072) {
2203 bdrv_set_translation_hint(bs,
2204 BIOS_ATA_TRANSLATION_LARGE);
2205 } else {
2206 bdrv_set_translation_hint(bs,
2207 BIOS_ATA_TRANSLATION_LBA);
2211 bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
2215 void bdrv_set_geometry_hint(BlockDriverState *bs,
2216 int cyls, int heads, int secs)
2218 bs->cyls = cyls;
2219 bs->heads = heads;
2220 bs->secs = secs;
2223 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
2225 bs->translation = translation;
2228 void bdrv_get_geometry_hint(BlockDriverState *bs,
2229 int *pcyls, int *pheads, int *psecs)
2231 *pcyls = bs->cyls;
2232 *pheads = bs->heads;
2233 *psecs = bs->secs;
2236 /* throttling disk io limits */
2237 void bdrv_set_io_limits(BlockDriverState *bs,
2238 BlockIOLimit *io_limits)
2240 bs->io_limits = *io_limits;
2241 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2244 /* Recognize floppy formats */
2245 typedef struct FDFormat {
2246 FDriveType drive;
2247 uint8_t last_sect;
2248 uint8_t max_track;
2249 uint8_t max_head;
2250 FDriveRate rate;
2251 } FDFormat;
2253 static const FDFormat fd_formats[] = {
2254 /* First entry is default format */
2255 /* 1.44 MB 3"1/2 floppy disks */
2256 { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2257 { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2258 { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2259 { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2260 { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2261 { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2262 { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2263 { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
2264 /* 2.88 MB 3"1/2 floppy disks */
2265 { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2266 { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2267 { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2268 { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2269 { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
2270 /* 720 kB 3"1/2 floppy disks */
2271 { FDRIVE_DRV_144, 9, 80, 1, FDRIVE_RATE_250K, },
2272 { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2273 { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2274 { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2275 { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2276 { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
2277 /* 1.2 MB 5"1/4 floppy disks */
2278 { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2279 { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2280 { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2281 { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2282 { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
2283 /* 720 kB 5"1/4 floppy disks */
2284 { FDRIVE_DRV_120, 9, 80, 1, FDRIVE_RATE_250K, },
2285 { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
2286 /* 360 kB 5"1/4 floppy disks */
2287 { FDRIVE_DRV_120, 9, 40, 1, FDRIVE_RATE_300K, },
2288 { FDRIVE_DRV_120, 9, 40, 0, FDRIVE_RATE_300K, },
2289 { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2290 { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
2291 /* 320 kB 5"1/4 floppy disks */
2292 { FDRIVE_DRV_120, 8, 40, 1, FDRIVE_RATE_250K, },
2293 { FDRIVE_DRV_120, 8, 40, 0, FDRIVE_RATE_250K, },
2294 /* 360 kB must match 5"1/4 better than 3"1/2... */
2295 { FDRIVE_DRV_144, 9, 80, 0, FDRIVE_RATE_250K, },
2296 /* end */
2297 { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
2300 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2301 int *max_track, int *last_sect,
2302 FDriveType drive_in, FDriveType *drive,
2303 FDriveRate *rate)
2305 const FDFormat *parse;
2306 uint64_t nb_sectors, size;
2307 int i, first_match, match;
2309 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2310 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2311 /* User defined disk */
2312 *rate = FDRIVE_RATE_500K;
2313 } else {
2314 bdrv_get_geometry(bs, &nb_sectors);
2315 match = -1;
2316 first_match = -1;
2317 for (i = 0; ; i++) {
2318 parse = &fd_formats[i];
2319 if (parse->drive == FDRIVE_DRV_NONE) {
2320 break;
2322 if (drive_in == parse->drive ||
2323 drive_in == FDRIVE_DRV_NONE) {
2324 size = (parse->max_head + 1) * parse->max_track *
2325 parse->last_sect;
2326 if (nb_sectors == size) {
2327 match = i;
2328 break;
2330 if (first_match == -1) {
2331 first_match = i;
2335 if (match == -1) {
2336 if (first_match == -1) {
2337 match = 1;
2338 } else {
2339 match = first_match;
2341 parse = &fd_formats[match];
2343 *nb_heads = parse->max_head + 1;
2344 *max_track = parse->max_track;
2345 *last_sect = parse->last_sect;
2346 *drive = parse->drive;
2347 *rate = parse->rate;
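/* Illustrative sketch (not part of the original file): how the fd_formats
 * table above is matched against a raw image. A standard 1.44 MB image has
 * 2880 sectors, which equals (max_head + 1) * max_track * last_sect for the
 * first table entry: (1 + 1) * 80 * 18. */
static int example_is_144_image(uint64_t nb_sectors)
{
    return nb_sectors == (1 + 1) * 80 * 18;  /* 2880 sectors = 1.44 MB */
}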
2351 int bdrv_get_translation_hint(BlockDriverState *bs)
2353 return bs->translation;
2356 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2357 BlockErrorAction on_write_error)
2359 bs->on_read_error = on_read_error;
2360 bs->on_write_error = on_write_error;
2363 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2365 return is_read ? bs->on_read_error : bs->on_write_error;
2368 int bdrv_is_read_only(BlockDriverState *bs)
2370 return bs->read_only;
2373 int bdrv_is_sg(BlockDriverState *bs)
2375 return bs->sg;
2378 int bdrv_enable_write_cache(BlockDriverState *bs)
2380 return bs->enable_write_cache;
2383 int bdrv_is_encrypted(BlockDriverState *bs)
2385 if (bs->backing_hd && bs->backing_hd->encrypted)
2386 return 1;
2387 return bs->encrypted;
2390 int bdrv_key_required(BlockDriverState *bs)
2392 BlockDriverState *backing_hd = bs->backing_hd;
2394 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2395 return 1;
2396 return (bs->encrypted && !bs->valid_key);
2399 int bdrv_set_key(BlockDriverState *bs, const char *key)
2401 int ret;
2402 if (bs->backing_hd && bs->backing_hd->encrypted) {
2403 ret = bdrv_set_key(bs->backing_hd, key);
2404 if (ret < 0)
2405 return ret;
2406 if (!bs->encrypted)
2407 return 0;
2409 if (!bs->encrypted) {
2410 return -EINVAL;
2411 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2412 return -ENOMEDIUM;
2414 ret = bs->drv->bdrv_set_key(bs, key);
2415 if (ret < 0) {
2416 bs->valid_key = 0;
2417 } else if (!bs->valid_key) {
2418 bs->valid_key = 1;
2419 /* call the change callback now, we skipped it on open */
2420 bdrv_dev_change_media_cb(bs, true);
2422 return ret;
2425 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2427 if (!bs->drv) {
2428 buf[0] = '\0';
2429 } else {
2430 pstrcpy(buf, buf_size, bs->drv->format_name);
2434 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2435 void *opaque)
2437 BlockDriver *drv;
2439 QLIST_FOREACH(drv, &bdrv_drivers, list) {
2440 it(opaque, drv->format_name);
2444 BlockDriverState *bdrv_find(const char *name)
2446 BlockDriverState *bs;
2448 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2449 if (!strcmp(name, bs->device_name)) {
2450 return bs;
2453 return NULL;
2456 BlockDriverState *bdrv_next(BlockDriverState *bs)
2458 if (!bs) {
2459 return QTAILQ_FIRST(&bdrv_states);
2461 return QTAILQ_NEXT(bs, list);
2464 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2466 BlockDriverState *bs;
2468 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2469 it(opaque, bs);
2473 const char *bdrv_get_device_name(BlockDriverState *bs)
2475 return bs->device_name;
2478 int bdrv_get_flags(BlockDriverState *bs)
2480 return bs->open_flags;
2483 void bdrv_flush_all(void)
2485 BlockDriverState *bs;
2487 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2488 bdrv_flush(bs);
2492 int bdrv_has_zero_init(BlockDriverState *bs)
2494 assert(bs->drv);
2496 if (bs->drv->bdrv_has_zero_init) {
2497 return bs->drv->bdrv_has_zero_init(bs);
2500 return 1;
2503 typedef struct BdrvCoIsAllocatedData {
2504 BlockDriverState *bs;
2505 int64_t sector_num;
2506 int nb_sectors;
2507 int *pnum;
2508 int ret;
2509 bool done;
2510 } BdrvCoIsAllocatedData;
2513 * Returns true iff the specified sector is present in the disk image. Drivers
2514 * not implementing the functionality are assumed to not support backing files,
2515 * hence all their sectors are reported as allocated.
2517 * If 'sector_num' is beyond the end of the disk image the return value is 0
2518 * and 'pnum' is set to 0.
2520 * 'pnum' is set to the number of sectors (including and immediately following
2521 * the specified sector) that are known to be in the same
2522 * allocated/unallocated state.
2524 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2525 * beyond the end of the disk image it will be clamped.
2527 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2528 int nb_sectors, int *pnum)
2530 int64_t n;
2532 if (sector_num >= bs->total_sectors) {
2533 *pnum = 0;
2534 return 0;
2537 n = bs->total_sectors - sector_num;
2538 if (n < nb_sectors) {
2539 nb_sectors = n;
2542 if (!bs->drv->bdrv_co_is_allocated) {
2543 *pnum = nb_sectors;
2544 return 1;
2547 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2550 /* Coroutine wrapper for bdrv_is_allocated() */
2551 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2553 BdrvCoIsAllocatedData *data = opaque;
2554 BlockDriverState *bs = data->bs;
2556 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2557 data->pnum);
2558 data->done = true;
2562 * Synchronous wrapper around bdrv_co_is_allocated().
2564 * See bdrv_co_is_allocated() for details.
2566 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2567 int *pnum)
2569 Coroutine *co;
2570 BdrvCoIsAllocatedData data = {
2571 .bs = bs,
2572 .sector_num = sector_num,
2573 .nb_sectors = nb_sectors,
2574 .pnum = pnum,
2575 .done = false,
2578 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2579 qemu_coroutine_enter(co, &data);
2580 while (!data.done) {
2581 qemu_aio_wait();
2583 return data.ret;
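/* Illustrative sketch (not part of the original file): a hypothetical caller
 * walking a whole image with the synchronous wrapper above. Each call groups
 * a run of sectors sharing the same allocated/unallocated state into pnum. */
static void example_walk_allocation(BlockDriverState *bs, int64_t total_sectors)
{
    int64_t sector_num = 0;

    while (sector_num < total_sectors) {
        int pnum;
        int nb = 65536;
        int allocated;

        if (sector_num + nb > total_sectors) {
            nb = total_sectors - sector_num;
        }
        allocated = bdrv_is_allocated(bs, sector_num, nb, &pnum);
        if (pnum == 0) {
            break; /* sector_num is past the end of the image */
        }
        /* [sector_num, sector_num + pnum) is uniformly (un)allocated */
        (void)allocated;
        sector_num += pnum;
    }
}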
2587 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2589 * Return true if the given sector is allocated in any image between
2590 * BASE and TOP (inclusive). BASE can be NULL to check if the given
2591 * sector is allocated in any image of the chain. Return false otherwise.
2593 * 'pnum' is set to the number of sectors (including and immediately following
2594 * the specified sector) that are known to be in the same
2595 * allocated/unallocated state.
2598 int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
2599 BlockDriverState *base,
2600 int64_t sector_num,
2601 int nb_sectors, int *pnum)
2603 BlockDriverState *intermediate;
2604 int ret, n = nb_sectors;
2606 intermediate = top;
2607 while (intermediate && intermediate != base) {
2608 int pnum_inter;
2609 ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
2610 &pnum_inter);
2611 if (ret < 0) {
2612 return ret;
2613 } else if (ret) {
2614 *pnum = pnum_inter;
2615 return 1;
2619 * [sector_num, nb_sectors] is unallocated on top but intermediate
2620 * might have
2622 * [sector_num+x, nr_sectors] allocated.
2624 if (n > pnum_inter) {
2625 n = pnum_inter;
2628 intermediate = intermediate->backing_hd;
2631 *pnum = n;
2632 return 0;
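/* Illustrative sketch (not part of the original file): for a hypothetical
 * chain base <- mid <- top, a sector written only in 'mid' is reported as
 * allocated by the chain-walking variant above, while bdrv_co_is_allocated()
 * on 'top' inspects that single layer alone. Must run in coroutine context. */
static int coroutine_fn example_chain_check(BlockDriverState *top,
                                            BlockDriverState *base,
                                            int64_t sector_num)
{
    int pnum;

    /* base == NULL would scan the entire backing chain */
    return bdrv_co_is_allocated_above(top, base, sector_num, 1, &pnum);
}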
2635 BlockInfoList *qmp_query_block(Error **errp)
2637 BlockInfoList *head = NULL, *cur_item = NULL;
2638 BlockDriverState *bs;
2640 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2641 BlockInfoList *info = g_malloc0(sizeof(*info));
2643 info->value = g_malloc0(sizeof(*info->value));
2644 info->value->device = g_strdup(bs->device_name);
2645 info->value->type = g_strdup("unknown");
2646 info->value->locked = bdrv_dev_is_medium_locked(bs);
2647 info->value->removable = bdrv_dev_has_removable_media(bs);
2649 if (bdrv_dev_has_removable_media(bs)) {
2650 info->value->has_tray_open = true;
2651 info->value->tray_open = bdrv_dev_is_tray_open(bs);
2654 if (bdrv_iostatus_is_enabled(bs)) {
2655 info->value->has_io_status = true;
2656 info->value->io_status = bs->iostatus;
2659 if (bs->drv) {
2660 info->value->has_inserted = true;
2661 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2662 info->value->inserted->file = g_strdup(bs->filename);
2663 info->value->inserted->ro = bs->read_only;
2664 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2665 info->value->inserted->encrypted = bs->encrypted;
2666 if (bs->backing_file[0]) {
2667 info->value->inserted->has_backing_file = true;
2668 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2671 if (bs->io_limits_enabled) {
2672 info->value->inserted->bps =
2673 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2674 info->value->inserted->bps_rd =
2675 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2676 info->value->inserted->bps_wr =
2677 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2678 info->value->inserted->iops =
2679 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2680 info->value->inserted->iops_rd =
2681 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2682 info->value->inserted->iops_wr =
2683 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2687 /* XXX: waiting for the qapi to support GSList */
2688 if (!cur_item) {
2689 head = cur_item = info;
2690 } else {
2691 cur_item->next = info;
2692 cur_item = info;
2696 return head;
2699 /* Consider exposing this as a full fledged QMP command */
2700 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2702 BlockStats *s;
2704 s = g_malloc0(sizeof(*s));
2706 if (bs->device_name[0]) {
2707 s->has_device = true;
2708 s->device = g_strdup(bs->device_name);
2711 s->stats = g_malloc0(sizeof(*s->stats));
2712 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2713 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2714 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2715 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2716 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2717 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2718 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2719 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2720 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2722 if (bs->file) {
2723 s->has_parent = true;
2724 s->parent = qmp_query_blockstat(bs->file, NULL);
2727 return s;
2730 BlockStatsList *qmp_query_blockstats(Error **errp)
2732 BlockStatsList *head = NULL, *cur_item = NULL;
2733 BlockDriverState *bs;
2735 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2736 BlockStatsList *info = g_malloc0(sizeof(*info));
2737 info->value = qmp_query_blockstat(bs, NULL);
2739 /* XXX: waiting for the qapi to support GSList */
2740 if (!cur_item) {
2741 head = cur_item = info;
2742 } else {
2743 cur_item->next = info;
2744 cur_item = info;
2748 return head;
2751 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2753 if (bs->backing_hd && bs->backing_hd->encrypted)
2754 return bs->backing_file;
2755 else if (bs->encrypted)
2756 return bs->filename;
2757 else
2758 return NULL;
2761 void bdrv_get_backing_filename(BlockDriverState *bs,
2762 char *filename, int filename_size)
2764 pstrcpy(filename, filename_size, bs->backing_file);
2767 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2768 const uint8_t *buf, int nb_sectors)
2770 BlockDriver *drv = bs->drv;
2771 if (!drv)
2772 return -ENOMEDIUM;
2773 if (!drv->bdrv_write_compressed)
2774 return -ENOTSUP;
2775 if (bdrv_check_request(bs, sector_num, nb_sectors))
2776 return -EIO;
2778 if (bs->dirty_bitmap) {
2779 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2782 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2785 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2787 BlockDriver *drv = bs->drv;
2788 if (!drv)
2789 return -ENOMEDIUM;
2790 if (!drv->bdrv_get_info)
2791 return -ENOTSUP;
2792 memset(bdi, 0, sizeof(*bdi));
2793 return drv->bdrv_get_info(bs, bdi);
2796 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2797 int64_t pos, int size)
2799 BlockDriver *drv = bs->drv;
2800 if (!drv)
2801 return -ENOMEDIUM;
2802 if (drv->bdrv_save_vmstate)
2803 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2804 if (bs->file)
2805 return bdrv_save_vmstate(bs->file, buf, pos, size);
2806 return -ENOTSUP;
2809 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2810 int64_t pos, int size)
2812 BlockDriver *drv = bs->drv;
2813 if (!drv)
2814 return -ENOMEDIUM;
2815 if (drv->bdrv_load_vmstate)
2816 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2817 if (bs->file)
2818 return bdrv_load_vmstate(bs->file, buf, pos, size);
2819 return -ENOTSUP;
2822 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2824 BlockDriver *drv = bs->drv;
2826 if (!drv || !drv->bdrv_debug_event) {
2827 return;
2830 return drv->bdrv_debug_event(bs, event);
2834 /**************************************************************/
2835 /* handling of snapshots */
2837 int bdrv_can_snapshot(BlockDriverState *bs)
2839 BlockDriver *drv = bs->drv;
2840 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2841 return 0;
2844 if (!drv->bdrv_snapshot_create) {
2845 if (bs->file != NULL) {
2846 return bdrv_can_snapshot(bs->file);
2848 return 0;
2851 return 1;
2854 int bdrv_is_snapshot(BlockDriverState *bs)
2856 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2859 BlockDriverState *bdrv_snapshots(void)
2861 BlockDriverState *bs;
2863 if (bs_snapshots) {
2864 return bs_snapshots;
2867 bs = NULL;
2868 while ((bs = bdrv_next(bs))) {
2869 if (bdrv_can_snapshot(bs)) {
2870 bs_snapshots = bs;
2871 return bs;
2874 return NULL;
2877 int bdrv_snapshot_create(BlockDriverState *bs,
2878 QEMUSnapshotInfo *sn_info)
2880 BlockDriver *drv = bs->drv;
2881 if (!drv)
2882 return -ENOMEDIUM;
2883 if (drv->bdrv_snapshot_create)
2884 return drv->bdrv_snapshot_create(bs, sn_info);
2885 if (bs->file)
2886 return bdrv_snapshot_create(bs->file, sn_info);
2887 return -ENOTSUP;
2890 int bdrv_snapshot_goto(BlockDriverState *bs,
2891 const char *snapshot_id)
2893 BlockDriver *drv = bs->drv;
2894 int ret, open_ret;
2896 if (!drv)
2897 return -ENOMEDIUM;
2898 if (drv->bdrv_snapshot_goto)
2899 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2901 if (bs->file) {
2902 drv->bdrv_close(bs);
2903 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2904 open_ret = drv->bdrv_open(bs, bs->open_flags);
2905 if (open_ret < 0) {
2906 bdrv_delete(bs->file);
2907 bs->drv = NULL;
2908 return open_ret;
2910 return ret;
2913 return -ENOTSUP;
2916 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2918 BlockDriver *drv = bs->drv;
2919 if (!drv)
2920 return -ENOMEDIUM;
2921 if (drv->bdrv_snapshot_delete)
2922 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2923 if (bs->file)
2924 return bdrv_snapshot_delete(bs->file, snapshot_id);
2925 return -ENOTSUP;
2928 int bdrv_snapshot_list(BlockDriverState *bs,
2929 QEMUSnapshotInfo **psn_info)
2931 BlockDriver *drv = bs->drv;
2932 if (!drv)
2933 return -ENOMEDIUM;
2934 if (drv->bdrv_snapshot_list)
2935 return drv->bdrv_snapshot_list(bs, psn_info);
2936 if (bs->file)
2937 return bdrv_snapshot_list(bs->file, psn_info);
2938 return -ENOTSUP;
2941 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2942 const char *snapshot_name)
2944 BlockDriver *drv = bs->drv;
2945 if (!drv) {
2946 return -ENOMEDIUM;
2948 if (!bs->read_only) {
2949 return -EINVAL;
2951 if (drv->bdrv_snapshot_load_tmp) {
2952 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2954 return -ENOTSUP;
2957 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2958 const char *backing_file)
2960 if (!bs->drv) {
2961 return NULL;
2964 if (bs->backing_hd) {
2965 if (strcmp(bs->backing_file, backing_file) == 0) {
2966 return bs->backing_hd;
2967 } else {
2968 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2972 return NULL;
2975 #define NB_SUFFIXES 4
2977 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2979 static const char suffixes[NB_SUFFIXES] = "KMGT";
2980 int64_t base;
2981 int i;
2983 if (size <= 999) {
2984 snprintf(buf, buf_size, "%" PRId64, size);
2985 } else {
2986 base = 1024;
2987 for(i = 0; i < NB_SUFFIXES; i++) {
2988 if (size < (10 * base)) {
2989 snprintf(buf, buf_size, "%0.1f%c",
2990 (double)size / base,
2991 suffixes[i]);
2992 break;
2993 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
2994 snprintf(buf, buf_size, "%" PRId64 "%c",
2995 ((size + (base >> 1)) / base),
2996 suffixes[i]);
2997 break;
2999 base = base * 1024;
3002 return buf;
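/* Illustrative sketch (not part of the original file): sample outputs of the
 * helper above. Sizes up to 999 print verbatim; above that, one decimal is
 * shown below 10*base and a rounded integer below 1000*base. */
static void example_sizes(void)
{
    char buf[16];

    get_human_readable_size(buf, sizeof(buf), 512);     /* "512"  */
    get_human_readable_size(buf, sizeof(buf), 1536);    /* "1.5K" */
    get_human_readable_size(buf, sizeof(buf), 1048576); /* "1.0M" */
}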
3005 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
3007 char buf1[128], date_buf[128], clock_buf[128];
3008 #ifdef _WIN32
3009 struct tm *ptm;
3010 #else
3011 struct tm tm;
3012 #endif
3013 time_t ti;
3014 int64_t secs;
3016 if (!sn) {
3017 snprintf(buf, buf_size,
3018 "%-10s%-20s%7s%20s%15s",
3019 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
3020 } else {
3021 ti = sn->date_sec;
3022 #ifdef _WIN32
3023 ptm = localtime(&ti);
3024 strftime(date_buf, sizeof(date_buf),
3025 "%Y-%m-%d %H:%M:%S", ptm);
3026 #else
3027 localtime_r(&ti, &tm);
3028 strftime(date_buf, sizeof(date_buf),
3029 "%Y-%m-%d %H:%M:%S", &tm);
3030 #endif
3031 secs = sn->vm_clock_nsec / 1000000000;
3032 snprintf(clock_buf, sizeof(clock_buf),
3033 "%02d:%02d:%02d.%03d",
3034 (int)(secs / 3600),
3035 (int)((secs / 60) % 60),
3036 (int)(secs % 60),
3037 (int)((sn->vm_clock_nsec / 1000000) % 1000));
3038 snprintf(buf, buf_size,
3039 "%-10s%-20s%7s%20s%15s",
3040 sn->id_str, sn->name,
3041 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
3042 date_buf,
3043 clock_buf);
3045 return buf;
3048 /**************************************************************/
3049 /* async I/Os */
3051 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3052 QEMUIOVector *qiov, int nb_sectors,
3053 BlockDriverCompletionFunc *cb, void *opaque)
3055 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3057 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3058 cb, opaque, false);
3061 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3062 QEMUIOVector *qiov, int nb_sectors,
3063 BlockDriverCompletionFunc *cb, void *opaque)
3065 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3067 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3068 cb, opaque, true);
3072 typedef struct MultiwriteCB {
3073 int error;
3074 int num_requests;
3075 int num_callbacks;
3076 struct {
3077 BlockDriverCompletionFunc *cb;
3078 void *opaque;
3079 QEMUIOVector *free_qiov;
3080 } callbacks[];
3081 } MultiwriteCB;
3083 static void multiwrite_user_cb(MultiwriteCB *mcb)
3085 int i;
3087 for (i = 0; i < mcb->num_callbacks; i++) {
3088 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
3089 if (mcb->callbacks[i].free_qiov) {
3090 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
3092 g_free(mcb->callbacks[i].free_qiov);
3096 static void multiwrite_cb(void *opaque, int ret)
3098 MultiwriteCB *mcb = opaque;
3100 trace_multiwrite_cb(mcb, ret);
3102 if (ret < 0 && !mcb->error) {
3103 mcb->error = ret;
3106 mcb->num_requests--;
3107 if (mcb->num_requests == 0) {
3108 multiwrite_user_cb(mcb);
3109 g_free(mcb);
3113 static int multiwrite_req_compare(const void *a, const void *b)
3115 const BlockRequest *req1 = a, *req2 = b;
3118 * Note that we can't simply subtract req2->sector from req1->sector
3119 * here as that could overflow the return value.
3121 if (req1->sector > req2->sector) {
3122 return 1;
3123 } else if (req1->sector < req2->sector) {
3124 return -1;
3125 } else {
3126 return 0;
3131 * Takes a bunch of requests and tries to merge them. Returns the number of
3132 * requests that remain after merging.
3134 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3135 int num_reqs, MultiwriteCB *mcb)
3137 int i, outidx;
3139 // Sort requests by start sector
3140 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3142 // Check whether adjacent requests are sequential or overlapping. If so,
3143 // combine them into a single larger request.
3144 outidx = 0;
3145 for (i = 1; i < num_reqs; i++) {
3146 int merge = 0;
3147 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3149 // Handle exactly sequential writes and overlapping writes.
3150 if (reqs[i].sector <= oldreq_last) {
3151 merge = 1;
3154 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3155 merge = 0;
3158 if (merge) {
3159 size_t size;
3160 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3161 qemu_iovec_init(qiov,
3162 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3164 // Add the first request to the merged one. If the requests are
3165 // overlapping, drop the last sectors of the first request.
3166 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3167 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3169 // We should not need to add any zeros between the two requests
3170 assert (reqs[i].sector <= oldreq_last);
3172 // Add the second request
3173 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3175 reqs[outidx].nb_sectors = qiov->size >> 9;
3176 reqs[outidx].qiov = qiov;
3178 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3179 } else {
3180 outidx++;
3181 reqs[outidx].sector = reqs[i].sector;
3182 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3183 reqs[outidx].qiov = reqs[i].qiov;
3187 return outidx + 1;
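/* Illustrative worked example (not part of the original file): given two
 * hypothetical requests covering sectors [0,8) and [4,12), sorting leaves
 * them in place and oldreq_last = 0 + 8 = 8. Since reqs[1].sector (4) <= 8,
 * they merge: the first qiov is truncated to (4 - 0) << 9 = 2048 bytes, the
 * second (4096 bytes) is appended whole, and the merged request covers
 * sectors [0,12) with nb_sectors = 6144 >> 9 = 12. */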
3191 * Submit multiple AIO write requests at once.
3193 * On success, the function returns 0 and all requests in the reqs array have
3194 * been submitted. In the error case this function returns -1, and individual
3195 * requests may or may not have been submitted yet. In particular, this means
3196 * that the callback will be called for some of the requests but not for
3197 * others. The caller must check the error field of each BlockRequest to wait
3198 * for the right callbacks (if error != 0, no callback will be called).
3200 * The implementation may modify the contents of the reqs array, e.g. to merge
3201 * requests. However, the fields opaque and error are left unmodified as they
3202 * are used to signal failure for a single request to the caller.
3204 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3206 MultiwriteCB *mcb;
3207 int i;
3209 /* don't submit writes if we don't have a medium */
3210 if (bs->drv == NULL) {
3211 for (i = 0; i < num_reqs; i++) {
3212 reqs[i].error = -ENOMEDIUM;
3214 return -1;
3217 if (num_reqs == 0) {
3218 return 0;
3221 // Create MultiwriteCB structure
3222 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3223 mcb->num_requests = 0;
3224 mcb->num_callbacks = num_reqs;
3226 for (i = 0; i < num_reqs; i++) {
3227 mcb->callbacks[i].cb = reqs[i].cb;
3228 mcb->callbacks[i].opaque = reqs[i].opaque;
3231 // Check for mergeable requests
3232 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3234 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3236 /* Run the aio requests. */
3237 mcb->num_requests = num_reqs;
3238 for (i = 0; i < num_reqs; i++) {
3239 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3240 reqs[i].nb_sectors, multiwrite_cb, mcb);
3243 return 0;
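/* Illustrative sketch (not part of the original file): a hypothetical caller
 * submitting two writes as one batch. 'my_cb' and the iovecs are assumed to
 * be set up elsewhere; per-request errors land in reqs[i].error. */
static int example_multiwrite(BlockDriverState *bs,
                              QEMUIOVector *qiov0, QEMUIOVector *qiov1,
                              BlockDriverCompletionFunc *my_cb, void *opaque)
{
    BlockRequest reqs[2] = {
        { .sector = 0, .nb_sectors = 8, .qiov = qiov0,
          .cb = my_cb, .opaque = opaque },
        { .sector = 8, .nb_sectors = 8, .qiov = qiov1,
          .cb = my_cb, .opaque = opaque },
    };

    return bdrv_aio_multiwrite(bs, reqs, 2);
}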
3246 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3248 acb->pool->cancel(acb);
3251 /* block I/O throttling */
3252 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3253 bool is_write, double elapsed_time, uint64_t *wait)
3255 uint64_t bps_limit = 0;
3256 double bytes_limit, bytes_base, bytes_res;
3257 double slice_time, wait_time;
3259 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3260 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3261 } else if (bs->io_limits.bps[is_write]) {
3262 bps_limit = bs->io_limits.bps[is_write];
3263 } else {
3264 if (wait) {
3265 *wait = 0;
3268 return false;
3271 slice_time = bs->slice_end - bs->slice_start;
3272 slice_time /= (NANOSECONDS_PER_SECOND);
3273 bytes_limit = bps_limit * slice_time;
3274 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3275 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3276 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3279 /* bytes_base: the bytes of data which have already been read/written,
3280 * obtained from the historical statistics.
3281 * bytes_res: the remaining bytes of data which need to be read/written.
3282 * (bytes_base + bytes_res) / bps_limit: used to calculate
3283 * the total time for completing reading/writing all data.
3285 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3287 if (bytes_base + bytes_res <= bytes_limit) {
3288 if (wait) {
3289 *wait = 0;
3292 return false;
3295 /* Calc approx time to dispatch */
3296 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3298 /* When the I/O rate at runtime exceeds the limits,
3299 * bs->slice_end needs to be extended so that the current statistics
3300 * can be kept until the timer fires; the extension is increased and tuned
3301 * based on experimental results.
3303 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3304 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3305 if (wait) {
3306 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3309 return true;
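/* Illustrative worked example (not part of the original file), with
 * hypothetical numbers: for bps_limit = 1048576 (1 MB/s) and a 0.1 s slice,
 * bytes_limit = 104857.6. If bytes_base = 90000 has already been transferred
 * in this slice and a 32 KB request arrives (bytes_res = 32768), then
 * 90000 + 32768 > bytes_limit, so the request is throttled for roughly
 * (90000 + 32768) / 1048576 - elapsed_time ~= 0.117 s - elapsed_time. */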
3312 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3313 double elapsed_time, uint64_t *wait)
3315 uint64_t iops_limit = 0;
3316 double ios_limit, ios_base;
3317 double slice_time, wait_time;
3319 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3320 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3321 } else if (bs->io_limits.iops[is_write]) {
3322 iops_limit = bs->io_limits.iops[is_write];
3323 } else {
3324 if (wait) {
3325 *wait = 0;
3328 return false;
3331 slice_time = bs->slice_end - bs->slice_start;
3332 slice_time /= (NANOSECONDS_PER_SECOND);
3333 ios_limit = iops_limit * slice_time;
3334 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3335 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3336 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3339 if (ios_base + 1 <= ios_limit) {
3340 if (wait) {
3341 *wait = 0;
3344 return false;
3347 /* Calc approx time to dispatch */
3348 wait_time = (ios_base + 1) / iops_limit;
3349 if (wait_time > elapsed_time) {
3350 wait_time = wait_time - elapsed_time;
3351 } else {
3352 wait_time = 0;
3355 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3356 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3357 if (wait) {
3358 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3361 return true;
3364 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3365 bool is_write, int64_t *wait)
3367 int64_t now, max_wait;
3368 uint64_t bps_wait = 0, iops_wait = 0;
3369 double elapsed_time;
3370 int bps_ret, iops_ret;
3372 now = qemu_get_clock_ns(vm_clock);
3373 if ((bs->slice_start < now)
3374 && (bs->slice_end > now)) {
3375 bs->slice_end = now + bs->slice_time;
3376 } else {
3377 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3378 bs->slice_start = now;
3379 bs->slice_end = now + bs->slice_time;
3381 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3382 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3384 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3385 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3388 elapsed_time = now - bs->slice_start;
3389 elapsed_time /= (NANOSECONDS_PER_SECOND);
3391 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3392 is_write, elapsed_time, &bps_wait);
3393 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3394 elapsed_time, &iops_wait);
3395 if (bps_ret || iops_ret) {
3396 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3397 if (wait) {
3398 *wait = max_wait;
3401 now = qemu_get_clock_ns(vm_clock);
3402 if (bs->slice_end < now + max_wait) {
3403 bs->slice_end = now + max_wait;
3406 return true;
3409 if (wait) {
3410 *wait = 0;
3413 return false;
3416 /**************************************************************/
3417 /* async block device emulation */
3419 typedef struct BlockDriverAIOCBSync {
3420 BlockDriverAIOCB common;
3421 QEMUBH *bh;
3422 int ret;
3423 /* vector translation state */
3424 QEMUIOVector *qiov;
3425 uint8_t *bounce;
3426 int is_write;
3427 } BlockDriverAIOCBSync;
3429 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3431 BlockDriverAIOCBSync *acb =
3432 container_of(blockacb, BlockDriverAIOCBSync, common);
3433 qemu_bh_delete(acb->bh);
3434 acb->bh = NULL;
3435 qemu_aio_release(acb);
3438 static AIOPool bdrv_em_aio_pool = {
3439 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3440 .cancel = bdrv_aio_cancel_em,
3443 static void bdrv_aio_bh_cb(void *opaque)
3445 BlockDriverAIOCBSync *acb = opaque;
3447 if (!acb->is_write)
3448 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3449 qemu_vfree(acb->bounce);
3450 acb->common.cb(acb->common.opaque, acb->ret);
3451 qemu_bh_delete(acb->bh);
3452 acb->bh = NULL;
3453 qemu_aio_release(acb);
3456 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3457 int64_t sector_num,
3458 QEMUIOVector *qiov,
3459 int nb_sectors,
3460 BlockDriverCompletionFunc *cb,
3461 void *opaque,
3462 int is_write)
3465 BlockDriverAIOCBSync *acb;
3467 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3468 acb->is_write = is_write;
3469 acb->qiov = qiov;
3470 acb->bounce = qemu_blockalign(bs, qiov->size);
3471 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3473 if (is_write) {
3474 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3475 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3476 } else {
3477 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3480 qemu_bh_schedule(acb->bh);
3482 return &acb->common;
3485 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3486 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3487 BlockDriverCompletionFunc *cb, void *opaque)
3489 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3492 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3493 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3494 BlockDriverCompletionFunc *cb, void *opaque)
3496 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3500 typedef struct BlockDriverAIOCBCoroutine {
3501 BlockDriverAIOCB common;
3502 BlockRequest req;
3503 bool is_write;
3504 QEMUBH* bh;
3505 } BlockDriverAIOCBCoroutine;
3507 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3509 qemu_aio_flush();
3512 static AIOPool bdrv_em_co_aio_pool = {
3513 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3514 .cancel = bdrv_aio_co_cancel_em,
3517 static void bdrv_co_em_bh(void *opaque)
3519 BlockDriverAIOCBCoroutine *acb = opaque;
3521 acb->common.cb(acb->common.opaque, acb->req.error);
3522 qemu_bh_delete(acb->bh);
3523 qemu_aio_release(acb);
3526 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3527 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3529 BlockDriverAIOCBCoroutine *acb = opaque;
3530 BlockDriverState *bs = acb->common.bs;
3532 if (!acb->is_write) {
3533 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3534 acb->req.nb_sectors, acb->req.qiov, 0);
3535 } else {
3536 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3537 acb->req.nb_sectors, acb->req.qiov, 0);
3540 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3541 qemu_bh_schedule(acb->bh);
3544 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3545 int64_t sector_num,
3546 QEMUIOVector *qiov,
3547 int nb_sectors,
3548 BlockDriverCompletionFunc *cb,
3549 void *opaque,
3550 bool is_write)
3552 Coroutine *co;
3553 BlockDriverAIOCBCoroutine *acb;
3555 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3556 acb->req.sector = sector_num;
3557 acb->req.nb_sectors = nb_sectors;
3558 acb->req.qiov = qiov;
3559 acb->is_write = is_write;
3561 co = qemu_coroutine_create(bdrv_co_do_rw);
3562 qemu_coroutine_enter(co, acb);
3564 return &acb->common;
3567 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3569 BlockDriverAIOCBCoroutine *acb = opaque;
3570 BlockDriverState *bs = acb->common.bs;
3572 acb->req.error = bdrv_co_flush(bs);
3573 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3574 qemu_bh_schedule(acb->bh);
3577 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3578 BlockDriverCompletionFunc *cb, void *opaque)
3580 trace_bdrv_aio_flush(bs, opaque);
3582 Coroutine *co;
3583 BlockDriverAIOCBCoroutine *acb;
3585 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3586 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3587 qemu_coroutine_enter(co, acb);
3589 return &acb->common;
3592 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3594 BlockDriverAIOCBCoroutine *acb = opaque;
3595 BlockDriverState *bs = acb->common.bs;
3597 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3598 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3599 qemu_bh_schedule(acb->bh);
3602 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3603 int64_t sector_num, int nb_sectors,
3604 BlockDriverCompletionFunc *cb, void *opaque)
3606 Coroutine *co;
3607 BlockDriverAIOCBCoroutine *acb;
3609 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3611 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3612 acb->req.sector = sector_num;
3613 acb->req.nb_sectors = nb_sectors;
3614 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3615 qemu_coroutine_enter(co, acb);
3617 return &acb->common;
3620 void bdrv_init(void)
3622 module_call_init(MODULE_INIT_BLOCK);
3625 void bdrv_init_with_whitelist(void)
3627 use_bdrv_whitelist = 1;
3628 bdrv_init();
3631 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3632 BlockDriverCompletionFunc *cb, void *opaque)
3634 BlockDriverAIOCB *acb;
3636 if (pool->free_aiocb) {
3637 acb = pool->free_aiocb;
3638 pool->free_aiocb = acb->next;
3639 } else {
3640 acb = g_malloc0(pool->aiocb_size);
3641 acb->pool = pool;
3643 acb->bs = bs;
3644 acb->cb = cb;
3645 acb->opaque = opaque;
3646 return acb;
3649 void qemu_aio_release(void *p)
3651 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3652 AIOPool *pool = acb->pool;
3653 acb->next = pool->free_aiocb;
3654 pool->free_aiocb = acb;
3657 /**************************************************************/
3658 /* Coroutine block device emulation */
3660 typedef struct CoroutineIOCompletion {
3661 Coroutine *coroutine;
3662 int ret;
3663 } CoroutineIOCompletion;
3665 static void bdrv_co_io_em_complete(void *opaque, int ret)
3667 CoroutineIOCompletion *co = opaque;
3669 co->ret = ret;
3670 qemu_coroutine_enter(co->coroutine, NULL);
3673 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3674 int nb_sectors, QEMUIOVector *iov,
3675 bool is_write)
3677 CoroutineIOCompletion co = {
3678 .coroutine = qemu_coroutine_self(),
3680 BlockDriverAIOCB *acb;
3682 if (is_write) {
3683 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3684 bdrv_co_io_em_complete, &co);
3685 } else {
3686 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3687 bdrv_co_io_em_complete, &co);
3690 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3691 if (!acb) {
3692 return -EIO;
3694 qemu_coroutine_yield();
3696 return co.ret;
3699 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3700 int64_t sector_num, int nb_sectors,
3701 QEMUIOVector *iov)
3703 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3706 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3707 int64_t sector_num, int nb_sectors,
3708 QEMUIOVector *iov)
3710 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3713 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3715 RwCo *rwco = opaque;
3717 rwco->ret = bdrv_co_flush(rwco->bs);
3720 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3722 int ret;
3724 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3725 return 0;
3728 /* Write back cached data to the OS even with cache=unsafe */
3729 if (bs->drv->bdrv_co_flush_to_os) {
3730 ret = bs->drv->bdrv_co_flush_to_os(bs);
3731 if (ret < 0) {
3732 return ret;
3736 /* But don't actually force it to the disk with cache=unsafe */
3737 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3738 return 0;
3741 if (bs->drv->bdrv_co_flush_to_disk) {
3742 ret = bs->drv->bdrv_co_flush_to_disk(bs);
3743 } else if (bs->drv->bdrv_aio_flush) {
3744 BlockDriverAIOCB *acb;
3745 CoroutineIOCompletion co = {
3746 .coroutine = qemu_coroutine_self(),
3749 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3750 if (acb == NULL) {
3751 ret = -EIO;
3752 } else {
3753 qemu_coroutine_yield();
3754 ret = co.ret;
3756 } else {
3758 * Some block drivers always operate in either writethrough or unsafe
3759 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3760 * know how the server works (because the behaviour is hardcoded or
3761 * depends on server-side configuration), so we can't ensure that
3762 * everything is safe on disk. Returning an error doesn't work because
3763 * that would break guests even if the server operates in writethrough
3764 * mode.
3766 * Let's hope the user knows what he's doing.
3768 ret = 0;
3770 if (ret < 0) {
3771 return ret;
3774 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3775 * in the case of cache=unsafe, so there are no useless flushes.
3777 return bdrv_co_flush(bs->file);
3780 void bdrv_invalidate_cache(BlockDriverState *bs)
3782 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3783 bs->drv->bdrv_invalidate_cache(bs);
3787 void bdrv_invalidate_cache_all(void)
3789 BlockDriverState *bs;
3791 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3792 bdrv_invalidate_cache(bs);
3796 void bdrv_clear_incoming_migration_all(void)
3798 BlockDriverState *bs;
3800 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3801 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3805 int bdrv_flush(BlockDriverState *bs)
3807 Coroutine *co;
3808 RwCo rwco = {
3809 .bs = bs,
3810 .ret = NOT_DONE,
3813 if (qemu_in_coroutine()) {
3814 /* Fast-path if already in coroutine context */
3815 bdrv_flush_co_entry(&rwco);
3816 } else {
3817 co = qemu_coroutine_create(bdrv_flush_co_entry);
3818 qemu_coroutine_enter(co, &rwco);
3819 while (rwco.ret == NOT_DONE) {
3820 qemu_aio_wait();
3824 return rwco.ret;
3827 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3829 RwCo *rwco = opaque;
3831 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3834 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3835 int nb_sectors)
3837 if (!bs->drv) {
3838 return -ENOMEDIUM;
3839 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3840 return -EIO;
3841 } else if (bs->read_only) {
3842 return -EROFS;
3843 } else if (bs->drv->bdrv_co_discard) {
3844 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3845 } else if (bs->drv->bdrv_aio_discard) {
3846 BlockDriverAIOCB *acb;
3847 CoroutineIOCompletion co = {
3848 .coroutine = qemu_coroutine_self(),
3851 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3852 bdrv_co_io_em_complete, &co);
3853 if (acb == NULL) {
3854 return -EIO;
3855 } else {
3856 qemu_coroutine_yield();
3857 return co.ret;
3859 } else {
3860 return 0;
3864 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3866 Coroutine *co;
3867 RwCo rwco = {
3868 .bs = bs,
3869 .sector_num = sector_num,
3870 .nb_sectors = nb_sectors,
3871 .ret = NOT_DONE,
3874 if (qemu_in_coroutine()) {
3875 /* Fast-path if already in coroutine context */
3876 bdrv_discard_co_entry(&rwco);
3877 } else {
3878 co = qemu_coroutine_create(bdrv_discard_co_entry);
3879 qemu_coroutine_enter(co, &rwco);
3880 while (rwco.ret == NOT_DONE) {
3881 qemu_aio_wait();
3885 return rwco.ret;
3888 /**************************************************************/
3889 /* removable device support */
3892 * Return TRUE if the media is present
3894 int bdrv_is_inserted(BlockDriverState *bs)
3896 BlockDriver *drv = bs->drv;
3898 if (!drv)
3899 return 0;
3900 if (!drv->bdrv_is_inserted)
3901 return 1;
3902 return drv->bdrv_is_inserted(bs);
3906 * Return whether the media changed since the last call to this
3907 * function, or -ENOTSUP if we don't know. Most drivers don't know.
3909 int bdrv_media_changed(BlockDriverState *bs)
3911 BlockDriver *drv = bs->drv;
3913 if (drv && drv->bdrv_media_changed) {
3914 return drv->bdrv_media_changed(bs);
3916 return -ENOTSUP;
3920 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3922 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3924 BlockDriver *drv = bs->drv;
3926 if (drv && drv->bdrv_eject) {
3927 drv->bdrv_eject(bs, eject_flag);
3930 if (bs->device_name[0] != '\0') {
3931 bdrv_emit_qmp_eject_event(bs, eject_flag);
3936 * Lock or unlock the media (if it is locked, the user won't be able
3937 * to eject it manually).
3939 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3941 BlockDriver *drv = bs->drv;
3943 trace_bdrv_lock_medium(bs, locked);
3945 if (drv && drv->bdrv_lock_medium) {
3946 drv->bdrv_lock_medium(bs, locked);
3950 /* needed for generic scsi interface */
3952 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3954 BlockDriver *drv = bs->drv;
3956 if (drv && drv->bdrv_ioctl)
3957 return drv->bdrv_ioctl(bs, req, buf);
3958 return -ENOTSUP;
3961 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3962 unsigned long int req, void *buf,
3963 BlockDriverCompletionFunc *cb, void *opaque)
3965 BlockDriver *drv = bs->drv;
3967 if (drv && drv->bdrv_aio_ioctl)
3968 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3969 return NULL;
3972 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3974 bs->buffer_alignment = align;
3977 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3979 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3982 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3984 int64_t bitmap_size;
3986 bs->dirty_count = 0;
3987 if (enable) {
3988 if (!bs->dirty_bitmap) {
3989 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3990 BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
3991 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
3993 bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
3995 } else {
3996 if (bs->dirty_bitmap) {
3997 g_free(bs->dirty_bitmap);
3998 bs->dirty_bitmap = NULL;
4003 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
4005 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
4007 if (bs->dirty_bitmap &&
4008 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
4009 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
4010 (1UL << (chunk % (sizeof(unsigned long) * 8))));
4011 } else {
4012 return 0;
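/* Illustrative worked example (not part of the original file), assuming
 * BDRV_SECTORS_PER_DIRTY_CHUNK is 2048 and 64-bit longs: sector 1000000
 * falls in chunk 1000000 / 2048 = 488, whose flag is bit 488 % 64 = 40
 * of bitmap word 488 / 64 = 7. */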
4016 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
4017 int nr_sectors)
4019 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
4022 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
4024 return bs->dirty_count;
4027 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
4029 assert(bs->in_use != in_use);
4030 bs->in_use = in_use;
4033 int bdrv_in_use(BlockDriverState *bs)
4035 return bs->in_use;
4038 void bdrv_iostatus_enable(BlockDriverState *bs)
4040 bs->iostatus_enabled = true;
4041 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4044 /* The I/O status is only enabled if the drive explicitly
4045 * enables it _and_ the VM is configured to stop on errors */
4046 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4048 return (bs->iostatus_enabled &&
4049 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
4050 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
4051 bs->on_read_error == BLOCK_ERR_STOP_ANY));
4054 void bdrv_iostatus_disable(BlockDriverState *bs)
4056 bs->iostatus_enabled = false;
4059 void bdrv_iostatus_reset(BlockDriverState *bs)
4061 if (bdrv_iostatus_is_enabled(bs)) {
4062 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4066 /* XXX: Today this is set by device models because it makes the implementation
4067 quite simple. However, the block layer knows about the error, so it's
4068 possible to implement this without device models being involved */
4069 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
4071 if (bdrv_iostatus_is_enabled(bs) &&
4072 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
4073 assert(error >= 0);
4074 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
4075 BLOCK_DEVICE_IO_STATUS_FAILED;
4079 void
4080 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4081 enum BlockAcctType type)
4083 assert(type < BDRV_MAX_IOTYPE);
4085 cookie->bytes = bytes;
4086 cookie->start_time_ns = get_clock();
4087 cookie->type = type;
4090 void
4091 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4093 assert(cookie->type < BDRV_MAX_IOTYPE);
4095 bs->nr_bytes[cookie->type] += cookie->bytes;
4096 bs->nr_ops[cookie->type]++;
4097 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
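/* Illustrative sketch (not part of the original file): how a device model
 * might bracket a single read with the accounting helpers above; the actual
 * I/O in the middle is elided. */
static void example_account_read(BlockDriverState *bs, int64_t bytes)
{
    BlockAcctCookie cookie;

    bdrv_acct_start(bs, &cookie, bytes, BDRV_ACCT_READ);
    /* ... issue the read and wait for completion ... */
    bdrv_acct_done(bs, &cookie);
}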
4100 int bdrv_img_create(const char *filename, const char *fmt,
4101 const char *base_filename, const char *base_fmt,
4102 char *options, uint64_t img_size, int flags)
4104 QEMUOptionParameter *param = NULL, *create_options = NULL;
4105 QEMUOptionParameter *backing_fmt, *backing_file, *size;
4106 BlockDriverState *bs = NULL;
4107 BlockDriver *drv, *proto_drv;
4108 BlockDriver *backing_drv = NULL;
4109 int ret = 0;
4111 /* Find driver and parse its options */
4112 drv = bdrv_find_format(fmt);
4113 if (!drv) {
4114 error_report("Unknown file format '%s'", fmt);
4115 ret = -EINVAL;
4116 goto out;
4119 proto_drv = bdrv_find_protocol(filename);
4120 if (!proto_drv) {
4121 error_report("Unknown protocol '%s'", filename);
4122 ret = -EINVAL;
4123 goto out;
4126 create_options = append_option_parameters(create_options,
4127 drv->create_options);
4128 create_options = append_option_parameters(create_options,
4129 proto_drv->create_options);
4131 /* Create parameter list with default values */
4132 param = parse_option_parameters("", create_options, param);
4134 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4136 /* Parse -o options */
4137 if (options) {
4138 param = parse_option_parameters(options, create_options, param);
4139 if (param == NULL) {
4140 error_report("Invalid options for file format '%s'.", fmt);
4141 ret = -EINVAL;
4142 goto out;
4146 if (base_filename) {
4147 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4148 base_filename)) {
4149 error_report("Backing file not supported for file format '%s'",
4150 fmt);
4151 ret = -EINVAL;
4152 goto out;
4156 if (base_fmt) {
4157 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4158 error_report("Backing file format not supported for file "
4159 "format '%s'", fmt);
4160 ret = -EINVAL;
4161 goto out;
4165 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4166 if (backing_file && backing_file->value.s) {
4167 if (!strcmp(filename, backing_file->value.s)) {
4168 error_report("Error: Trying to create an image with the "
4169 "same filename as the backing file");
4170 ret = -EINVAL;
4171 goto out;
4175 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4176 if (backing_fmt && backing_fmt->value.s) {
4177 backing_drv = bdrv_find_format(backing_fmt->value.s);
4178 if (!backing_drv) {
4179 error_report("Unknown backing file format '%s'",
4180 backing_fmt->value.s);
4181 ret = -EINVAL;
4182 goto out;
4186 // The size for the image must always be specified, with one exception:
4187 // If we are using a backing file, we can obtain the size from there
4188 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4189 if (size && size->value.n == -1) {
4190 if (backing_file && backing_file->value.s) {
4191 uint64_t size;
4192 char buf[32];
4193 int back_flags;
4195 /* backing files always opened read-only */
4196 back_flags =
4197 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4199 bs = bdrv_new("");
4201 ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
4202 if (ret < 0) {
4203 error_report("Could not open '%s'", backing_file->value.s);
4204 goto out;
4206 bdrv_get_geometry(bs, &size);
4207 size *= 512;
4209 snprintf(buf, sizeof(buf), "%" PRId64, size);
4210 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4211 } else {
4212 error_report("Image creation needs a size parameter");
4213 ret = -EINVAL;
4214 goto out;
4218 printf("Formatting '%s', fmt=%s ", filename, fmt);
4219 print_option_parameters(param);
4220 puts("");
4222 ret = bdrv_create(drv, filename, param);
4224 if (ret < 0) {
4225 if (ret == -ENOTSUP) {
4226 error_report("Formatting or formatting option not supported for "
4227 "file format '%s'", fmt);
4228 } else if (ret == -EFBIG) {
4229 error_report("The image size is too large for file format '%s'",
4230 fmt);
4231 } else {
4232 error_report("%s: error while creating %s: %s", filename, fmt,
4233 strerror(-ret));
4237 out:
4238 free_option_parameters(create_options);
4239 free_option_parameters(param);
4241 if (bs) {
4242 bdrv_delete(bs);
4245 return ret;
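/* Illustrative sketch (not part of the original file): creating a 1 GiB
 * image with default options; the filename and format are hypothetical. */
static int example_create_image(void)
{
    return bdrv_img_create("test.qcow2", "qcow2",
                           NULL, NULL,   /* no backing file or backing fmt */
                           NULL,         /* no extra -o options */
                           (uint64_t)1024 * 1024 * 1024, 0);
}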
4248 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4249 int64_t speed, BlockDriverCompletionFunc *cb,
4250 void *opaque, Error **errp)
4252 BlockJob *job;
4254 if (bs->job || bdrv_in_use(bs)) {
4255 error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
4256 return NULL;
4258 bdrv_set_in_use(bs, 1);
4260 job = g_malloc0(job_type->instance_size);
4261 job->job_type = job_type;
4262 job->bs = bs;
4263 job->cb = cb;
4264 job->opaque = opaque;
4265 job->busy = true;
4266 bs->job = job;
4268 /* Only set speed when necessary to avoid NotSupported error */
4269 if (speed != 0) {
4270 Error *local_err = NULL;
4272 block_job_set_speed(job, speed, &local_err);
4273 if (error_is_set(&local_err)) {
4274 bs->job = NULL;
4275 g_free(job);
4276 bdrv_set_in_use(bs, 0);
4277 error_propagate(errp, local_err);
4278 return NULL;
4281 return job;
4284 void block_job_complete(BlockJob *job, int ret)
4286 BlockDriverState *bs = job->bs;
4288 assert(bs->job == job);
4289 job->cb(job->opaque, ret);
4290 bs->job = NULL;
4291 g_free(job);
4292 bdrv_set_in_use(bs, 0);
4295 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
4297 Error *local_err = NULL;
4299 if (!job->job_type->set_speed) {
4300 error_set(errp, QERR_NOT_SUPPORTED);
4301 return;
4303 job->job_type->set_speed(job, speed, &local_err);
4304 if (error_is_set(&local_err)) {
4305 error_propagate(errp, local_err);
4306 return;
4309 job->speed = speed;
4312 void block_job_cancel(BlockJob *job)
4314 job->cancelled = true;
4315 if (job->co && !job->busy) {
4316 qemu_coroutine_enter(job->co, NULL);
4320 bool block_job_is_cancelled(BlockJob *job)
4322 return job->cancelled;
4325 struct BlockCancelData {
4326 BlockJob *job;
4327 BlockDriverCompletionFunc *cb;
4328 void *opaque;
4329 bool cancelled;
4330 int ret;
4333 static void block_job_cancel_cb(void *opaque, int ret)
4335 struct BlockCancelData *data = opaque;
4337 data->cancelled = block_job_is_cancelled(data->job);
4338 data->ret = ret;
4339 data->cb(data->opaque, ret);
4342 int block_job_cancel_sync(BlockJob *job)
4344 struct BlockCancelData data;
4345 BlockDriverState *bs = job->bs;
4347 assert(bs->job == job);
4349 /* Set up our own callback to store the result and chain to
4350 * the original callback.
4352 data.job = job;
4353 data.cb = job->cb;
4354 data.opaque = job->opaque;
4355 data.ret = -EINPROGRESS;
4356 job->cb = block_job_cancel_cb;
4357 job->opaque = &data;
4358 block_job_cancel(job);
4359 while (data.ret == -EINPROGRESS) {
4360 qemu_aio_wait();
4362 return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
4365 void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
4367 /* Check cancellation *before* setting busy = false, too! */
4368 if (!block_job_is_cancelled(job)) {
4369 job->busy = false;
4370 co_sleep_ns(clock, ns);
4371 job->busy = true;