qemu.git / block/io.c
/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);
void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore) {
            continue;
        }
        if (c->role->drained_begin) {
            c->role->drained_begin(c);
        }
    }
}

void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore) {
            continue;
        }
        if (c->role->drained_end) {
            c->role->drained_end(c);
        }
    }
}
static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}
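
/* Illustrative example (not part of the original file): how the merge rules
 * above combine a parent's limits with a child's. MIN_NON_ZERO() treats 0 as
 * "no limit", so a zero on either side simply defers to the other value,
 * while alignments take the stricter (larger) of the two. The numbers below
 * are made up for illustration:
 *
 *   dst.max_transfer = 0 (no limit), src.max_transfer = 1 MiB
 *     -> merged max_transfer = 1 MiB      (MIN_NON_ZERO picks the non-zero value)
 *
 *   dst.opt_mem_alignment = 512, src.opt_mem_alignment = 4096
 *     -> merged opt_mem_alignment = 4096  (MAX picks the stricter alignment)
 */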
void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}
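
/* Illustrative sketch (not part of the original file): because the flag is a
 * reference count, nested users can enable and disable copy-on-read
 * independently; the feature only turns off once every enable has been paired
 * with a disable. The helper below is hypothetical and exists only to show
 * the pairing.
 */
#if 0
static void example_nested_copy_on_read(BlockDriverState *bs)
{
    bdrv_enable_copy_on_read(bs);   /* user A: count 0 -> 1, COR active   */
    bdrv_enable_copy_on_read(bs);   /* user B: count 1 -> 2, still active */

    bdrv_disable_copy_on_read(bs);  /* user A done: count 2 -> 1, active  */
    bdrv_disable_copy_on_read(bs);  /* user B done: count 1 -> 0, off     */
}
#endif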
typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    BdrvChild *parent;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done before reading bs->wakeup. */
    atomic_mb_set(&data->done, true);
    bdrv_wakeup(bs);
}
/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive)
{
    BdrvChild *child, *tmp;
    BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
        (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
    bdrv_coroutine_enter(bs, data.co);
    BDRV_POLL_WHILE(bs, !data.done);

    if (recursive) {
        QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
            bdrv_drain_invoke(child->bs, begin, true);
        }
    }
}
static bool bdrv_drain_recurse(BlockDriverState *bs)
{
    BdrvChild *child, *tmp;
    bool waited;

    /* Wait for drained requests to finish */
    waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);

    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
        BlockDriverState *bs = child->bs;
        bool in_main_loop =
            qemu_get_current_aio_context() == qemu_get_aio_context();
        assert(bs->refcnt > 0);
        if (in_main_loop) {
            /* In case the recursive bdrv_drain_recurse processes a
             * block_job_defer_to_main_loop BH and modifies the graph,
             * let's hold a reference to bs until we are done.
             *
             * IOThread doesn't have such a BH, and it is not safe to call
             * bdrv_unref without BQL, so skip doing it there.
             */
            bdrv_ref(bs);
        }
        waited |= bdrv_drain_recurse(bs);
        if (in_main_loop) {
            bdrv_unref(bs);
        }
    }

    return waited;
}
static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    bdrv_dec_in_flight(bs);
    if (data->begin) {
        bdrv_do_drained_begin(bs, data->recursive, data->parent);
    } else {
        bdrv_do_drained_end(bs, data->recursive, data->parent);
    }

    data->done = true;
    aio_co_wake(co);
}
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued from
     * qemu_co_queue_run_restart(). */
    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
    };
    bdrv_inc_in_flight(bs);
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}
void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                           BdrvChild *parent)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent);
        return;
    }

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent);
    bdrv_drain_invoke(bs, true, false);
    bdrv_drain_recurse(bs);

    if (recursive) {
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child);
        }
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL);
}
void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                         BdrvChild *parent)
{
    BdrvChild *child, *next;
    int old_quiesce_counter;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent);
        return;
    }
    assert(bs->quiesce_counter > 0);
    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false, false);
    bdrv_parent_drained_end(bs, parent);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_end(child->bs, true, child);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, false, NULL);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, true, NULL);
}
void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child);
    }
}
/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 *
 * Only this BlockDriverState's AioContext is run, so in-flight requests must
 * not depend on events in other AioContexts.  In that case, use
 * bdrv_drain_all() instead.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}
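
/* Illustrative sketch (not part of the original file): the typical way callers
 * use the drain machinery above is as a "drained section" around an operation
 * that must not race with in-flight guest I/O (graph changes, snapshots, and
 * so on). The operation below is hypothetical.
 */
#if 0
static void example_drained_section(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);     /* quiesce parents, drain in-flight requests */

    /* ... perform the operation that requires no concurrent I/O ... */

    bdrv_drained_end(bs);       /* resume normal request processing */
}
#endif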
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool waited = true;
    BlockDriverState *bs;
    BdrvNextIterator it;
    GSList *aio_ctxs = NULL, *ctx;

    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
     * nodes in several different AioContexts, so make sure we're in the main
     * context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        /* Stop things in parent-to-child order */
        aio_context_acquire(aio_context);
        aio_disable_external(aio_context);
        bdrv_parent_drained_begin(bs, NULL);
        bdrv_drain_invoke(bs, true, true);
        aio_context_release(aio_context);

        if (!g_slist_find(aio_ctxs, aio_context)) {
            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
        }
    }

    /* Note that completion of an asynchronous I/O operation can trigger any
     * number of other I/O operations on other devices---for example a
     * coroutine can submit an I/O request to another device in response to
     * request completion.  Therefore we must keep looping until there was no
     * more activity rather than simply draining each device independently.
     */
    while (waited) {
        waited = false;

        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
            AioContext *aio_context = ctx->data;

            aio_context_acquire(aio_context);
            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                if (aio_context == bdrv_get_aio_context(bs)) {
                    waited |= bdrv_drain_recurse(bs);
                }
            }
            aio_context_release(aio_context);
        }
    }

    g_slist_free(aio_ctxs);
}
void bdrv_drain_all_end(void)
{
    BlockDriverState *bs;
    BdrvNextIterator it;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        /* Re-enable things in child-to-parent order */
        aio_context_acquire(aio_context);
        bdrv_drain_invoke(bs, false, true);
        bdrv_parent_drained_end(bs, NULL);
        aio_enable_external(aio_context);
        aio_context_release(aio_context);
    }
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes,
                                  enum BdrvTrackedRequestType type)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}
static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
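
/* Illustrative example (not part of the original file): how the rounding above
 * widens a request's overlap window. With req->offset = 4097, req->bytes = 100
 * and align = 512 (made-up numbers):
 *
 *   overlap_offset = 4097 & ~511                = 4096
 *   overlap_bytes  = ROUND_UP(4197, 512) - 4096 = 4608 - 4096 = 512
 *
 * so the request serialises against anything touching [4096, 4608), i.e. the
 * whole aligned block containing the unaligned request.
 */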
/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}
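
/* Illustrative example (not part of the original file): with a 64 KiB cluster
 * size (made-up value), a request at offset 65540 of 1000 bytes is widened to
 * the containing cluster:
 *
 *   cluster_offset = QEMU_ALIGN_DOWN(65540, 65536)              = 65536
 *   cluster_bytes  = QEMU_ALIGN_UP(65540 - 65536 + 1000, 65536) = 65536
 *
 * i.e. copy-on-read and similar callers operate on [65536, 131072) so that no
 * additional backing-file I/O is needed when the cluster is allocated.
 */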
static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}
572 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
573 int64_t offset, unsigned int bytes)
575 /* aaaa bbbb */
576 if (offset >= req->overlap_offset + req->overlap_bytes) {
577 return false;
579 /* bbbb aaaa */
580 if (req->overlap_offset >= offset + bytes) {
581 return false;
583 return true;
586 void bdrv_inc_in_flight(BlockDriverState *bs)
588 atomic_inc(&bs->in_flight);
591 void bdrv_wakeup(BlockDriverState *bs)
593 aio_wait_kick(bdrv_get_aio_wait(bs));
596 void bdrv_dec_in_flight(BlockDriverState *bs)
598 atomic_dec(&bs->in_flight);
599 bdrv_wakeup(bs);
602 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
604 BlockDriverState *bs = self->bs;
605 BdrvTrackedRequest *req;
606 bool retry;
607 bool waited = false;
609 if (!atomic_read(&bs->serialising_in_flight)) {
610 return false;
613 do {
614 retry = false;
615 qemu_co_mutex_lock(&bs->reqs_lock);
616 QLIST_FOREACH(req, &bs->tracked_requests, list) {
617 if (req == self || (!req->serialising && !self->serialising)) {
618 continue;
620 if (tracked_request_overlaps(req, self->overlap_offset,
621 self->overlap_bytes))
623 /* Hitting this means there was a reentrant request, for
624 * example, a block driver issuing nested requests. This must
625 * never happen since it means deadlock.
627 assert(qemu_coroutine_self() != req->co);
629 /* If the request is already (indirectly) waiting for us, or
630 * will wait for us as soon as it wakes up, then just go on
631 * (instead of producing a deadlock in the former case). */
632 if (!req->waiting_for) {
633 self->waiting_for = req;
634 qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
635 self->waiting_for = NULL;
636 retry = true;
637 waited = true;
638 break;
642 qemu_co_mutex_unlock(&bs->reqs_lock);
643 } while (retry);
645 return waited;
648 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
649 size_t size)
651 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
652 return -EIO;
655 if (!bdrv_is_inserted(bs)) {
656 return -ENOMEDIUM;
659 if (offset < 0) {
660 return -EIO;
663 return 0;
666 typedef struct RwCo {
667 BdrvChild *child;
668 int64_t offset;
669 QEMUIOVector *qiov;
670 bool is_write;
671 int ret;
672 BdrvRequestFlags flags;
673 } RwCo;
675 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
677 RwCo *rwco = opaque;
679 if (!rwco->is_write) {
680 rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
681 rwco->qiov->size, rwco->qiov,
682 rwco->flags);
683 } else {
684 rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
685 rwco->qiov->size, rwco->qiov,
686 rwco->flags);
691 * Process a vectored synchronous request using coroutines
693 static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
694 QEMUIOVector *qiov, bool is_write,
695 BdrvRequestFlags flags)
697 Coroutine *co;
698 RwCo rwco = {
699 .child = child,
700 .offset = offset,
701 .qiov = qiov,
702 .is_write = is_write,
703 .ret = NOT_DONE,
704 .flags = flags,
707 if (qemu_in_coroutine()) {
708 /* Fast-path if already in coroutine context */
709 bdrv_rw_co_entry(&rwco);
710 } else {
711 co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
712 bdrv_coroutine_enter(child->bs, co);
713 BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
715 return rwco.ret;
719 * Process a synchronous request using coroutines
721 static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
722 int nb_sectors, bool is_write, BdrvRequestFlags flags)
724 QEMUIOVector qiov;
725 struct iovec iov = {
726 .iov_base = (void *)buf,
727 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
730 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
731 return -EINVAL;
734 qemu_iovec_init_external(&qiov, &iov, 1);
735 return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
736 &qiov, is_write, flags);
739 /* return < 0 if error. See bdrv_write() for the return codes */
740 int bdrv_read(BdrvChild *child, int64_t sector_num,
741 uint8_t *buf, int nb_sectors)
743 return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
746 /* Return < 0 if error. Important errors are:
747 -EIO generic I/O error (may happen for all errors)
748 -ENOMEDIUM No media inserted.
749 -EINVAL Invalid sector number or nb_sectors
750 -EACCES Trying to write a read-only device
752 int bdrv_write(BdrvChild *child, int64_t sector_num,
753 const uint8_t *buf, int nb_sectors)
755 return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
758 int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
759 int bytes, BdrvRequestFlags flags)
761 QEMUIOVector qiov;
762 struct iovec iov = {
763 .iov_base = NULL,
764 .iov_len = bytes,
767 qemu_iovec_init_external(&qiov, &iov, 1);
768 return bdrv_prwv_co(child, offset, &qiov, true,
769 BDRV_REQ_ZERO_WRITE | flags);
773 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
774 * The operation is sped up by checking the block status and only writing
775 * zeroes to the device if they currently do not return zeroes. Optional
776 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
777 * BDRV_REQ_FUA).
779 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
781 int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
783 int ret;
784 int64_t target_size, bytes, offset = 0;
785 BlockDriverState *bs = child->bs;
787 target_size = bdrv_getlength(bs);
788 if (target_size < 0) {
789 return target_size;
792 for (;;) {
793 bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
794 if (bytes <= 0) {
795 return 0;
797 ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
798 if (ret < 0) {
799 error_report("error getting block status at offset %" PRId64 ": %s",
800 offset, strerror(-ret));
801 return ret;
803 if (ret & BDRV_BLOCK_ZERO) {
804 offset += bytes;
805 continue;
807 ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
808 if (ret < 0) {
809 error_report("error writing zeroes at offset %" PRId64 ": %s",
810 offset, strerror(-ret));
811 return ret;
813 offset += bytes;
817 int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
819 int ret;
821 ret = bdrv_prwv_co(child, offset, qiov, false, 0);
822 if (ret < 0) {
823 return ret;
826 return qiov->size;
829 int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
831 QEMUIOVector qiov;
832 struct iovec iov = {
833 .iov_base = (void *)buf,
834 .iov_len = bytes,
837 if (bytes < 0) {
838 return -EINVAL;
841 qemu_iovec_init_external(&qiov, &iov, 1);
842 return bdrv_preadv(child, offset, &qiov);
845 int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
847 int ret;
849 ret = bdrv_prwv_co(child, offset, qiov, true, 0);
850 if (ret < 0) {
851 return ret;
854 return qiov->size;
857 int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
859 QEMUIOVector qiov;
860 struct iovec iov = {
861 .iov_base = (void *) buf,
862 .iov_len = bytes,
865 if (bytes < 0) {
866 return -EINVAL;
869 qemu_iovec_init_external(&qiov, &iov, 1);
870 return bdrv_pwritev(child, offset, &qiov);
874 * Writes to the file and ensures that no writes are reordered across this
875 * request (acts as a barrier)
877 * Returns 0 on success, -errno in error cases.
879 int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
880 const void *buf, int count)
882 int ret;
884 ret = bdrv_pwrite(child, offset, buf, count);
885 if (ret < 0) {
886 return ret;
889 ret = bdrv_flush(child->bs);
890 if (ret < 0) {
891 return ret;
894 return 0;
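
/* Illustrative sketch (not part of the original file): typical use of the
 * synchronous byte-based helpers above to read and update a 512-byte header.
 * The layout and offsets are hypothetical; error handling is abbreviated.
 */
#if 0
static int example_update_header(BdrvChild *child)
{
    uint8_t header[512];
    int ret;

    ret = bdrv_pread(child, 0, header, sizeof(header));   /* read header */
    if (ret < 0) {
        return ret;
    }

    header[0] ^= 1;                                        /* modify it */

    /* Write it back and flush, so the update is not reordered with later
     * writes (bdrv_pwrite_sync acts as a barrier). */
    return bdrv_pwrite_sync(child, 0, header, sizeof(header));
}
#endif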
897 typedef struct CoroutineIOCompletion {
898 Coroutine *coroutine;
899 int ret;
900 } CoroutineIOCompletion;
902 static void bdrv_co_io_em_complete(void *opaque, int ret)
904 CoroutineIOCompletion *co = opaque;
906 co->ret = ret;
907 aio_co_wake(co->coroutine);
910 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
911 uint64_t offset, uint64_t bytes,
912 QEMUIOVector *qiov, int flags)
914 BlockDriver *drv = bs->drv;
915 int64_t sector_num;
916 unsigned int nb_sectors;
918 assert(!(flags & ~BDRV_REQ_MASK));
920 if (!drv) {
921 return -ENOMEDIUM;
924 if (drv->bdrv_co_preadv) {
925 return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
928 sector_num = offset >> BDRV_SECTOR_BITS;
929 nb_sectors = bytes >> BDRV_SECTOR_BITS;
931 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
932 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
933 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
935 if (drv->bdrv_co_readv) {
936 return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
937 } else {
938 BlockAIOCB *acb;
939 CoroutineIOCompletion co = {
940 .coroutine = qemu_coroutine_self(),
943 acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
944 bdrv_co_io_em_complete, &co);
945 if (acb == NULL) {
946 return -EIO;
947 } else {
948 qemu_coroutine_yield();
949 return co.ret;
954 static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
955 uint64_t offset, uint64_t bytes,
956 QEMUIOVector *qiov, int flags)
958 BlockDriver *drv = bs->drv;
959 int64_t sector_num;
960 unsigned int nb_sectors;
961 int ret;
963 assert(!(flags & ~BDRV_REQ_MASK));
965 if (!drv) {
966 return -ENOMEDIUM;
969 if (drv->bdrv_co_pwritev) {
970 ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
971 flags & bs->supported_write_flags);
972 flags &= ~bs->supported_write_flags;
973 goto emulate_flags;
976 sector_num = offset >> BDRV_SECTOR_BITS;
977 nb_sectors = bytes >> BDRV_SECTOR_BITS;
979 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
980 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
981 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
983 if (drv->bdrv_co_writev_flags) {
984 ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
985 flags & bs->supported_write_flags);
986 flags &= ~bs->supported_write_flags;
987 } else if (drv->bdrv_co_writev) {
988 assert(!bs->supported_write_flags);
989 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
990 } else {
991 BlockAIOCB *acb;
992 CoroutineIOCompletion co = {
993 .coroutine = qemu_coroutine_self(),
996 acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
997 bdrv_co_io_em_complete, &co);
998 if (acb == NULL) {
999 ret = -EIO;
1000 } else {
1001 qemu_coroutine_yield();
1002 ret = co.ret;
1006 emulate_flags:
1007 if (ret == 0 && (flags & BDRV_REQ_FUA)) {
1008 ret = bdrv_co_flush(bs);
1011 return ret;
1014 static int coroutine_fn
1015 bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
1016 uint64_t bytes, QEMUIOVector *qiov)
1018 BlockDriver *drv = bs->drv;
1020 if (!drv) {
1021 return -ENOMEDIUM;
1024 if (!drv->bdrv_co_pwritev_compressed) {
1025 return -ENOTSUP;
1028 return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
1031 static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
1032 int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
1034 BlockDriverState *bs = child->bs;
1036 /* Perform I/O through a temporary buffer so that users who scribble over
1037 * their read buffer while the operation is in progress do not end up
1038 * modifying the image file. This is critical for zero-copy guest I/O
1039 * where anything might happen inside guest memory.
1041 void *bounce_buffer;
1043 BlockDriver *drv = bs->drv;
1044 struct iovec iov;
1045 QEMUIOVector local_qiov;
1046 int64_t cluster_offset;
1047 int64_t cluster_bytes;
1048 size_t skip_bytes;
1049 int ret;
1050 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
1051 BDRV_REQUEST_MAX_BYTES);
1052 unsigned int progress = 0;
1054 if (!drv) {
1055 return -ENOMEDIUM;
1058 /* FIXME We cannot require callers to have write permissions when all they
1059 * are doing is a read request. If we did things right, write permissions
1060 * would be obtained anyway, but internally by the copy-on-read code. As
1061 * long as it is implemented here rather than in a separate filter driver,
1062 * the copy-on-read code doesn't have its own BdrvChild, however, for which
1063 * it could request permissions. Therefore we have to bypass the permission
1064 * system for the moment. */
1065 // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1067 /* Cover entire cluster so no additional backing file I/O is required when
1068 * allocating cluster in the image file. Note that this value may exceed
1069 * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
1070 * is one reason we loop rather than doing it all at once.
1072 bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
1073 skip_bytes = offset - cluster_offset;
1075 trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
1076 cluster_offset, cluster_bytes);
1078 bounce_buffer = qemu_try_blockalign(bs,
1079 MIN(MIN(max_transfer, cluster_bytes),
1080 MAX_BOUNCE_BUFFER));
1081 if (bounce_buffer == NULL) {
1082 ret = -ENOMEM;
1083 goto err;
1086 while (cluster_bytes) {
1087 int64_t pnum;
1089 ret = bdrv_is_allocated(bs, cluster_offset,
1090 MIN(cluster_bytes, max_transfer), &pnum);
1091 if (ret < 0) {
1092 /* Safe to treat errors in querying allocation as if
1093 * unallocated; we'll probably fail again soon on the
1094 * read, but at least that will set a decent errno.
1096 pnum = MIN(cluster_bytes, max_transfer);
1099 assert(skip_bytes < pnum);
1101 if (ret <= 0) {
1102 /* Must copy-on-read; use the bounce buffer */
1103 iov.iov_base = bounce_buffer;
1104 iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
1105 qemu_iovec_init_external(&local_qiov, &iov, 1);
1107 ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
1108 &local_qiov, 0);
1109 if (ret < 0) {
1110 goto err;
1113 bdrv_debug_event(bs, BLKDBG_COR_WRITE);
1114 if (drv->bdrv_co_pwrite_zeroes &&
1115 buffer_is_zero(bounce_buffer, pnum)) {
1116 /* FIXME: Should we (perhaps conditionally) be setting
1117 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
1118 * that still correctly reads as zero? */
1119 ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 0);
1120 } else {
1121 /* This does not change the data on the disk, it is not
1122 * necessary to flush even in cache=writethrough mode.
1124 ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
1125 &local_qiov, 0);
1128 if (ret < 0) {
1129 /* It might be okay to ignore write errors for guest
1130 * requests. If this is a deliberate copy-on-read
1131 * then we don't want to ignore the error. Simply
1132 * report it in all cases.
1134 goto err;
1137 qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
1138 pnum - skip_bytes);
1139 } else {
1140 /* Read directly into the destination */
1141 qemu_iovec_init(&local_qiov, qiov->niov);
1142 qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
1143 ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
1144 &local_qiov, 0);
1145 qemu_iovec_destroy(&local_qiov);
1146 if (ret < 0) {
1147 goto err;
1151 cluster_offset += pnum;
1152 cluster_bytes -= pnum;
1153 progress += pnum - skip_bytes;
1154 skip_bytes = 0;
1156 ret = 0;
1158 err:
1159 qemu_vfree(bounce_buffer);
1160 return ret;
1164 * Forwards an already correctly aligned request to the BlockDriver. This
1165 * handles copy on read, zeroing after EOF, and fragmentation of large
1166 * reads; any other features must be implemented by the caller.
1168 static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
1169 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1170 int64_t align, QEMUIOVector *qiov, int flags)
1172 BlockDriverState *bs = child->bs;
1173 int64_t total_bytes, max_bytes;
1174 int ret = 0;
1175 uint64_t bytes_remaining = bytes;
1176 int max_transfer;
1178 assert(is_power_of_2(align));
1179 assert((offset & (align - 1)) == 0);
1180 assert((bytes & (align - 1)) == 0);
1181 assert(!qiov || bytes == qiov->size);
1182 assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1183 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1184 align);
1186 /* TODO: We would need a per-BDS .supported_read_flags and
1187 * potential fallback support, if we ever implement any read flags
1188 * to pass through to drivers. For now, there aren't any
1189 * passthrough flags. */
1190 assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));
1192 /* Handle Copy on Read and associated serialisation */
1193 if (flags & BDRV_REQ_COPY_ON_READ) {
1194 /* If we touch the same cluster it counts as an overlap. This
1195 * guarantees that allocating writes will be serialized and not race
1196 * with each other for the same cluster. For example, in copy-on-read
1197 * it ensures that the CoR read and write operations are atomic and
1198 * guest writes cannot interleave between them. */
1199 mark_request_serialising(req, bdrv_get_cluster_size(bs));
1202 if (!(flags & BDRV_REQ_NO_SERIALISING)) {
1203 wait_serialising_requests(req);
1206 if (flags & BDRV_REQ_COPY_ON_READ) {
1207 int64_t pnum;
1209 ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
1210 if (ret < 0) {
1211 goto out;
1214 if (!ret || pnum != bytes) {
1215 ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
1216 goto out;
1220 /* Forward the request to the BlockDriver, possibly fragmenting it */
1221 total_bytes = bdrv_getlength(bs);
1222 if (total_bytes < 0) {
1223 ret = total_bytes;
1224 goto out;
1227 max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
1228 if (bytes <= max_bytes && bytes <= max_transfer) {
1229 ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
1230 goto out;
1233 while (bytes_remaining) {
1234 int num;
1236 if (max_bytes) {
1237 QEMUIOVector local_qiov;
1239 num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
1240 assert(num);
1241 qemu_iovec_init(&local_qiov, qiov->niov);
1242 qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
1244 ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
1245 num, &local_qiov, 0);
1246 max_bytes -= num;
1247 qemu_iovec_destroy(&local_qiov);
1248 } else {
1249 num = bytes_remaining;
1250 ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
1251 bytes_remaining);
1253 if (ret < 0) {
1254 goto out;
1256 bytes_remaining -= num;
1259 out:
1260 return ret < 0 ? ret : 0;
1264 * Handle a read request in coroutine context
1266 int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1267 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1268 BdrvRequestFlags flags)
1270 BlockDriverState *bs = child->bs;
1271 BlockDriver *drv = bs->drv;
1272 BdrvTrackedRequest req;
1274 uint64_t align = bs->bl.request_alignment;
1275 uint8_t *head_buf = NULL;
1276 uint8_t *tail_buf = NULL;
1277 QEMUIOVector local_qiov;
1278 bool use_local_qiov = false;
1279 int ret;
1281 trace_bdrv_co_preadv(child->bs, offset, bytes, flags);
1283 if (!drv) {
1284 return -ENOMEDIUM;
1287 ret = bdrv_check_byte_request(bs, offset, bytes);
1288 if (ret < 0) {
1289 return ret;
1292 bdrv_inc_in_flight(bs);
1294 /* Don't do copy-on-read if we read data before write operation */
1295 if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
1296 flags |= BDRV_REQ_COPY_ON_READ;
1299 /* Align read if necessary by padding qiov */
1300 if (offset & (align - 1)) {
1301 head_buf = qemu_blockalign(bs, align);
1302 qemu_iovec_init(&local_qiov, qiov->niov + 2);
1303 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1304 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1305 use_local_qiov = true;
1307 bytes += offset & (align - 1);
1308 offset = offset & ~(align - 1);
1311 if ((offset + bytes) & (align - 1)) {
1312 if (!use_local_qiov) {
1313 qemu_iovec_init(&local_qiov, qiov->niov + 1);
1314 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1315 use_local_qiov = true;
1317 tail_buf = qemu_blockalign(bs, align);
1318 qemu_iovec_add(&local_qiov, tail_buf,
1319 align - ((offset + bytes) & (align - 1)));
1321 bytes = ROUND_UP(bytes, align);
1324 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1325 ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
1326 use_local_qiov ? &local_qiov : qiov,
1327 flags);
1328 tracked_request_end(&req);
1329 bdrv_dec_in_flight(bs);
1331 if (use_local_qiov) {
1332 qemu_iovec_destroy(&local_qiov);
1333 qemu_vfree(head_buf);
1334 qemu_vfree(tail_buf);
1337 return ret;
1340 static int coroutine_fn bdrv_co_do_readv(BdrvChild *child,
1341 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1342 BdrvRequestFlags flags)
1344 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1345 return -EINVAL;
1348 return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS,
1349 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1352 int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
1353 int nb_sectors, QEMUIOVector *qiov)
1355 return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0);
1358 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1359 int64_t offset, int bytes, BdrvRequestFlags flags)
1361 BlockDriver *drv = bs->drv;
1362 QEMUIOVector qiov;
1363 struct iovec iov = {0};
1364 int ret = 0;
1365 bool need_flush = false;
1366 int head = 0;
1367 int tail = 0;
1369 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1370 int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1371 bs->bl.request_alignment);
1372 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
1374 if (!drv) {
1375 return -ENOMEDIUM;
1378 assert(alignment % bs->bl.request_alignment == 0);
1379 head = offset % alignment;
1380 tail = (offset + bytes) % alignment;
1381 max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1382 assert(max_write_zeroes >= bs->bl.request_alignment);
1384 while (bytes > 0 && !ret) {
1385 int num = bytes;
1387 /* Align request. Block drivers can expect the "bulk" of the request
1388 * to be aligned, and that unaligned requests do not cross cluster
1389 * boundaries.
1391 if (head) {
1392 /* Make a small request up to the first aligned sector. For
1393 * convenience, limit this request to max_transfer even if
1394 * we don't need to fall back to writes. */
1395 num = MIN(MIN(bytes, max_transfer), alignment - head);
1396 head = (head + num) % alignment;
1397 assert(num < max_write_zeroes);
1398 } else if (tail && num > alignment) {
1399 /* Shorten the request to the last aligned sector. */
1400 num -= tail;
1403 /* limit request size */
1404 if (num > max_write_zeroes) {
1405 num = max_write_zeroes;
1408 ret = -ENOTSUP;
1409 /* First try the efficient write zeroes operation */
1410 if (drv->bdrv_co_pwrite_zeroes) {
1411 ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1412 flags & bs->supported_zero_flags);
1413 if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1414 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1415 need_flush = true;
1417 } else {
1418 assert(!bs->supported_zero_flags);
1421 if (ret == -ENOTSUP) {
1422 /* Fall back to bounce buffer if write zeroes is unsupported */
1423 BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1425 if ((flags & BDRV_REQ_FUA) &&
1426 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1427 /* No need for bdrv_driver_pwrite() to do a fallback
1428 * flush on each chunk; use just one at the end */
1429 write_flags &= ~BDRV_REQ_FUA;
1430 need_flush = true;
1432 num = MIN(num, max_transfer);
1433 iov.iov_len = num;
1434 if (iov.iov_base == NULL) {
1435 iov.iov_base = qemu_try_blockalign(bs, num);
1436 if (iov.iov_base == NULL) {
1437 ret = -ENOMEM;
1438 goto fail;
1440 memset(iov.iov_base, 0, num);
1442 qemu_iovec_init_external(&qiov, &iov, 1);
1444 ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
1449 if (num < max_transfer) {
1450 qemu_vfree(iov.iov_base);
1451 iov.iov_base = NULL;
1455 offset += num;
1456 bytes -= num;
1459 fail:
1460 if (ret == 0 && need_flush) {
1461 ret = bdrv_co_flush(bs);
1463 qemu_vfree(iov.iov_base);
1464 return ret;
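
/* Illustrative example (not part of the original file): how the loop above
 * splits an unaligned zero-write, assuming alignment = 512 and large
 * max_transfer/max_write_zeroes (made-up numbers). With offset = 510 and
 * bytes = 2048, head = 510 and tail = (510 + 2048) % 512 = 510:
 *
 *   pass 1: num = MIN(bytes, alignment - head) = 2    -> unaligned head [510, 512)
 *   pass 2: num = 2046 - tail = 1536                  -> aligned bulk   [512, 2048)
 *   pass 3: num = 510                                 -> unaligned tail [2048, 2558)
 *
 * The head and tail passes stay within a single alignment block; only the
 * middle pass issues the driver's zero-write at full granularity.
 */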
1468 * Forwards an already correctly aligned write request to the BlockDriver,
1469 * after possibly fragmenting it.
1471 static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
1472 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1473 int64_t align, QEMUIOVector *qiov, int flags)
1475 BlockDriverState *bs = child->bs;
1476 BlockDriver *drv = bs->drv;
1477 bool waited;
1478 int ret;
1480 int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1481 uint64_t bytes_remaining = bytes;
1482 int max_transfer;
1484 if (!drv) {
1485 return -ENOMEDIUM;
1488 if (bdrv_has_readonly_bitmaps(bs)) {
1489 return -EPERM;
1492 assert(is_power_of_2(align));
1493 assert((offset & (align - 1)) == 0);
1494 assert((bytes & (align - 1)) == 0);
1495 assert(!qiov || bytes == qiov->size);
1496 assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1497 assert(!(flags & ~BDRV_REQ_MASK));
1498 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1499 align);
1501 waited = wait_serialising_requests(req);
1502 assert(!waited || !req->serialising);
1503 assert(req->overlap_offset <= offset);
1504 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1505 assert(child->perm & BLK_PERM_WRITE);
1506 assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
1508 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1510 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1511 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
1512 qemu_iovec_is_zero(qiov)) {
1513 flags |= BDRV_REQ_ZERO_WRITE;
1514 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1515 flags |= BDRV_REQ_MAY_UNMAP;
1519 if (ret < 0) {
1520 /* Do nothing, write notifier decided to fail this request */
1521 } else if (flags & BDRV_REQ_ZERO_WRITE) {
1522 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1523 ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
1524 } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
1525 ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
1526 } else if (bytes <= max_transfer) {
1527 bdrv_debug_event(bs, BLKDBG_PWRITEV);
1528 ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
1529 } else {
1530 bdrv_debug_event(bs, BLKDBG_PWRITEV);
1531 while (bytes_remaining) {
1532 int num = MIN(bytes_remaining, max_transfer);
1533 QEMUIOVector local_qiov;
1534 int local_flags = flags;
1536 assert(num);
1537 if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
1538 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1539 /* If FUA is going to be emulated by flush, we only
1540 * need to flush on the last iteration */
1541 local_flags &= ~BDRV_REQ_FUA;
1543 qemu_iovec_init(&local_qiov, qiov->niov);
1544 qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
1546 ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
1547 num, &local_qiov, local_flags);
1548 qemu_iovec_destroy(&local_qiov);
1549 if (ret < 0) {
1550 break;
1552 bytes_remaining -= num;
1555 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
1557 atomic_inc(&bs->write_gen);
1558 bdrv_set_dirty(bs, offset, bytes);
1560 stat64_max(&bs->wr_highest_offset, offset + bytes);
1562 if (ret >= 0) {
1563 bs->total_sectors = MAX(bs->total_sectors, end_sector);
1564 ret = 0;
1567 return ret;
1570 static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
1571 int64_t offset,
1572 unsigned int bytes,
1573 BdrvRequestFlags flags,
1574 BdrvTrackedRequest *req)
1576 BlockDriverState *bs = child->bs;
1577 uint8_t *buf = NULL;
1578 QEMUIOVector local_qiov;
1579 struct iovec iov;
1580 uint64_t align = bs->bl.request_alignment;
1581 unsigned int head_padding_bytes, tail_padding_bytes;
1582 int ret = 0;
1584 head_padding_bytes = offset & (align - 1);
1585 tail_padding_bytes = (align - (offset + bytes)) & (align - 1);
1588 assert(flags & BDRV_REQ_ZERO_WRITE);
1589 if (head_padding_bytes || tail_padding_bytes) {
1590 buf = qemu_blockalign(bs, align);
1591 iov = (struct iovec) {
1592 .iov_base = buf,
1593 .iov_len = align,
1595 qemu_iovec_init_external(&local_qiov, &iov, 1);
1597 if (head_padding_bytes) {
1598 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1600 /* RMW the unaligned part before head. */
1601 mark_request_serialising(req, align);
1602 wait_serialising_requests(req);
1603 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1604 ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
1605 align, &local_qiov, 0);
1606 if (ret < 0) {
1607 goto fail;
1609 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1611 memset(buf + head_padding_bytes, 0, zero_bytes);
1612 ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
1613 align, &local_qiov,
1614 flags & ~BDRV_REQ_ZERO_WRITE);
1615 if (ret < 0) {
1616 goto fail;
1618 offset += zero_bytes;
1619 bytes -= zero_bytes;
1622 assert(!bytes || (offset & (align - 1)) == 0);
1623 if (bytes >= align) {
1624 /* Write the aligned part in the middle. */
1625 uint64_t aligned_bytes = bytes & ~(align - 1);
1626 ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
1627 NULL, flags);
1628 if (ret < 0) {
1629 goto fail;
1631 bytes -= aligned_bytes;
1632 offset += aligned_bytes;
1635 assert(!bytes || (offset & (align - 1)) == 0);
1636 if (bytes) {
1637 assert(align == tail_padding_bytes + bytes);
1638 /* RMW the unaligned part after tail. */
1639 mark_request_serialising(req, align);
1640 wait_serialising_requests(req);
1641 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1642 ret = bdrv_aligned_preadv(child, req, offset, align,
1643 align, &local_qiov, 0);
1644 if (ret < 0) {
1645 goto fail;
1647 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1649 memset(buf, 0, bytes);
1650 ret = bdrv_aligned_pwritev(child, req, offset, align, align,
1651 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1653 fail:
1654 qemu_vfree(buf);
1655 return ret;
1660 * Handle a write request in coroutine context
1662 int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
1663 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1664 BdrvRequestFlags flags)
1666 BlockDriverState *bs = child->bs;
1667 BdrvTrackedRequest req;
1668 uint64_t align = bs->bl.request_alignment;
1669 uint8_t *head_buf = NULL;
1670 uint8_t *tail_buf = NULL;
1671 QEMUIOVector local_qiov;
1672 bool use_local_qiov = false;
1673 int ret;
1675 trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);
1677 if (!bs->drv) {
1678 return -ENOMEDIUM;
1680 if (bs->read_only) {
1681 return -EPERM;
1683 assert(!(bs->open_flags & BDRV_O_INACTIVE));
1685 ret = bdrv_check_byte_request(bs, offset, bytes);
1686 if (ret < 0) {
1687 return ret;
1690 bdrv_inc_in_flight(bs);
1692 * Align write if necessary by performing a read-modify-write cycle.
1693 * Pad qiov with the read parts and be sure to have a tracked request not
1694 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1696 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
1698 if (flags & BDRV_REQ_ZERO_WRITE) {
1699 ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
1700 goto out;
1703 if (offset & (align - 1)) {
1704 QEMUIOVector head_qiov;
1705 struct iovec head_iov;
1707 mark_request_serialising(&req, align);
1708 wait_serialising_requests(&req);
1710 head_buf = qemu_blockalign(bs, align);
1711 head_iov = (struct iovec) {
1712 .iov_base = head_buf,
1713 .iov_len = align,
1715 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1717 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1718 ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
1719 align, &head_qiov, 0);
1720 if (ret < 0) {
1721 goto fail;
1723 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1725 qemu_iovec_init(&local_qiov, qiov->niov + 2);
1726 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1727 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1728 use_local_qiov = true;
1730 bytes += offset & (align - 1);
1731 offset = offset & ~(align - 1);
1733 /* We have read the tail already if the request is smaller
1734 * than one aligned block.
1736 if (bytes < align) {
1737 qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
1738 bytes = align;
1742 if ((offset + bytes) & (align - 1)) {
1743 QEMUIOVector tail_qiov;
1744 struct iovec tail_iov;
1745 size_t tail_bytes;
1746 bool waited;
1748 mark_request_serialising(&req, align);
1749 waited = wait_serialising_requests(&req);
1750 assert(!waited || !use_local_qiov);
1752 tail_buf = qemu_blockalign(bs, align);
1753 tail_iov = (struct iovec) {
1754 .iov_base = tail_buf,
1755 .iov_len = align,
1757 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1759 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1760 ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
1761 align, align, &tail_qiov, 0);
1762 if (ret < 0) {
1763 goto fail;
1765 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1767 if (!use_local_qiov) {
1768 qemu_iovec_init(&local_qiov, qiov->niov + 1);
1769 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1770 use_local_qiov = true;
1773 tail_bytes = (offset + bytes) & (align - 1);
1774 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1776 bytes = ROUND_UP(bytes, align);
1779 ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
1780 use_local_qiov ? &local_qiov : qiov,
1781 flags);
1783 fail:
1785 if (use_local_qiov) {
1786 qemu_iovec_destroy(&local_qiov);
1788 qemu_vfree(head_buf);
1789 qemu_vfree(tail_buf);
1790 out:
1791 tracked_request_end(&req);
1792 bdrv_dec_in_flight(bs);
1793 return ret;
1796 static int coroutine_fn bdrv_co_do_writev(BdrvChild *child,
1797 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1798 BdrvRequestFlags flags)
1800 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1801 return -EINVAL;
1804 return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS,
1805 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1808 int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num,
1809 int nb_sectors, QEMUIOVector *qiov)
1811 return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0);
1814 int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
1815 int bytes, BdrvRequestFlags flags)
1817 trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
1819 if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
1820 flags &= ~BDRV_REQ_MAY_UNMAP;
1823 return bdrv_co_pwritev(child, offset, bytes, NULL,
1824 BDRV_REQ_ZERO_WRITE | flags);
/*
 * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend
 * or not.
 */
1830 int bdrv_flush_all(void)
1832 BdrvNextIterator it;
1833 BlockDriverState *bs = NULL;
1834 int result = 0;
1836 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
1837 AioContext *aio_context = bdrv_get_aio_context(bs);
1838 int ret;
1840 aio_context_acquire(aio_context);
1841 ret = bdrv_flush(bs);
1842 if (ret < 0 && !result) {
1843 result = ret;
1845 aio_context_release(aio_context);
1848 return result;
1852 typedef struct BdrvCoBlockStatusData {
1853 BlockDriverState *bs;
1854 BlockDriverState *base;
1855 bool want_zero;
1856 int64_t offset;
1857 int64_t bytes;
1858 int64_t *pnum;
1859 int64_t *map;
1860 BlockDriverState **file;
1861 int ret;
1862 bool done;
1863 } BdrvCoBlockStatusData;
1865 int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
1866 bool want_zero,
1867 int64_t offset,
1868 int64_t bytes,
1869 int64_t *pnum,
1870 int64_t *map,
1871 BlockDriverState **file)
1873 assert(bs->file && bs->file->bs);
1874 *pnum = bytes;
1875 *map = offset;
1876 *file = bs->file->bs;
1877 return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
1880 int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
1881 bool want_zero,
1882 int64_t offset,
1883 int64_t bytes,
1884 int64_t *pnum,
1885 int64_t *map,
1886 BlockDriverState **file)
1888 assert(bs->backing && bs->backing->bs);
1889 *pnum = bytes;
1890 *map = offset;
1891 *file = bs->backing->bs;
1892 return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
1896 * Returns the allocation status of the specified sectors.
1897 * Drivers not implementing the functionality are assumed to not support
1898 * backing files, hence all their sectors are reported as allocated.
1900 * If 'want_zero' is true, the caller is querying for mapping
1901 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
1902 * _ZERO where possible; otherwise, the result favors larger 'pnum',
1903 * with a focus on accurate BDRV_BLOCK_ALLOCATED.
1905 * If 'offset' is beyond the end of the disk image the return value is
1906 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
1908 * 'bytes' is the max value 'pnum' should be set to. If bytes goes
1909 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
1910 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
1912 * 'pnum' is set to the number of bytes (including and immediately
1913 * following the specified offset) that are easily known to be in the
1914 * same allocated/unallocated state. Note that a second call starting
1915 * at the original offset plus returned pnum may have the same status.
1916 * The returned value is non-zero on success except at end-of-file.
1918 * Returns negative errno on failure. Otherwise, if the
1919 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
1920 * set to the host mapping and BDS corresponding to the guest offset.
1922 static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
1923 bool want_zero,
1924 int64_t offset, int64_t bytes,
1925 int64_t *pnum, int64_t *map,
1926 BlockDriverState **file)
1928 int64_t total_size;
1929 int64_t n; /* bytes */
1930 int ret;
1931 int64_t local_map = 0;
1932 BlockDriverState *local_file = NULL;
1933 int64_t aligned_offset, aligned_bytes;
1934 uint32_t align;
1936 assert(pnum);
1937 *pnum = 0;
1938 total_size = bdrv_getlength(bs);
1939 if (total_size < 0) {
1940 ret = total_size;
1941 goto early_out;
1944 if (offset >= total_size) {
1945 ret = BDRV_BLOCK_EOF;
1946 goto early_out;
1948 if (!bytes) {
1949 ret = 0;
1950 goto early_out;
1953 n = total_size - offset;
1954 if (n < bytes) {
1955 bytes = n;
1958 /* Must be non-NULL or bdrv_getlength() would have failed */
1959 assert(bs->drv);
1960 if (!bs->drv->bdrv_co_block_status) {
1961 *pnum = bytes;
1962 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1963 if (offset + bytes == total_size) {
1964 ret |= BDRV_BLOCK_EOF;
1966 if (bs->drv->protocol_name) {
1967 ret |= BDRV_BLOCK_OFFSET_VALID;
1968 local_map = offset;
1969 local_file = bs;
1971 goto early_out;
1974 bdrv_inc_in_flight(bs);
1976 /* Round out to request_alignment boundaries */
1977 align = bs->bl.request_alignment;
1978 aligned_offset = QEMU_ALIGN_DOWN(offset, align);
1979 aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
1981 ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
1982 aligned_bytes, pnum, &local_map,
1983 &local_file);
1984 if (ret < 0) {
1985 *pnum = 0;
1986 goto out;
1990 * The driver's result must be a non-zero multiple of request_alignment.
1991 * Clamp pnum and adjust map to original request.
1993 assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
1994 align > offset - aligned_offset);
1995 *pnum -= offset - aligned_offset;
1996 if (*pnum > bytes) {
1997 *pnum = bytes;
1999 if (ret & BDRV_BLOCK_OFFSET_VALID) {
2000 local_map += offset - aligned_offset;
2003 if (ret & BDRV_BLOCK_RAW) {
2004 assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
2005 ret = bdrv_co_block_status(local_file, want_zero, local_map,
2006 *pnum, pnum, &local_map, &local_file);
2007 goto out;
2010 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
2011 ret |= BDRV_BLOCK_ALLOCATED;
2012 } else if (want_zero) {
2013 if (bdrv_unallocated_blocks_are_zero(bs)) {
2014 ret |= BDRV_BLOCK_ZERO;
2015 } else if (bs->backing) {
2016 BlockDriverState *bs2 = bs->backing->bs;
2017 int64_t size2 = bdrv_getlength(bs2);
2019 if (size2 >= 0 && offset >= size2) {
2020 ret |= BDRV_BLOCK_ZERO;
2025 if (want_zero && local_file && local_file != bs &&
2026 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
2027 (ret & BDRV_BLOCK_OFFSET_VALID)) {
2028 int64_t file_pnum;
2029 int ret2;
2031 ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
2032 *pnum, &file_pnum, NULL, NULL);
2033 if (ret2 >= 0) {
2034 /* Ignore errors. This is just providing extra information, it
2035 * is useful but not necessary.
2037 if (ret2 & BDRV_BLOCK_EOF &&
2038 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
2040 * It is valid for the format block driver to read
2041 * beyond the end of the underlying file's current
2042 * size; such areas read as zero.
2044 ret |= BDRV_BLOCK_ZERO;
2045 } else {
2046 /* Limit request to the range reported by the protocol driver */
2047 *pnum = file_pnum;
2048 ret |= (ret2 & BDRV_BLOCK_ZERO);
2053 out:
2054 bdrv_dec_in_flight(bs);
2055 if (ret >= 0 && offset + *pnum == total_size) {
2056 ret |= BDRV_BLOCK_EOF;
2058 early_out:
2059 if (file) {
2060 *file = local_file;
2062 if (map) {
2063 *map = local_map;
2065 return ret;
2068 static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
2069 BlockDriverState *base,
2070 bool want_zero,
2071 int64_t offset,
2072 int64_t bytes,
2073 int64_t *pnum,
2074 int64_t *map,
2075 BlockDriverState **file)
2077 BlockDriverState *p;
2078 int ret = 0;
2079 bool first = true;
2081 assert(bs != base);
2082 for (p = bs; p != base; p = backing_bs(p)) {
2083 ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
2084 file);
2085 if (ret < 0) {
2086 break;
2088 if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
            /*
             * Reading beyond the end of the file continues to read
             * zeroes, but we can only widen the result to the
             * unallocated length we learned from an earlier
             * iteration.
             */
2095 *pnum = bytes;
2097 if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
2098 break;
2100 /* [offset, pnum] unallocated on this layer, which could be only
2101 * the first part of [offset, bytes]. */
2102 bytes = MIN(bytes, *pnum);
2103 first = false;
2105 return ret;
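/*
 * Illustrative walk through the loop above, assuming a two-layer chain
 * top -> backing and a 64 KiB query at offset 0: if the top layer reports
 * the whole 64 KiB as unallocated (*pnum = 65536, neither DATA nor ZERO),
 * the loop clamps bytes to 65536 and descends; if the backing layer then
 * reports BDRV_BLOCK_DATA for only the first 16 KiB, the final result
 * describes just those 16 KiB and the caller continues at offset 16384.
 */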
2108 /* Coroutine wrapper for bdrv_block_status_above() */
2109 static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
2111 BdrvCoBlockStatusData *data = opaque;
2113 data->ret = bdrv_co_block_status_above(data->bs, data->base,
2114 data->want_zero,
2115 data->offset, data->bytes,
2116 data->pnum, data->map, data->file);
2117 data->done = true;
/*
 * Synchronous wrapper around bdrv_co_block_status_above().
 *
 * See bdrv_co_block_status_above() for details.
 */
2125 static int bdrv_common_block_status_above(BlockDriverState *bs,
2126 BlockDriverState *base,
2127 bool want_zero, int64_t offset,
2128 int64_t bytes, int64_t *pnum,
2129 int64_t *map,
2130 BlockDriverState **file)
2132 Coroutine *co;
2133 BdrvCoBlockStatusData data = {
2134 .bs = bs,
2135 .base = base,
2136 .want_zero = want_zero,
2137 .offset = offset,
2138 .bytes = bytes,
2139 .pnum = pnum,
2140 .map = map,
2141 .file = file,
2142 .done = false,
2145 if (qemu_in_coroutine()) {
2146 /* Fast-path if already in coroutine context */
2147 bdrv_block_status_above_co_entry(&data);
2148 } else {
2149 co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
2150 bdrv_coroutine_enter(bs, co);
2151 BDRV_POLL_WHILE(bs, !data.done);
2153 return data.ret;
2156 int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
2157 int64_t offset, int64_t bytes, int64_t *pnum,
2158 int64_t *map, BlockDriverState **file)
2160 return bdrv_common_block_status_above(bs, base, true, offset, bytes,
2161 pnum, map, file);
2164 int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
2165 int64_t *pnum, int64_t *map, BlockDriverState **file)
2167 return bdrv_block_status_above(bs, backing_bs(bs),
2168 offset, bytes, pnum, map, file);
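/*
 * Illustrative sketch of the typical calling pattern ('bs' is assumed to
 * be an open BlockDriverState): count how many bytes are allocated in bs
 * itself.  Because the query stops at backing_bs(bs), the backing chain
 * is not consulted.
 *
 *     int64_t offset = 0, allocated = 0, total = bdrv_getlength(bs);
 *     while (offset < total) {
 *         int64_t pnum, map;
 *         BlockDriverState *file;
 *         int ret = bdrv_block_status(bs, offset, total - offset,
 *                                     &pnum, &map, &file);
 *         if (ret < 0 || !pnum) {
 *             break;
 *         }
 *         if (ret & BDRV_BLOCK_ALLOCATED) {
 *             allocated += pnum;
 *         }
 *         offset += pnum;
 *     }
 */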
2171 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
2172 int64_t bytes, int64_t *pnum)
2174 int ret;
2175 int64_t dummy;
2177 ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
2178 bytes, pnum ? pnum : &dummy, NULL,
2179 NULL);
2180 if (ret < 0) {
2181 return ret;
2183 return !!(ret & BDRV_BLOCK_ALLOCATED);
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if (a prefix of) the given range is allocated in any image
 * between BASE and TOP (TOP inclusive; BASE itself is not checked).  BASE
 * can be NULL to check whether the given offset is allocated in any image
 * of the chain.  Return false otherwise, or a negative errno on failure.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are known to be in the same
 * allocated/unallocated state.  Note that a subsequent call starting
 * at 'offset + *pnum' may return the same allocation status (in other
 * words, the result is not necessarily the maximum possible range);
 * but 'pnum' will only be 0 when end of file is reached.
 */
2202 int bdrv_is_allocated_above(BlockDriverState *top,
2203 BlockDriverState *base,
2204 int64_t offset, int64_t bytes, int64_t *pnum)
2206 BlockDriverState *intermediate;
2207 int ret;
2208 int64_t n = bytes;
2210 intermediate = top;
2211 while (intermediate && intermediate != base) {
2212 int64_t pnum_inter;
2213 int64_t size_inter;
2215 ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
2216 if (ret < 0) {
2217 return ret;
2219 if (ret) {
2220 *pnum = pnum_inter;
2221 return 1;
2224 size_inter = bdrv_getlength(intermediate);
2225 if (size_inter < 0) {
2226 return size_inter;
2228 if (n > pnum_inter &&
2229 (intermediate == top || offset + pnum_inter < size_inter)) {
2230 n = pnum_inter;
2233 intermediate = backing_bs(intermediate);
2236 *pnum = n;
2237 return 0;
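/*
 * Illustrative sketch ('top' and 'base' are assumed to belong to the same
 * backing chain): decide whether a range still carries data in the layers
 * above 'base', e.g. before copying it during a commit or stream job.
 *
 *     int64_t pnum;
 *     int ret = bdrv_is_allocated_above(top, base, offset, bytes, &pnum);
 *
 * ret < 0 is an error; a non-zero return means the first pnum bytes live
 * in the layers above base and would need copying; ret == 0 means the
 * first pnum bytes already come from base or deeper in the chain.
 */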
2240 typedef struct BdrvVmstateCo {
2241 BlockDriverState *bs;
2242 QEMUIOVector *qiov;
2243 int64_t pos;
2244 bool is_read;
2245 int ret;
2246 } BdrvVmstateCo;
2248 static int coroutine_fn
2249 bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2250 bool is_read)
2252 BlockDriver *drv = bs->drv;
2253 int ret = -ENOTSUP;
2255 bdrv_inc_in_flight(bs);
2257 if (!drv) {
2258 ret = -ENOMEDIUM;
2259 } else if (drv->bdrv_load_vmstate) {
2260 if (is_read) {
2261 ret = drv->bdrv_load_vmstate(bs, qiov, pos);
2262 } else {
2263 ret = drv->bdrv_save_vmstate(bs, qiov, pos);
2265 } else if (bs->file) {
2266 ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
2269 bdrv_dec_in_flight(bs);
2270 return ret;
2273 static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
2275 BdrvVmstateCo *co = opaque;
2276 co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
2279 static inline int
2280 bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2281 bool is_read)
2283 if (qemu_in_coroutine()) {
2284 return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
2285 } else {
2286 BdrvVmstateCo data = {
2287 .bs = bs,
2288 .qiov = qiov,
2289 .pos = pos,
2290 .is_read = is_read,
2291 .ret = -EINPROGRESS,
2293 Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
2295 bdrv_coroutine_enter(bs, co);
2296 BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
2297 return data.ret;
2301 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2302 int64_t pos, int size)
2304 QEMUIOVector qiov;
2305 struct iovec iov = {
2306 .iov_base = (void *) buf,
2307 .iov_len = size,
2309 int ret;
2311 qemu_iovec_init_external(&qiov, &iov, 1);
2313 ret = bdrv_writev_vmstate(bs, &qiov, pos);
2314 if (ret < 0) {
2315 return ret;
2318 return size;
2321 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2323 return bdrv_rw_vmstate(bs, qiov, pos, false);
2326 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2327 int64_t pos, int size)
2329 QEMUIOVector qiov;
2330 struct iovec iov = {
2331 .iov_base = buf,
2332 .iov_len = size,
2334 int ret;
2336 qemu_iovec_init_external(&qiov, &iov, 1);
2337 ret = bdrv_readv_vmstate(bs, &qiov, pos);
2338 if (ret < 0) {
2339 return ret;
2342 return size;
2345 int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2347 return bdrv_rw_vmstate(bs, qiov, pos, true);
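/*
 * Illustrative sketch of the vmstate helpers above: both return the byte
 * count on success and a negative errno on failure, and they only work on
 * drivers that implement vmstate storage (e.g. qcow2) or delegate to
 * bs->file.  The buffer and position here are caller-chosen.
 *
 *     uint8_t buf[512];
 *     int ret = bdrv_save_vmstate(bs, buf, 0, sizeof(buf));
 *     if (ret == sizeof(buf)) {
 *         ret = bdrv_load_vmstate(bs, buf, 0, sizeof(buf));
 *     }
 */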
2350 /**************************************************************/
2351 /* async I/Os */
2353 void bdrv_aio_cancel(BlockAIOCB *acb)
2355 qemu_aio_ref(acb);
2356 bdrv_aio_cancel_async(acb);
2357 while (acb->refcnt > 1) {
2358 if (acb->aiocb_info->get_aio_context) {
2359 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2360 } else if (acb->bs) {
            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
             * assert that we're not using an I/O thread.  Thread-safe
             * code should use bdrv_aio_cancel_async exclusively.
             */
2365 assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
2366 aio_poll(bdrv_get_aio_context(acb->bs), true);
2367 } else {
2368 abort();
2371 qemu_aio_unref(acb);
/* Async version of aio cancel.  The caller is not blocked if the acb
 * implements cancel_async; otherwise we do nothing and let the request
 * complete normally.  In either case the completion callback must be
 * called. */
2377 void bdrv_aio_cancel_async(BlockAIOCB *acb)
2379 if (acb->aiocb_info->cancel_async) {
2380 acb->aiocb_info->cancel_async(acb);
2384 /**************************************************************/
2385 /* Coroutine block device emulation */
2387 typedef struct FlushCo {
2388 BlockDriverState *bs;
2389 int ret;
2390 } FlushCo;
2393 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2395 FlushCo *rwco = opaque;
2397 rwco->ret = bdrv_co_flush(rwco->bs);
2400 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2402 int current_gen;
2403 int ret = 0;
2405 bdrv_inc_in_flight(bs);
2407 if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2408 bdrv_is_sg(bs)) {
2409 goto early_exit;
2412 qemu_co_mutex_lock(&bs->reqs_lock);
2413 current_gen = atomic_read(&bs->write_gen);
2415 /* Wait until any previous flushes are completed */
2416 while (bs->active_flush_req) {
2417 qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
2420 /* Flushes reach this point in nondecreasing current_gen order. */
2421 bs->active_flush_req = true;
2422 qemu_co_mutex_unlock(&bs->reqs_lock);
2424 /* Write back all layers by calling one driver function */
2425 if (bs->drv->bdrv_co_flush) {
2426 ret = bs->drv->bdrv_co_flush(bs);
2427 goto out;
2430 /* Write back cached data to the OS even with cache=unsafe */
2431 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2432 if (bs->drv->bdrv_co_flush_to_os) {
2433 ret = bs->drv->bdrv_co_flush_to_os(bs);
2434 if (ret < 0) {
2435 goto out;
2439 /* But don't actually force it to the disk with cache=unsafe */
2440 if (bs->open_flags & BDRV_O_NO_FLUSH) {
2441 goto flush_parent;
2444 /* Check if we really need to flush anything */
2445 if (bs->flushed_gen == current_gen) {
2446 goto flush_parent;
2449 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2450 if (!bs->drv) {
2451 /* bs->drv->bdrv_co_flush() might have ejected the BDS
2452 * (even in case of apparent success) */
2453 ret = -ENOMEDIUM;
2454 goto out;
2456 if (bs->drv->bdrv_co_flush_to_disk) {
2457 ret = bs->drv->bdrv_co_flush_to_disk(bs);
2458 } else if (bs->drv->bdrv_aio_flush) {
2459 BlockAIOCB *acb;
2460 CoroutineIOCompletion co = {
2461 .coroutine = qemu_coroutine_self(),
2464 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2465 if (acb == NULL) {
2466 ret = -EIO;
2467 } else {
2468 qemu_coroutine_yield();
2469 ret = co.ret;
2471 } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush.  Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk.  Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
2483 ret = 0;
2486 if (ret < 0) {
2487 goto out;
    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
2493 flush_parent:
2494 ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2495 out:
2496 /* Notify any pending flushes that we have completed */
2497 if (ret == 0) {
2498 bs->flushed_gen = current_gen;
2501 qemu_co_mutex_lock(&bs->reqs_lock);
2502 bs->active_flush_req = false;
2503 /* Return value is ignored - it's ok if wait queue is empty */
2504 qemu_co_queue_next(&bs->flush_queue);
2505 qemu_co_mutex_unlock(&bs->reqs_lock);
2507 early_exit:
2508 bdrv_dec_in_flight(bs);
2509 return ret;
2512 int bdrv_flush(BlockDriverState *bs)
2514 Coroutine *co;
2515 FlushCo flush_co = {
2516 .bs = bs,
2517 .ret = NOT_DONE,
2520 if (qemu_in_coroutine()) {
2521 /* Fast-path if already in coroutine context */
2522 bdrv_flush_co_entry(&flush_co);
2523 } else {
2524 co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
2525 bdrv_coroutine_enter(bs, co);
2526 BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
2529 return flush_co.ret;
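/*
 * Illustrative sketch ('bs' is assumed to be open and writable): make a
 * preceding series of writes stable on disk and report any failure.
 *
 *     int ret = bdrv_flush(bs);
 *     if (ret < 0) {
 *         error_report("flush failed: %s", strerror(-ret));
 *     }
 */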
2532 typedef struct DiscardCo {
2533 BlockDriverState *bs;
2534 int64_t offset;
2535 int bytes;
2536 int ret;
2537 } DiscardCo;
2538 static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
2540 DiscardCo *rwco = opaque;
2542 rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes);
2545 int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
2546 int bytes)
2548 BdrvTrackedRequest req;
2549 int max_pdiscard, ret;
2550 int head, tail, align;
2552 if (!bs->drv) {
2553 return -ENOMEDIUM;
2556 if (bdrv_has_readonly_bitmaps(bs)) {
2557 return -EPERM;
2560 ret = bdrv_check_byte_request(bs, offset, bytes);
2561 if (ret < 0) {
2562 return ret;
2563 } else if (bs->read_only) {
2564 return -EPERM;
2566 assert(!(bs->open_flags & BDRV_O_INACTIVE));
2568 /* Do nothing if disabled. */
2569 if (!(bs->open_flags & BDRV_O_UNMAP)) {
2570 return 0;
2573 if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
2574 return 0;
2577 /* Discard is advisory, but some devices track and coalesce
2578 * unaligned requests, so we must pass everything down rather than
2579 * round here. Still, most devices will just silently ignore
2580 * unaligned requests (by returning -ENOTSUP), so we must fragment
2581 * the request accordingly. */
2582 align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
2583 assert(align % bs->bl.request_alignment == 0);
2584 head = offset % align;
2585 tail = (offset + bytes) % align;
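    /*
     * Example of the head/tail split, assuming pdiscard_alignment = 64 KiB
     * and request_alignment = 512: a discard with offset = 69632 and
     * bytes = 262144 gives head = 69632 % 65536 = 4096 and
     * tail = 331776 % 65536 = 4096, so the loop below first issues a
     * 61440-byte request up to the 128 KiB boundary, then the aligned
     * middle (possibly split further by max_pdiscard), then the
     * 4096-byte tail.
     */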
2587 bdrv_inc_in_flight(bs);
2588 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
2590 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
2591 if (ret < 0) {
2592 goto out;
2595 max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
2596 align);
2597 assert(max_pdiscard >= bs->bl.request_alignment);
2599 while (bytes > 0) {
2600 int num = bytes;
2602 if (head) {
2603 /* Make small requests to get to alignment boundaries. */
2604 num = MIN(bytes, align - head);
2605 if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
2606 num %= bs->bl.request_alignment;
2608 head = (head + num) % align;
2609 assert(num < max_pdiscard);
2610 } else if (tail) {
2611 if (num > align) {
2612 /* Shorten the request to the last aligned cluster. */
2613 num -= tail;
2614 } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
2615 tail > bs->bl.request_alignment) {
2616 tail %= bs->bl.request_alignment;
2617 num -= tail;
2620 /* limit request size */
2621 if (num > max_pdiscard) {
2622 num = max_pdiscard;
2625 if (!bs->drv) {
2626 ret = -ENOMEDIUM;
2627 goto out;
2629 if (bs->drv->bdrv_co_pdiscard) {
2630 ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
2631 } else {
2632 BlockAIOCB *acb;
2633 CoroutineIOCompletion co = {
2634 .coroutine = qemu_coroutine_self(),
2637 acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
2638 bdrv_co_io_em_complete, &co);
2639 if (acb == NULL) {
2640 ret = -EIO;
2641 goto out;
2642 } else {
2643 qemu_coroutine_yield();
2644 ret = co.ret;
2647 if (ret && ret != -ENOTSUP) {
2648 goto out;
2651 offset += num;
2652 bytes -= num;
2654 ret = 0;
2655 out:
2656 atomic_inc(&bs->write_gen);
2657 bdrv_set_dirty(bs, req.offset, req.bytes);
2658 tracked_request_end(&req);
2659 bdrv_dec_in_flight(bs);
2660 return ret;
2663 int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
2665 Coroutine *co;
2666 DiscardCo rwco = {
2667 .bs = bs,
2668 .offset = offset,
2669 .bytes = bytes,
2670 .ret = NOT_DONE,
2673 if (qemu_in_coroutine()) {
2674 /* Fast-path if already in coroutine context */
2675 bdrv_pdiscard_co_entry(&rwco);
2676 } else {
2677 co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
2678 bdrv_coroutine_enter(bs, co);
2679 BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
2682 return rwco.ret;
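/*
 * Illustrative sketch ('bs' is assumed to be open, writable and opened
 * with BDRV_O_UNMAP): hint that the first 64 KiB of the image are no
 * longer needed.  Discard is advisory, so success does not guarantee
 * that the range now reads as zeroes.
 *
 *     int ret = bdrv_pdiscard(bs, 0, 65536);
 *     if (ret < 0) {
 *         error_report("discard failed: %s", strerror(-ret));
 *     }
 */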
2685 int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
2687 BlockDriver *drv = bs->drv;
2688 CoroutineIOCompletion co = {
2689 .coroutine = qemu_coroutine_self(),
2691 BlockAIOCB *acb;
2693 bdrv_inc_in_flight(bs);
2694 if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
2695 co.ret = -ENOTSUP;
2696 goto out;
2699 if (drv->bdrv_co_ioctl) {
2700 co.ret = drv->bdrv_co_ioctl(bs, req, buf);
2701 } else {
2702 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2703 if (!acb) {
2704 co.ret = -ENOTSUP;
2705 goto out;
2707 qemu_coroutine_yield();
2709 out:
2710 bdrv_dec_in_flight(bs);
2711 return co.ret;
2714 void *qemu_blockalign(BlockDriverState *bs, size_t size)
2716 return qemu_memalign(bdrv_opt_mem_align(bs), size);
2719 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2721 return memset(qemu_blockalign(bs, size), 0, size);
2724 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2726 size_t align = bdrv_opt_mem_align(bs);
2728 /* Ensure that NULL is never returned on success */
2729 assert(align > 0);
2730 if (size == 0) {
2731 size = align;
2734 return qemu_try_memalign(align, size);
2737 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2739 void *mem = qemu_try_blockalign(bs, size);
2741 if (mem) {
2742 memset(mem, 0, size);
2745 return mem;
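/*
 * Illustrative sketch ('len' is a caller-chosen size): allocate a bounce
 * buffer that honours the memory alignment of 'bs' and release it with
 * qemu_vfree().  Unlike qemu_blockalign(), the _try_ variant returns NULL
 * instead of aborting when memory is exhausted.
 *
 *     void *buf = qemu_try_blockalign(bs, len);
 *     if (buf == NULL) {
 *         return -ENOMEM;
 *     }
 *     ...
 *     qemu_vfree(buf);
 */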
/**
 * Check if all memory in this vector is aligned to the minimum memory
 * alignment of @bs (bdrv_min_mem_align()).
 */
2751 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2753 int i;
2754 size_t alignment = bdrv_min_mem_align(bs);
2756 for (i = 0; i < qiov->niov; i++) {
2757 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2758 return false;
2760 if (qiov->iov[i].iov_len % alignment) {
2761 return false;
2765 return true;
2768 void bdrv_add_before_write_notifier(BlockDriverState *bs,
2769 NotifierWithReturn *notifier)
2771 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2774 void bdrv_io_plug(BlockDriverState *bs)
2776 BdrvChild *child;
2778 QLIST_FOREACH(child, &bs->children, next) {
2779 bdrv_io_plug(child->bs);
2782 if (atomic_fetch_inc(&bs->io_plugged) == 0) {
2783 BlockDriver *drv = bs->drv;
2784 if (drv && drv->bdrv_io_plug) {
2785 drv->bdrv_io_plug(bs);
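/*
 * Illustrative sketch of the plug/unplug pattern (submit_one_request() is
 * a hypothetical helper): batch several requests so that drivers which
 * implement bdrv_io_plug, such as linux-aio, can submit them in one go.
 *
 *     bdrv_io_plug(bs);
 *     for (i = 0; i < n; i++) {
 *         submit_one_request(bs, &reqs[i]);
 *     }
 *     bdrv_io_unplug(bs);
 */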
2790 void bdrv_io_unplug(BlockDriverState *bs)
2792 BdrvChild *child;
2794 assert(bs->io_plugged);
2795 if (atomic_fetch_dec(&bs->io_plugged) == 1) {
2796 BlockDriver *drv = bs->drv;
2797 if (drv && drv->bdrv_io_unplug) {
2798 drv->bdrv_io_unplug(bs);
2802 QLIST_FOREACH(child, &bs->children, next) {
2803 bdrv_io_unplug(child->bs);
2807 void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
2809 BdrvChild *child;
2811 if (bs->drv && bs->drv->bdrv_register_buf) {
2812 bs->drv->bdrv_register_buf(bs, host, size);
2814 QLIST_FOREACH(child, &bs->children, next) {
2815 bdrv_register_buf(child->bs, host, size);
2819 void bdrv_unregister_buf(BlockDriverState *bs, void *host)
2821 BdrvChild *child;
2823 if (bs->drv && bs->drv->bdrv_unregister_buf) {
2824 bs->drv->bdrv_unregister_buf(bs, host);
2826 QLIST_FOREACH(child, &bs->children, next) {
2827 bdrv_unregister_buf(child->bs, host);