block/io.c

   1 /*
   2  * Block layer I/O functions
   3  *
   4  * Copyright (c) 2003 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "trace.h"
  27 #include "sysemu/block-backend.h"
  28 #include "block/aio-wait.h"
  29 #include "block/blockjob.h"
  30 #include "block/blockjob_int.h"
  31 #include "block/block_int.h"
  32 #include "block/coroutines.h"
  33 #include "block/dirty-bitmap.h"
  34 #include "block/write-threshold.h"
  35 #include "qemu/cutils.h"
  36 #include "qemu/memalign.h"
  37 #include "qapi/error.h"
  38 #include "qemu/error-report.h"
  39 #include "qemu/main-loop.h"
  40 #include "sysemu/replay.h"
  41
/*
 * Maximum bounce buffer for copy-on-read and write zeroes, in bytes.
 *
 * The value is 32768 shifted left by BDRV_SECTOR_BITS, i.e. presumably a
 * count of 32768 sectors converted to bytes (BDRV_SECTOR_BITS is defined
 * outside this file -- confirm its value there).
 */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
  44
/*
 * Forward declaration of a file-local helper; its definition is not part
 * of the code visible around here.
 */
static void coroutine_fn GRAPH_RDLOCK
bdrv_parent_cb_resize(BlockDriverState *bs);

/*
 * Forward declaration of a file-local helper; its definition is not part
 * of the code visible around here.
 */
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int64_t bytes, BdrvRequestFlags flags);
  50
  51 static void GRAPH_RDLOCK
  52 bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
  53 {
  54     BdrvChild *c, *next;
  55     IO_OR_GS_CODE();
  56     assert_bdrv_graph_readable();
  57
  58     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
  59         if (c == ignore) {
  60             continue;
  61         }
  62         bdrv_parent_drained_begin_single(c);
  63     }
  64 }
  65
  66 void bdrv_parent_drained_end_single(BdrvChild *c)
  67 {
  68     GLOBAL_STATE_CODE();
  69
  70     assert(c->quiesced_parent);
  71     c->quiesced_parent = false;
  72
  73     if (c->klass->drained_end) {
  74         c->klass->drained_end(c);
  75     }
  76 }
  77
  78 static void GRAPH_RDLOCK
  79 bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
  80 {
  81     BdrvChild *c;
  82     IO_OR_GS_CODE();
  83     assert_bdrv_graph_readable();
  84
  85     QLIST_FOREACH(c, &bs->parents, next_parent) {
  86         if (c == ignore) {
  87             continue;
  88         }
  89         bdrv_parent_drained_end_single(c);
  90     }
  91 }
  92
  93 bool bdrv_parent_drained_poll_single(BdrvChild *c)
  94 {
  95     IO_OR_GS_CODE();
  96
  97     if (c->klass->drained_poll) {
  98         return c->klass->drained_poll(c);
  99     }
 100     return false;
 101 }
 102
 103 static bool GRAPH_RDLOCK
 104 bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
 105                          bool ignore_bds_parents)
 106 {
 107     BdrvChild *c, *next;
 108     bool busy = false;
 109     IO_OR_GS_CODE();
 110     assert_bdrv_graph_readable();
 111
 112     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
 113         if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
 114             continue;
 115         }
 116         busy |= bdrv_parent_drained_poll_single(c);
 117     }
 118
 119     return busy;
 120 }
 121
 122 void bdrv_parent_drained_begin_single(BdrvChild *c)
 123 {
 124     GLOBAL_STATE_CODE();
 125
 126     assert(!c->quiesced_parent);
 127     c->quiesced_parent = true;
 128
 129     if (c->klass->drained_begin) {
 130         /* called with rdlock taken, but it doesn't really need it. */
 131         c->klass->drained_begin(c);
 132     }
 133 }
 134
 135 static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
 136 {
 137     dst->pdiscard_alignment = MAX(dst->pdiscard_alignment,
 138                                   src->pdiscard_alignment);
 139     dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
 140     dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
 141     dst->max_hw_transfer = MIN_NON_ZERO(dst->max_hw_transfer,
 142                                         src->max_hw_transfer);
 143     dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
 144                                  src->opt_mem_alignment);
 145     dst->min_mem_alignment = MAX(dst->min_mem_alignment,
 146                                  src->min_mem_alignment);
 147     dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
 148     dst->max_hw_iov = MIN_NON_ZERO(dst->max_hw_iov, src->max_hw_iov);
 149 }
 150
/*
 * Snapshot taken by bdrv_refresh_limits() when it runs inside a
 * transaction, so that bdrv_refresh_limits_abort() can undo the refresh.
 */
typedef struct BdrvRefreshLimitsState {
    BlockDriverState *bs;   /* node whose limits were refreshed */
    BlockLimits old_bl;     /* copy of bs->bl from before the refresh */
} BdrvRefreshLimitsState;
 155
 156 static void bdrv_refresh_limits_abort(void *opaque)
 157 {
 158     BdrvRefreshLimitsState *s = opaque;
 159
 160     s->bs->bl = s->old_bl;
 161 }
 162
/*
 * Transaction action registered by bdrv_refresh_limits(): on abort the old
 * limits are restored, and the saved state is released with g_free().
 */
static TransactionActionDrv bdrv_refresh_limits_drv = {
    .abort = bdrv_refresh_limits_abort,
    .clean = g_free,
};
 167
 168 /* @tran is allowed to be NULL, in this case no rollback is possible. */
 169 void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp)
 170 {
 171     ERRP_GUARD();
 172     BlockDriver *drv = bs->drv;
 173     BdrvChild *c;
 174     bool have_limits;
 175
 176     GLOBAL_STATE_CODE();
 177
 178     if (tran) {
 179         BdrvRefreshLimitsState *s = g_new(BdrvRefreshLimitsState, 1);
 180         *s = (BdrvRefreshLimitsState) {
 181             .bs = bs,
 182             .old_bl = bs->bl,
 183         };
 184         tran_add(tran, &bdrv_refresh_limits_drv, s);
 185     }
 186
 187     memset(&bs->bl, 0, sizeof(bs->bl));
 188
 189     if (!drv) {
 190         return;
 191     }
 192
 193     /* Default alignment based on whether driver has byte interface */
 194     bs->bl.request_alignment = (drv->bdrv_co_preadv ||
 195                                 drv->bdrv_aio_preadv ||
 196                                 drv->bdrv_co_preadv_part) ? 1 : 512;
 197
 198     /* Take some limits from the children as a default */
 199     have_limits = false;
 200     QLIST_FOREACH(c, &bs->children, next) {
 201         if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW))
 202         {
 203             bdrv_merge_limits(&bs->bl, &c->bs->bl);
 204             have_limits = true;
 205         }
 206
 207         if (c->role & BDRV_CHILD_FILTERED) {
 208             bs->bl.has_variable_length |= c->bs->bl.has_variable_length;
 209         }
 210     }
 211
 212     if (!have_limits) {
 213         bs->bl.min_mem_alignment = 512;
 214         bs->bl.opt_mem_alignment = qemu_real_host_page_size();
 215
 216         /* Safe default since most protocols use readv()/writev()/etc */
 217         bs->bl.max_iov = IOV_MAX;
 218     }
 219
 220     /* Then let the driver override it */
 221     if (drv->bdrv_refresh_limits) {
 222         drv->bdrv_refresh_limits(bs, errp);
 223         if (*errp) {
 224             return;
 225         }
 226     }
 227
 228     if (bs->bl.request_alignment > BDRV_MAX_ALIGNMENT) {
 229         error_setg(errp, "Driver requires too large request alignment");
 230     }
 231 }
 232
 233 /**
 234  * The copy-on-read flag is actually a reference count so multiple users may
 235  * use the feature without worrying about clobbering its previous state.
 236  * Copy-on-read stays enabled until all users have called to disable it.
 237  */
 238 void bdrv_enable_copy_on_read(BlockDriverState *bs)
 239 {
 240     IO_CODE();
 241     qatomic_inc(&bs->copy_on_read);
 242 }
 243
 244 void bdrv_disable_copy_on_read(BlockDriverState *bs)
 245 {
 246     int old = qatomic_fetch_dec(&bs->copy_on_read);
 247     IO_CODE();
 248     assert(old >= 1);
 249 }
 250
/*
 * Request handed from bdrv_co_yield_to_drain() to the bottom half
 * bdrv_co_drain_bh_cb(), which performs the drain operation outside of
 * coroutine context and then wakes the coroutine again.
 */
typedef struct {
    Coroutine *co;          /* coroutine to wake once the BH has run */
    BlockDriverState *bs;   /* node to drain; NULL selects "drain all" */
    bool done;              /* set by the BH right before waking @co */
    bool begin;             /* true: drained_begin, false: drained_end */
    bool poll;              /* forwarded to bdrv_do_drained_begin() */
    BdrvChild *parent;      /* forwarded as the @parent argument */
} BdrvCoDrainData;
 259
 260 /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
 261 bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent,
 262                      bool ignore_bds_parents)
 263 {
 264     GLOBAL_STATE_CODE();
 265
 266     if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
 267         return true;
 268     }
 269
 270     if (qatomic_read(&bs->in_flight)) {
 271         return true;
 272     }
 273
 274     return false;
 275 }
 276
 277 static bool bdrv_drain_poll_top_level(BlockDriverState *bs,
 278                                       BdrvChild *ignore_parent)
 279 {
 280     GLOBAL_STATE_CODE();
 281     GRAPH_RDLOCK_GUARD_MAINLOOP();
 282
 283     return bdrv_drain_poll(bs, ignore_parent, false);
 284 }
 285
/*
 * Forward declarations: bdrv_co_drain_bh_cb() below calls both functions
 * before their definitions appear in this file.
 */
static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
 289
 290 static void bdrv_co_drain_bh_cb(void *opaque)
 291 {
 292     BdrvCoDrainData *data = opaque;
 293     Coroutine *co = data->co;
 294     BlockDriverState *bs = data->bs;
 295
 296     if (bs) {
 297         AioContext *ctx = bdrv_get_aio_context(bs);
 298         aio_context_acquire(ctx);
 299         bdrv_dec_in_flight(bs);
 300         if (data->begin) {
 301             bdrv_do_drained_begin(bs, data->parent, data->poll);
 302         } else {
 303             assert(!data->poll);
 304             bdrv_do_drained_end(bs, data->parent);
 305         }
 306         aio_context_release(ctx);
 307     } else {
 308         assert(data->begin);
 309         bdrv_drain_all_begin();
 310     }
 311
 312     data->done = true;
 313     aio_co_wake(co);
 314 }
 315
/*
 * Perform a drain operation on behalf of the calling coroutine by
 * delegating it to a bottom half and yielding until that BH has finished.
 *
 * @bs:     node to drain, or NULL (the BH then calls
 *          bdrv_drain_all_begin())
 * @begin:  true to begin a drained section, false to end one
 * @parent: forwarded to bdrv_do_drained_begin()/bdrv_do_drained_end()
 * @poll:   forwarded to bdrv_do_drained_begin()
 *
 * Must be called from coroutine context.
 */
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin,
                                                BdrvChild *parent,
                                                bool poll)
{
    BdrvCoDrainData data;
    Coroutine *self = qemu_coroutine_self();
    AioContext *ctx = bdrv_get_aio_context(bs);
    AioContext *co_ctx = qemu_coroutine_get_aio_context(self);

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    /* The request lives on this coroutine's stack; the BH gets a pointer */
    data = (BdrvCoDrainData) {
        .co = self,
        .bs = bs,
        .done = false,
        .begin = begin,
        .parent = parent,
        .poll = poll,
    };

    /* This reference is dropped again by bdrv_co_drain_bh_cb() */
    if (bs) {
        bdrv_inc_in_flight(bs);
    }

    /*
     * Temporarily drop the lock across yield or we would get deadlocks.
     * bdrv_co_drain_bh_cb() reacquires the lock as needed.
     *
     * When we yield below, the lock for the current context will be
     * released, so if this is actually the lock that protects bs, don't drop
     * it a second time.
     */
    if (ctx != co_ctx) {
        aio_context_release(ctx);
    }
    /* The BH is scheduled in the context returned by qemu_get_aio_context() */
    replay_bh_schedule_oneshot_event(qemu_get_aio_context(),
                                     bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);

    /* Reacquire the AioContext of bs if we dropped it */
    if (ctx != co_ctx) {
        aio_context_acquire(ctx);
    }
}
 367
 368 static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent,
 369                                   bool poll)
 370 {
 371     IO_OR_GS_CODE();
 372
 373     if (qemu_in_coroutine()) {
 374         bdrv_co_yield_to_drain(bs, true, parent, poll);
 375         return;
 376     }
 377
 378     GLOBAL_STATE_CODE();
 379
 380     /* Stop things in parent-to-child order */
 381     if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
 382         GRAPH_RDLOCK_GUARD_MAINLOOP();
 383         bdrv_parent_drained_begin(bs, parent);
 384         if (bs->drv && bs->drv->bdrv_drain_begin) {
 385             bs->drv->bdrv_drain_begin(bs);
 386         }
 387     }
 388
 389     /*
 390      * Wait for drained requests to finish.
 391      *
 392      * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
 393      * call is needed so things in this AioContext can make progress even
 394      * though we don't return to the main AioContext loop - this automatically
 395      * includes other nodes in the same AioContext and therefore all child
 396      * nodes.
 397      */
 398     if (poll) {
 399         BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, parent));
 400     }
 401 }
 402
 403 void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, BdrvChild *parent)
 404 {
 405     bdrv_do_drained_begin(bs, parent, false);
 406 }
 407
 408 void coroutine_mixed_fn
 409 bdrv_drained_begin(BlockDriverState *bs)
 410 {
 411     IO_OR_GS_CODE();
 412     bdrv_do_drained_begin(bs, NULL, true);
 413 }
 414
 415 /**
 416  * This function does not poll, nor must any of its recursively called
 417  * functions.
 418  */
 419 static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
 420 {
 421     int old_quiesce_counter;
 422
 423     IO_OR_GS_CODE();
 424
 425     if (qemu_in_coroutine()) {
 426         bdrv_co_yield_to_drain(bs, false, parent, false);
 427         return;
 428     }
 429
 430     /* At this point, we should be always running in the main loop. */
 431     GLOBAL_STATE_CODE();
 432     assert(bs->quiesce_counter > 0);
 433     GLOBAL_STATE_CODE();
 434
 435     /* Re-enable things in child-to-parent order */
 436     old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter);
 437     if (old_quiesce_counter == 1) {
 438         GRAPH_RDLOCK_GUARD_MAINLOOP();
 439         if (bs->drv && bs->drv->bdrv_drain_end) {
 440             bs->drv->bdrv_drain_end(bs);
 441         }
 442         bdrv_parent_drained_end(bs, parent);
 443     }
 444 }
 445
 446 void bdrv_drained_end(BlockDriverState *bs)
 447 {
 448     IO_OR_GS_CODE();
 449     bdrv_do_drained_end(bs, NULL);
 450 }
 451
 452 void bdrv_drain(BlockDriverState *bs)
 453 {
 454     IO_OR_GS_CODE();
 455     bdrv_drained_begin(bs);
 456     bdrv_drained_end(bs);
 457 }
 458
 459 static void bdrv_drain_assert_idle(BlockDriverState *bs)
 460 {
 461     BdrvChild *child, *next;
 462     GLOBAL_STATE_CODE();
 463     GRAPH_RDLOCK_GUARD_MAINLOOP();
 464
 465     assert(qatomic_read(&bs->in_flight) == 0);
 466     QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
 467         bdrv_drain_assert_idle(child->bs);
 468     }
 469 }
 470
/*
 * Nesting depth of "drain all" sections: incremented by
 * bdrv_drain_all_begin_nopoll() and decremented by bdrv_drain_all_end().
 */
unsigned int bdrv_drain_all_count = 0;
 472
 473 static bool bdrv_drain_all_poll(void)
 474 {
 475     BlockDriverState *bs = NULL;
 476     bool result = false;
 477
 478     GLOBAL_STATE_CODE();
 479     GRAPH_RDLOCK_GUARD_MAINLOOP();
 480
 481     /* bdrv_drain_poll() can't make changes to the graph and we are holding the
 482      * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
 483     while ((bs = bdrv_next_all_states(bs))) {
 484         AioContext *aio_context = bdrv_get_aio_context(bs);
 485         aio_context_acquire(aio_context);
 486         result |= bdrv_drain_poll(bs, NULL, true);
 487         aio_context_release(aio_context);
 488     }
 489
 490     return result;
 491 }
 492
 493 /*
 494  * Wait for pending requests to complete across all BlockDriverStates
 495  *
 496  * This function does not flush data to disk, use bdrv_flush_all() for that
 497  * after calling this function.
 498  *
 499  * This pauses all block jobs and disables external clients. It must
 500  * be paired with bdrv_drain_all_end().
 501  *
 502  * NOTE: no new block jobs or BlockDriverStates can be created between
 503  * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 504  */
 505 void bdrv_drain_all_begin_nopoll(void)
 506 {
 507     BlockDriverState *bs = NULL;
 508     GLOBAL_STATE_CODE();
 509
 510     /*
 511      * bdrv queue is managed by record/replay,
 512      * waiting for finishing the I/O requests may
 513      * be infinite
 514      */
 515     if (replay_events_enabled()) {
 516         return;
 517     }
 518
 519     /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
 520      * loop AioContext, so make sure we're in the main context. */
 521     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 522     assert(bdrv_drain_all_count < INT_MAX);
 523     bdrv_drain_all_count++;
 524
 525     /* Quiesce all nodes, without polling in-flight requests yet. The graph
 526      * cannot change during this loop. */
 527     while ((bs = bdrv_next_all_states(bs))) {
 528         AioContext *aio_context = bdrv_get_aio_context(bs);
 529
 530         aio_context_acquire(aio_context);
 531         bdrv_do_drained_begin(bs, NULL, false);
 532         aio_context_release(aio_context);
 533     }
 534 }
 535
 536 void coroutine_mixed_fn bdrv_drain_all_begin(void)
 537 {
 538     BlockDriverState *bs = NULL;
 539
 540     if (qemu_in_coroutine()) {
 541         bdrv_co_yield_to_drain(NULL, true, NULL, true);
 542         return;
 543     }
 544
 545     /*
 546      * bdrv queue is managed by record/replay,
 547      * waiting for finishing the I/O requests may
 548      * be infinite
 549      */
 550     if (replay_events_enabled()) {
 551         return;
 552     }
 553
 554     bdrv_drain_all_begin_nopoll();
 555
 556     /* Now poll the in-flight requests */
 557     AIO_WAIT_WHILE_UNLOCKED(NULL, bdrv_drain_all_poll());
 558
 559     while ((bs = bdrv_next_all_states(bs))) {
 560         bdrv_drain_assert_idle(bs);
 561     }
 562 }
 563
 564 void bdrv_drain_all_end_quiesce(BlockDriverState *bs)
 565 {
 566     GLOBAL_STATE_CODE();
 567
 568     g_assert(bs->quiesce_counter > 0);
 569     g_assert(!bs->refcnt);
 570
 571     while (bs->quiesce_counter) {
 572         bdrv_do_drained_end(bs, NULL);
 573     }
 574 }
 575
 576 void bdrv_drain_all_end(void)
 577 {
 578     BlockDriverState *bs = NULL;
 579     GLOBAL_STATE_CODE();
 580
 581     /*
 582      * bdrv queue is managed by record/replay,
 583      * waiting for finishing the I/O requests may
 584      * be endless
 585      */
 586     if (replay_events_enabled()) {
 587         return;
 588     }
 589
 590     while ((bs = bdrv_next_all_states(bs))) {
 591         AioContext *aio_context = bdrv_get_aio_context(bs);
 592
 593         aio_context_acquire(aio_context);
 594         bdrv_do_drained_end(bs, NULL);
 595         aio_context_release(aio_context);
 596     }
 597
 598     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 599     assert(bdrv_drain_all_count > 0);
 600     bdrv_drain_all_count--;
 601 }
 602
 603 void bdrv_drain_all(void)
 604 {
 605     GLOBAL_STATE_CODE();
 606     bdrv_drain_all_begin();
 607     bdrv_drain_all_end();
 608 }
 609
 610 /**
 611  * Remove an active request from the tracked requests list
 612  *
 613  * This function should be called when a tracked request is completing.
 614  */
 615 static void coroutine_fn tracked_request_end(BdrvTrackedRequest *req)
 616 {
 617     if (req->serialising) {
 618         qatomic_dec(&req->bs->serialising_in_flight);
 619     }
 620
 621     qemu_mutex_lock(&req->bs->reqs_lock);
 622     QLIST_REMOVE(req, list);
 623     qemu_mutex_unlock(&req->bs->reqs_lock);
 624
 625     /*
 626      * At this point qemu_co_queue_wait(&req->wait_queue, ...) won't be called
 627      * anymore because the request has been removed from the list, so it's safe
 628      * to restart the queue outside reqs_lock to minimize the critical section.
 629      */
 630     qemu_co_queue_restart_all(&req->wait_queue);
 631 }
 632
 633 /**
 634  * Add an active request to the tracked requests list
 635  */
 636 static void coroutine_fn tracked_request_begin(BdrvTrackedRequest *req,
 637                                                BlockDriverState *bs,
 638                                                int64_t offset,
 639                                                int64_t bytes,
 640                                                enum BdrvTrackedRequestType type)
 641 {
 642     bdrv_check_request(offset, bytes, &error_abort);
 643
 644     *req = (BdrvTrackedRequest){
 645         .bs = bs,
 646         .offset         = offset,
 647         .bytes          = bytes,
 648         .type           = type,
 649         .co             = qemu_coroutine_self(),
 650         .serialising    = false,
 651         .overlap_offset = offset,
 652         .overlap_bytes  = bytes,
 653     };
 654
 655     qemu_co_queue_init(&req->wait_queue);
 656
 657     qemu_mutex_lock(&bs->reqs_lock);
 658     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
 659     qemu_mutex_unlock(&bs->reqs_lock);
 660 }
 661
 662 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
 663                                      int64_t offset, int64_t bytes)
 664 {
 665     bdrv_check_request(offset, bytes, &error_abort);
 666
 667     /*        aaaa   bbbb */
 668     if (offset >= req->overlap_offset + req->overlap_bytes) {
 669         return false;
 670     }
 671     /* bbbb   aaaa        */
 672     if (req->overlap_offset >= offset + bytes) {
 673         return false;
 674     }
 675     return true;
 676 }
 677
 678 /* Called with self->bs->reqs_lock held */
 679 static coroutine_fn BdrvTrackedRequest *
 680 bdrv_find_conflicting_request(BdrvTrackedRequest *self)
 681 {
 682     BdrvTrackedRequest *req;
 683
 684     QLIST_FOREACH(req, &self->bs->tracked_requests, list) {
 685         if (req == self || (!req->serialising && !self->serialising)) {
 686             continue;
 687         }
 688         if (tracked_request_overlaps(req, self->overlap_offset,
 689                                      self->overlap_bytes))
 690         {
 691             /*
 692              * Hitting this means there was a reentrant request, for
 693              * example, a block driver issuing nested requests.  This must
 694              * never happen since it means deadlock.
 695              */
 696             assert(qemu_coroutine_self() != req->co);
 697
 698             /*
 699              * If the request is already (indirectly) waiting for us, or
 700              * will wait for us as soon as it wakes up, then just go on
 701              * (instead of producing a deadlock in the former case).
 702              */
 703             if (!req->waiting_for) {
 704                 return req;
 705             }
 706         }
 707     }
 708
 709     return NULL;
 710 }
 711
 712 /* Called with self->bs->reqs_lock held */
 713 static void coroutine_fn
 714 bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self)
 715 {
 716     BdrvTrackedRequest *req;
 717
 718     while ((req = bdrv_find_conflicting_request(self))) {
 719         self->waiting_for = req;
 720         qemu_co_queue_wait(&req->wait_queue, &self->bs->reqs_lock);
 721         self->waiting_for = NULL;
 722     }
 723 }
 724
 725 /* Called with req->bs->reqs_lock held */
 726 static void tracked_request_set_serialising(BdrvTrackedRequest *req,
 727                                             uint64_t align)
 728 {
 729     int64_t overlap_offset = req->offset & ~(align - 1);
 730     int64_t overlap_bytes =
 731         ROUND_UP(req->offset + req->bytes, align) - overlap_offset;
 732
 733     bdrv_check_request(req->offset, req->bytes, &error_abort);
 734
 735     if (!req->serialising) {
 736         qatomic_inc(&req->bs->serialising_in_flight);
 737         req->serialising = true;
 738     }
 739
 740     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
 741     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
 742 }
 743
 744 /**
 745  * Return the tracked request on @bs for the current coroutine, or
 746  * NULL if there is none.
 747  */
 748 BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
 749 {
 750     BdrvTrackedRequest *req;
 751     Coroutine *self = qemu_coroutine_self();
 752     IO_CODE();
 753
 754     QLIST_FOREACH(req, &bs->tracked_requests, list) {
 755         if (req->co == self) {
 756             return req;
 757         }
 758     }
 759
 760     return NULL;
 761 }
 762
 763 /**
 764  * Round a region to subcluster (if supported) or cluster boundaries
 765  */
 766 void coroutine_fn GRAPH_RDLOCK
 767 bdrv_round_to_subclusters(BlockDriverState *bs, int64_t offset, int64_t bytes,
 768                           int64_t *align_offset, int64_t *align_bytes)
 769 {
 770     BlockDriverInfo bdi;
 771     IO_CODE();
 772     if (bdrv_co_get_info(bs, &bdi) < 0 || bdi.subcluster_size == 0) {
 773         *align_offset = offset;
 774         *align_bytes = bytes;
 775     } else {
 776         int64_t c = bdi.subcluster_size;
 777         *align_offset = QEMU_ALIGN_DOWN(offset, c);
 778         *align_bytes = QEMU_ALIGN_UP(offset - *align_offset + bytes, c);
 779     }
 780 }
 781
 782 static int coroutine_fn GRAPH_RDLOCK bdrv_get_cluster_size(BlockDriverState *bs)
 783 {
 784     BlockDriverInfo bdi;
 785     int ret;
 786
 787     ret = bdrv_co_get_info(bs, &bdi);
 788     if (ret < 0 || bdi.cluster_size == 0) {
 789         return bs->bl.request_alignment;
 790     } else {
 791         return bdi.cluster_size;
 792     }
 793 }
 794
 795 void bdrv_inc_in_flight(BlockDriverState *bs)
 796 {
 797     IO_CODE();
 798     qatomic_inc(&bs->in_flight);
 799 }
 800
 801 void bdrv_wakeup(BlockDriverState *bs)
 802 {
 803     IO_CODE();
 804     aio_wait_kick();
 805 }
 806
 807 void bdrv_dec_in_flight(BlockDriverState *bs)
 808 {
 809     IO_CODE();
 810     qatomic_dec(&bs->in_flight);
 811     bdrv_wakeup(bs);
 812 }
 813
 814 static void coroutine_fn
 815 bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
 816 {
 817     BlockDriverState *bs = self->bs;
 818
 819     if (!qatomic_read(&bs->serialising_in_flight)) {
 820         return;
 821     }
 822
 823     qemu_mutex_lock(&bs->reqs_lock);
 824     bdrv_wait_serialising_requests_locked(self);
 825     qemu_mutex_unlock(&bs->reqs_lock);
 826 }
 827
 828 void coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
 829                                                 uint64_t align)
 830 {
 831     IO_CODE();
 832
 833     qemu_mutex_lock(&req->bs->reqs_lock);
 834
 835     tracked_request_set_serialising(req, align);
 836     bdrv_wait_serialising_requests_locked(req);
 837
 838     qemu_mutex_unlock(&req->bs->reqs_lock);
 839 }
 840
 841 int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
 842                             QEMUIOVector *qiov, size_t qiov_offset,
 843                             Error **errp)
 844 {
 845     /*
 846      * Check generic offset/bytes correctness
 847      */
 848
 849     if (offset < 0) {
 850         error_setg(errp, "offset is negative: %" PRIi64, offset);
 851         return -EIO;
 852     }
 853
 854     if (bytes < 0) {
 855         error_setg(errp, "bytes is negative: %" PRIi64, bytes);
 856         return -EIO;
 857     }
 858
 859     if (bytes > BDRV_MAX_LENGTH) {
 860         error_setg(errp, "bytes(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
 861                    bytes, BDRV_MAX_LENGTH);
 862         return -EIO;
 863     }
 864
 865     if (offset > BDRV_MAX_LENGTH) {
 866         error_setg(errp, "offset(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
 867                    offset, BDRV_MAX_LENGTH);
 868         return -EIO;
 869     }
 870
 871     if (offset > BDRV_MAX_LENGTH - bytes) {
 872         error_setg(errp, "sum of offset(%" PRIi64 ") and bytes(%" PRIi64 ") "
 873                    "exceeds maximum(%" PRIi64 ")", offset, bytes,
 874                    BDRV_MAX_LENGTH);
 875         return -EIO;
 876     }
 877
 878     if (!qiov) {
 879         return 0;
 880     }
 881
 882     /*
 883      * Check qiov and qiov_offset
 884      */
 885
 886     if (qiov_offset > qiov->size) {
 887         error_setg(errp, "qiov_offset(%zu) overflow io vector size(%zu)",
 888                    qiov_offset, qiov->size);
 889         return -EIO;
 890     }
 891
 892     if (bytes > qiov->size - qiov_offset) {
 893         error_setg(errp, "bytes(%" PRIi64 ") + qiov_offset(%zu) overflow io "
 894                    "vector size(%zu)", bytes, qiov_offset, qiov->size);
 895         return -EIO;
 896     }
 897
 898     return 0;
 899 }
 900
 901 int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp)
 902 {
 903     return bdrv_check_qiov_request(offset, bytes, NULL, 0, errp);
 904 }
 905
 906 static int bdrv_check_request32(int64_t offset, int64_t bytes,
 907                                 QEMUIOVector *qiov, size_t qiov_offset)
 908 {
 909     int ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
 910     if (ret < 0) {
 911         return ret;
 912     }
 913
 914     if (bytes > BDRV_REQUEST_MAX_BYTES) {
 915         return -EIO;
 916     }
 917
 918     return 0;
 919 }
 920
 921 /*
 922  * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 923  * The operation is sped up by checking the block status and only writing
 924  * zeroes to the device if they currently do not return zeroes. Optional
 925  * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 926  * BDRV_REQ_FUA).
 927  *
 928  * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
 929  */
 930 int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
 931 {
 932     int ret;
 933     int64_t target_size, bytes, offset = 0;
 934     BlockDriverState *bs = child->bs;
 935     IO_CODE();
 936
 937     target_size = bdrv_getlength(bs);
 938     if (target_size < 0) {
 939         return target_size;
 940     }
 941
 942     for (;;) {
 943         bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
 944         if (bytes <= 0) {
 945             return 0;
 946         }
 947         ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
 948         if (ret < 0) {
 949             return ret;
 950         }
 951         if (ret & BDRV_BLOCK_ZERO) {
 952             offset += bytes;
 953             continue;
 954         }
 955         ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
 956         if (ret < 0) {
 957             return ret;
 958         }
 959         offset += bytes;
 960     }
 961 }
 962
 963 /*
 964  * Writes to the file and ensures that no writes are reordered across this
 965  * request (acts as a barrier)
 966  *
 967  * Returns 0 on success, -errno in error cases.
 968  */
 969 int coroutine_fn bdrv_co_pwrite_sync(BdrvChild *child, int64_t offset,
 970                                      int64_t bytes, const void *buf,
 971                                      BdrvRequestFlags flags)
 972 {
 973     int ret;
 974     IO_CODE();
 975     assert_bdrv_graph_readable();
 976
 977     ret = bdrv_co_pwrite(child, offset, bytes, buf, flags);
 978     if (ret < 0) {
 979         return ret;
 980     }
 981
 982     ret = bdrv_co_flush(child->bs);
 983     if (ret < 0) {
 984         return ret;
 985     }
 986
 987     return 0;
 988 }
 989
/*
 * Completion record shared between a coroutine that issued a callback-based
 * (AIO) driver request and bdrv_co_io_em_complete(), which fills it in.
 */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine to wake once the request completes */
    int ret;                /* result handed to the completion callback */
} CoroutineIOCompletion;
 994
 995 static void bdrv_co_io_em_complete(void *opaque, int ret)
 996 {
 997     CoroutineIOCompletion *co = opaque;
 998
 999     co->ret = ret;
1000     aio_co_wake(co->coroutine);
1001 }
1002
1003 static int coroutine_fn GRAPH_RDLOCK
1004 bdrv_driver_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
1005                    QEMUIOVector *qiov, size_t qiov_offset, int flags)
1006 {
1007     BlockDriver *drv = bs->drv;
1008     int64_t sector_num;
1009     unsigned int nb_sectors;
1010     QEMUIOVector local_qiov;
1011     int ret;
1012     assert_bdrv_graph_readable();
1013
1014     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
1015     assert(!(flags & ~bs->supported_read_flags));
1016
1017     if (!drv) {
1018         return -ENOMEDIUM;
1019     }
1020
1021     if (drv->bdrv_co_preadv_part) {
1022         return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
1023                                         flags);
1024     }
1025
1026     if (qiov_offset > 0 || bytes != qiov->size) {
1027         qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1028         qiov = &local_qiov;
1029     }
1030
1031     if (drv->bdrv_co_preadv) {
1032         ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
1033         goto out;
1034     }
1035
1036     if (drv->bdrv_aio_preadv) {
1037         BlockAIOCB *acb;
1038         CoroutineIOCompletion co = {
1039             .coroutine = qemu_coroutine_self(),
1040         };
1041
1042         acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
1043                                    bdrv_co_io_em_complete, &co);
1044         if (acb == NULL) {
1045             ret = -EIO;
1046             goto out;
1047         } else {
1048             qemu_coroutine_yield();
1049             ret = co.ret;
1050             goto out;
1051         }
1052     }
1053
1054     sector_num = offset >> BDRV_SECTOR_BITS;
1055     nb_sectors = bytes >> BDRV_SECTOR_BITS;
1056
1057     assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
1058     assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
1059     assert(bytes <= BDRV_REQUEST_MAX_BYTES);
1060     assert(drv->bdrv_co_readv);
1061
1062     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1063
1064 out:
1065     if (qiov == &local_qiov) {
1066         qemu_iovec_destroy(&local_qiov);
1067     }
1068
1069     return ret;
1070 }
1071
1072 static int coroutine_fn GRAPH_RDLOCK
1073 bdrv_driver_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
1074                     QEMUIOVector *qiov, size_t qiov_offset,
1075                     BdrvRequestFlags flags)
1076 {
1077     BlockDriver *drv = bs->drv;
1078     bool emulate_fua = false;
1079     int64_t sector_num;
1080     unsigned int nb_sectors;
1081     QEMUIOVector local_qiov;
1082     int ret;
1083     assert_bdrv_graph_readable();
1084
1085     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
1086
1087     if (!drv) {
1088         return -ENOMEDIUM;
1089     }
1090
1091     if ((flags & BDRV_REQ_FUA) &&
1092         (~bs->supported_write_flags & BDRV_REQ_FUA)) {
1093         flags &= ~BDRV_REQ_FUA;
1094         emulate_fua = true;
1095     }
1096
1097     flags &= bs->supported_write_flags;
1098
1099     if (drv->bdrv_co_pwritev_part) {
1100         ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
1101                                         flags);
1102         goto emulate_flags;
1103     }
1104
1105     if (qiov_offset > 0 || bytes != qiov->size) {
1106         qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1107         qiov = &local_qiov;
1108     }
1109
1110     if (drv->bdrv_co_pwritev) {
1111         ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, flags);
1112         goto emulate_flags;
1113     }
1114
1115     if (drv->bdrv_aio_pwritev) {
1116         BlockAIOCB *acb;
1117         CoroutineIOCompletion co = {
1118             .coroutine = qemu_coroutine_self(),
1119         };
1120
1121         acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, flags,
1122                                     bdrv_co_io_em_complete, &co);
1123         if (acb == NULL) {
1124             ret = -EIO;
1125         } else {
1126             qemu_coroutine_yield();
1127             ret = co.ret;
1128         }
1129         goto emulate_flags;
1130     }
1131
1132     sector_num = offset >> BDRV_SECTOR_BITS;
1133     nb_sectors = bytes >> BDRV_SECTOR_BITS;
1134
1135     assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
1136     assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
1137     assert(bytes <= BDRV_REQUEST_MAX_BYTES);
1138
1139     assert(drv->bdrv_co_writev);
1140     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, flags);
1141
1142 emulate_flags:
1143     if (ret == 0 && emulate_fua) {
1144         ret = bdrv_co_flush(bs);
1145     }
1146
1147     if (qiov == &local_qiov) {
1148         qemu_iovec_destroy(&local_qiov);
1149     }
1150
1151     return ret;
1152 }
1153
1154 static int coroutine_fn GRAPH_RDLOCK
1155 bdrv_driver_pwritev_compressed(BlockDriverState *bs, int64_t offset,
1156                                int64_t bytes, QEMUIOVector *qiov,
1157                                size_t qiov_offset)
1158 {
1159     BlockDriver *drv = bs->drv;
1160     QEMUIOVector local_qiov;
1161     int ret;
1162     assert_bdrv_graph_readable();
1163
1164     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
1165
1166     if (!drv) {
1167         return -ENOMEDIUM;
1168     }
1169
1170     if (!block_driver_can_compress(drv)) {
1171         return -ENOTSUP;
1172     }
1173
1174     if (drv->bdrv_co_pwritev_compressed_part) {
1175         return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes,
1176                                                     qiov, qiov_offset);
1177     }
1178
1179     if (qiov_offset == 0) {
1180         return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
1181     }
1182
1183     qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1184     ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
1185     qemu_iovec_destroy(&local_qiov);
1186
1187     return ret;
1188 }
1189
/*
 * Serve a read request with copy-on-read.
 *
 * The request is widened with bdrv_round_to_subclusters() and processed
 * piecewise.  For every piece, bdrv_co_is_allocated() decides what happens:
 *  - not allocated (or the query failed): the piece is read into a bounce
 *    buffer, written back to @bs with BDRV_REQ_WRITE_UNCHANGED (as zeroes if
 *    the driver has bdrv_co_pwrite_zeroes and the buffer is all zero), and
 *    then copied into @qiov;
 *  - allocated: the piece is read straight into @qiov.
 * With BDRV_REQ_PREFETCH in @flags, @qiov is never written to or read from.
 * If the node is inactive (BDRV_O_INACTIVE), nothing is written back and
 * every piece is treated as allocated.
 *
 * Returns 0 on success, -ENOMEDIUM if no driver is attached, -ENOMEM if the
 * bounce buffer cannot be allocated, or the negative error of the failing
 * read or write.
 */
static int coroutine_fn GRAPH_RDLOCK
bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
                         QEMUIOVector *qiov, size_t qiov_offset, int flags)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer = NULL;

    BlockDriver *drv = bs->drv;
    int64_t align_offset;
    int64_t align_bytes;
    int64_t skip_bytes;
    int ret;
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    /* Number of bytes of the original (unrounded) request handled so far */
    int64_t progress = 0;
    bool skip_write;

    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);

    if (!drv) {
        return -ENOMEDIUM;
    }

    /*
     * Do not write anything when the BDS is inactive.  That is not
     * allowed, and it would not help.
     */
    skip_write = (bs->open_flags & BDRV_O_INACTIVE);

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * the copy-on-read code doesn't have its own BdrvChild, however, for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.  Note that this value may exceed
     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
     * is one reason we loop rather than doing it all at once.
     */
    bdrv_round_to_subclusters(bs, offset, bytes, &align_offset, &align_bytes);
    /* Bytes between the rounded-down start and the caller's @offset */
    skip_bytes = offset - align_offset;

    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   align_offset, align_bytes);

    while (align_bytes) {
        /* Length of the piece handled in this iteration */
        int64_t pnum;

        if (skip_write) {
            ret = 1; /* "already allocated", so nothing will be copied */
            pnum = MIN(align_bytes, max_transfer);
        } else {
            ret = bdrv_co_is_allocated(bs, align_offset,
                                       MIN(align_bytes, max_transfer), &pnum);
            if (ret < 0) {
                /*
                 * Safe to treat errors in querying allocation as if
                 * unallocated; we'll probably fail again soon on the
                 * read, but at least that will set a decent errno.
                 */
                pnum = MIN(align_bytes, max_transfer);
            }

            /* Stop at EOF if the image ends in the middle of the cluster */
            if (ret == 0 && pnum == 0) {
                assert(progress >= bytes);
                break;
            }

            assert(skip_bytes < pnum);
        }

        if (ret <= 0) {
            QEMUIOVector local_qiov;

            /* Must copy-on-read; use the bounce buffer */
            pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
            if (!bounce_buffer) {
                /*
                 * Allocated once and reused by later iterations: sized for
                 * the larger of this piece and what remains after it, capped
                 * by max_transfer and MAX_BOUNCE_BUFFER.
                 */
                int64_t max_we_need = MAX(pnum, align_bytes - pnum);
                int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
                int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);

                bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len);
                if (!bounce_buffer) {
                    ret = -ENOMEM;
                    goto err;
                }
            }
            qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);

            ret = bdrv_driver_preadv(bs, align_offset, pnum,
                                     &local_qiov, 0, 0);
            if (ret < 0) {
                goto err;
            }

            bdrv_co_debug_event(bs, BLKDBG_COR_WRITE);
            if (drv->bdrv_co_pwrite_zeroes &&
                buffer_is_zero(bounce_buffer, pnum)) {
                /* FIXME: Should we (perhaps conditionally) be setting
                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                 * that still correctly reads as zero? */
                ret = bdrv_co_do_pwrite_zeroes(bs, align_offset, pnum,
                                               BDRV_REQ_WRITE_UNCHANGED);
            } else {
                /* This does not change the data on the disk, it is not
                 * necessary to flush even in cache=writethrough mode.
                 */
                ret = bdrv_driver_pwritev(bs, align_offset, pnum,
                                          &local_qiov, 0,
                                          BDRV_REQ_WRITE_UNCHANGED);
            }

            if (ret < 0) {
                /* It might be okay to ignore write errors for guest
                 * requests.  If this is a deliberate copy-on-read
                 * then we don't want to ignore the error.  Simply
                 * report it in all cases.
                 */
                goto err;
            }

            if (!(flags & BDRV_REQ_PREFETCH)) {
                /* Copy only the part of the piece the caller asked for */
                qemu_iovec_from_buf(qiov, qiov_offset + progress,
                                    bounce_buffer + skip_bytes,
                                    MIN(pnum - skip_bytes, bytes - progress));
            }
        } else if (!(flags & BDRV_REQ_PREFETCH)) {
            /* Read directly into the destination */
            ret = bdrv_driver_preadv(bs, offset + progress,
                                     MIN(pnum - skip_bytes, bytes - progress),
                                     qiov, qiov_offset + progress, 0);
            if (ret < 0) {
                goto err;
            }
        }

        align_offset += pnum;
        align_bytes -= pnum;
        progress += pnum - skip_bytes;
        /* Only the first piece starts before the caller's @offset */
        skip_bytes = 0;
    }
    ret = 0;

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
1348
1349 /*
1350  * Forwards an already correctly aligned request to the BlockDriver. This
1351  * handles copy on read, zeroing after EOF, and fragmentation of large
1352  * reads; any other features must be implemented by the caller.
1353  */
1354 static int coroutine_fn GRAPH_RDLOCK
1355 bdrv_aligned_preadv(BdrvChild *child, BdrvTrackedRequest *req,
1356                     int64_t offset, int64_t bytes, int64_t align,
1357                     QEMUIOVector *qiov, size_t qiov_offset, int flags)
1358 {
1359     BlockDriverState *bs = child->bs;
1360     int64_t total_bytes, max_bytes;
1361     int ret = 0;
1362     int64_t bytes_remaining = bytes;
1363     int max_transfer;
1364
1365     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
1366     assert(is_power_of_2(align));
1367     assert((offset & (align - 1)) == 0);
1368     assert((bytes & (align - 1)) == 0);
1369     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1370     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1371                                    align);
1372
1373     /*
1374      * TODO: We would need a per-BDS .supported_read_flags and
1375      * potential fallback support, if we ever implement any read flags
1376      * to pass through to drivers.  For now, there aren't any
1377      * passthrough flags except the BDRV_REQ_REGISTERED_BUF optimization hint.
1378      */
1379     assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH |
1380                        BDRV_REQ_REGISTERED_BUF)));
1381
1382     /* Handle Copy on Read and associated serialisation */
1383     if (flags & BDRV_REQ_COPY_ON_READ) {
1384         /* If we touch the same cluster it counts as an overlap.  This
1385          * guarantees that allocating writes will be serialized and not race
1386          * with each other for the same cluster.  For example, in copy-on-read
1387          * it ensures that the CoR read and write operations are atomic and
1388          * guest writes cannot interleave between them. */
1389         bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs));
1390     } else {
1391         bdrv_wait_serialising_requests(req);
1392     }
1393
1394     if (flags & BDRV_REQ_COPY_ON_READ) {
1395         int64_t pnum;
1396
1397         /* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */
1398         flags &= ~BDRV_REQ_COPY_ON_READ;
1399
1400         ret = bdrv_co_is_allocated(bs, offset, bytes, &pnum);
1401         if (ret < 0) {
1402             goto out;
1403         }
1404
1405         if (!ret || pnum != bytes) {
1406             ret = bdrv_co_do_copy_on_readv(child, offset, bytes,
1407                                            qiov, qiov_offset, flags);
1408             goto out;
1409         } else if (flags & BDRV_REQ_PREFETCH) {
1410             goto out;
1411         }
1412     }
1413
1414     /* Forward the request to the BlockDriver, possibly fragmenting it */
1415     total_bytes = bdrv_co_getlength(bs);
1416     if (total_bytes < 0) {
1417         ret = total_bytes;
1418         goto out;
1419     }
1420
1421     assert(!(flags & ~(bs->supported_read_flags | BDRV_REQ_REGISTERED_BUF)));
1422
1423     max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
1424     if (bytes <= max_bytes && bytes <= max_transfer) {
1425         ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, flags);
1426         goto out;
1427     }
1428
1429     while (bytes_remaining) {
1430         int64_t num;
1431
1432         if (max_bytes) {
1433             num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
1434             assert(num);
1435
1436             ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
1437                                      num, qiov,
1438                                      qiov_offset + bytes - bytes_remaining,
1439                                      flags);
1440             max_bytes -= num;
1441         } else {
1442             num = bytes_remaining;
1443             ret = qemu_iovec_memset(qiov, qiov_offset + bytes - bytes_remaining,
1444                                     0, bytes_remaining);
1445         }
1446         if (ret < 0) {
1447             goto out;
1448         }
1449         bytes_remaining -= num;
1450     }
1451
1452 out:
1453     return ret < 0 ? ret : 0;
1454 }
1455
/*
 * Request padding
 *
 *  |<---- align ----->|                     |<----- align ---->|
 *  |<- head ->|<------------- bytes ------------->|<-- tail -->|
 *  |          |       |                     |     |            |
 * -*----------$-------*-------- ... --------*-----$------------*---
 *  |          |       |                     |     |            |
 *  |          offset  |                     |     end          |
 *  ALIGN_DOWN(offset) ALIGN_UP(offset)      ALIGN_DOWN(end)   ALIGN_UP(end)
 *  [buf   ... )                             [tail_buf          )
 *
 * @buf is an aligned allocation needed to store @head and @tail paddings. @head
 * is placed at the beginning of @buf and @tail at the @end.
 *
 * @tail_buf is a pointer to sub-buffer, corresponding to align-sized chunk
 * around tail, if tail exists.
 *
 * @merge_reads is true for small requests,
 * if @buf_len == @head + bytes + @tail. In this case it is possible that both
 * head and tail exist but @buf_len == align and @tail_buf == @buf.
 *
 * @write is true for write requests, false for read requests.
 *
 * If padding makes the vector too long (exceeding IOV_MAX), then we need to
 * merge existing vector elements into a single one.  @collapse_bounce_buf acts
 * as the bounce buffer in such cases.  @pre_collapse_qiov has the pre-collapse
 * I/O vector elements so for read requests, the data can be copied back after
 * the read is done.
 */
typedef struct BdrvRequestPadding {
    uint8_t *buf;           /* padding storage, set up by bdrv_init_padding() */
    size_t buf_len;         /* size of @buf: one or two times the alignment */
    uint8_t *tail_buf;      /* last align-sized chunk of @buf, if @tail != 0 */
    size_t head;            /* padding bytes in front of the request */
    size_t tail;            /* padding bytes behind the request */
    bool merge_reads;       /* head, payload and tail fit exactly into @buf */
    bool write;             /* true for writes, false for reads */
    QEMUIOVector local_qiov; /* padded vector: head + payload + tail */

    /* Only used when the padded vector would exceed IOV_MAX elements */
    uint8_t *collapse_bounce_buf;   /* replaces the merged leading elements */
    size_t collapse_len;            /* size of @collapse_bounce_buf */
    QEMUIOVector pre_collapse_qiov; /* the elements that were merged */
} BdrvRequestPadding;
1500
1501 static bool bdrv_init_padding(BlockDriverState *bs,
1502                               int64_t offset, int64_t bytes,
1503                               bool write,
1504                               BdrvRequestPadding *pad)
1505 {
1506     int64_t align = bs->bl.request_alignment;
1507     int64_t sum;
1508
1509     bdrv_check_request(offset, bytes, &error_abort);
1510     assert(align <= INT_MAX); /* documented in block/block_int.h */
1511     assert(align <= SIZE_MAX / 2); /* so we can allocate the buffer */
1512
1513     memset(pad, 0, sizeof(*pad));
1514
1515     pad->head = offset & (align - 1);
1516     pad->tail = ((offset + bytes) & (align - 1));
1517     if (pad->tail) {
1518         pad->tail = align - pad->tail;
1519     }
1520
1521     if (!pad->head && !pad->tail) {
1522         return false;
1523     }
1524
1525     assert(bytes); /* Nothing good in aligning zero-length requests */
1526
1527     sum = pad->head + bytes + pad->tail;
1528     pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align;
1529     pad->buf = qemu_blockalign(bs, pad->buf_len);
1530     pad->merge_reads = sum == pad->buf_len;
1531     if (pad->tail) {
1532         pad->tail_buf = pad->buf + pad->buf_len - align;
1533     }
1534
1535     pad->write = write;
1536
1537     return true;
1538 }
1539
1540 static int coroutine_fn GRAPH_RDLOCK
1541 bdrv_padding_rmw_read(BdrvChild *child, BdrvTrackedRequest *req,
1542                       BdrvRequestPadding *pad, bool zero_middle)
1543 {
1544     QEMUIOVector local_qiov;
1545     BlockDriverState *bs = child->bs;
1546     uint64_t align = bs->bl.request_alignment;
1547     int ret;
1548
1549     assert(req->serialising && pad->buf);
1550
1551     if (pad->head || pad->merge_reads) {
1552         int64_t bytes = pad->merge_reads ? pad->buf_len : align;
1553
1554         qemu_iovec_init_buf(&local_qiov, pad->buf, bytes);
1555
1556         if (pad->head) {
1557             bdrv_co_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1558         }
1559         if (pad->merge_reads && pad->tail) {
1560             bdrv_co_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1561         }
1562         ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes,
1563                                   align, &local_qiov, 0, 0);
1564         if (ret < 0) {
1565             return ret;
1566         }
1567         if (pad->head) {
1568             bdrv_co_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1569         }
1570         if (pad->merge_reads && pad->tail) {
1571             bdrv_co_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1572         }
1573
1574         if (pad->merge_reads) {
1575             goto zero_mem;
1576         }
1577     }
1578
1579     if (pad->tail) {
1580         qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align);
1581
1582         bdrv_co_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1583         ret = bdrv_aligned_preadv(
1584                 child, req,
1585                 req->overlap_offset + req->overlap_bytes - align,
1586                 align, align, &local_qiov, 0, 0);
1587         if (ret < 0) {
1588             return ret;
1589         }
1590         bdrv_co_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1591     }
1592
1593 zero_mem:
1594     if (zero_middle) {
1595         memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail);
1596     }
1597
1598     return 0;
1599 }
1600
1601 /**
1602  * Free *pad's associated buffers, and perform any necessary finalization steps.
1603  */
1604 static void bdrv_padding_finalize(BdrvRequestPadding *pad)
1605 {
1606     if (pad->collapse_bounce_buf) {
1607         if (!pad->write) {
1608             /*
1609              * If padding required elements in the vector to be collapsed into a
1610              * bounce buffer, copy the bounce buffer content back
1611              */
1612             qemu_iovec_from_buf(&pad->pre_collapse_qiov, 0,
1613                                 pad->collapse_bounce_buf, pad->collapse_len);
1614         }
1615         qemu_vfree(pad->collapse_bounce_buf);
1616         qemu_iovec_destroy(&pad->pre_collapse_qiov);
1617     }
1618     if (pad->buf) {
1619         qemu_vfree(pad->buf);
1620         qemu_iovec_destroy(&pad->local_qiov);
1621     }
1622     memset(pad, 0, sizeof(*pad));
1623 }
1624
1625 /*
1626  * Create pad->local_qiov by wrapping @iov in the padding head and tail, while
1627  * ensuring that the resulting vector will not exceed IOV_MAX elements.
1628  *
1629  * To ensure this, when necessary, the first two or three elements of @iov are
1630  * merged into pad->collapse_bounce_buf and replaced by a reference to that
1631  * bounce buffer in pad->local_qiov.
1632  *
1633  * After performing a read request, the data from the bounce buffer must be
1634  * copied back into pad->pre_collapse_qiov (e.g. by bdrv_padding_finalize()).
1635  */
1636 static int bdrv_create_padded_qiov(BlockDriverState *bs,
1637                                    BdrvRequestPadding *pad,
1638                                    struct iovec *iov, int niov,
1639                                    size_t iov_offset, size_t bytes)
1640 {
1641     int padded_niov, surplus_count, collapse_count;
1642
1643     /* Assert this invariant */
1644     assert(niov <= IOV_MAX);
1645
1646     /*
1647      * Cannot pad if resulting length would exceed SIZE_MAX.  Returning an error
1648      * to the guest is not ideal, but there is little else we can do.  At least
1649      * this will practically never happen on 64-bit systems.
1650      */
1651     if (SIZE_MAX - pad->head < bytes ||
1652         SIZE_MAX - pad->head - bytes < pad->tail)
1653     {
1654         return -EINVAL;
1655     }
1656
1657     /* Length of the resulting IOV if we just concatenated everything */
1658     padded_niov = !!pad->head + niov + !!pad->tail;
1659
1660     qemu_iovec_init(&pad->local_qiov, MIN(padded_niov, IOV_MAX));
1661
1662     if (pad->head) {
1663         qemu_iovec_add(&pad->local_qiov, pad->buf, pad->head);
1664     }
1665
1666     /*
1667      * If padded_niov > IOV_MAX, we cannot just concatenate everything.
1668      * Instead, merge the first two or three elements of @iov to reduce the
1669      * number of vector elements as necessary.
1670      */
1671     if (padded_niov > IOV_MAX) {
1672         /*
1673          * Only head and tail can have lead to the number of entries exceeding
1674          * IOV_MAX, so we can exceed it by the head and tail at most.  We need
1675          * to reduce the number of elements by `surplus_count`, so we merge that
1676          * many elements plus one into one element.
1677          */
1678         surplus_count = padded_niov - IOV_MAX;
1679         assert(surplus_count <= !!pad->head + !!pad->tail);
1680         collapse_count = surplus_count + 1;
1681
1682         /*
1683          * Move the elements to collapse into `pad->pre_collapse_qiov`, then
1684          * advance `iov` (and associated variables) by those elements.
1685          */
1686         qemu_iovec_init(&pad->pre_collapse_qiov, collapse_count);
1687         qemu_iovec_concat_iov(&pad->pre_collapse_qiov, iov,
1688                               collapse_count, iov_offset, SIZE_MAX);
1689         iov += collapse_count;
1690         iov_offset = 0;
1691         niov -= collapse_count;
1692         bytes -= pad->pre_collapse_qiov.size;
1693
1694         /*
1695          * Construct the bounce buffer to match the length of the to-collapse
1696          * vector elements, and for write requests, initialize it with the data
1697          * from those elements.  Then add it to `pad->local_qiov`.
1698          */
1699         pad->collapse_len = pad->pre_collapse_qiov.size;
1700         pad->collapse_bounce_buf = qemu_blockalign(bs, pad->collapse_len);
1701         if (pad->write) {
1702             qemu_iovec_to_buf(&pad->pre_collapse_qiov, 0,
1703                               pad->collapse_bounce_buf, pad->collapse_len);
1704         }
1705         qemu_iovec_add(&pad->local_qiov,
1706                        pad->collapse_bounce_buf, pad->collapse_len);
1707     }
1708
1709     qemu_iovec_concat_iov(&pad->local_qiov, iov, niov, iov_offset, bytes);
1710
1711     if (pad->tail) {
1712         qemu_iovec_add(&pad->local_qiov,
1713                        pad->buf + pad->buf_len - pad->tail, pad->tail);
1714     }
1715
1716     assert(pad->local_qiov.niov == MIN(padded_niov, IOV_MAX));
1717     return 0;
1718 }
1719
1720 /*
1721  * bdrv_pad_request
1722  *
1723  * Exchange request parameters with padded request if needed. Don't include RMW
1724  * read of padding, bdrv_padding_rmw_read() should be called separately if
1725  * needed.
1726  *
1727  * @write is true for write requests, false for read requests.
1728  *
1729  * Request parameters (@qiov, &qiov_offset, &offset, &bytes) are in-out:
1730  *  - on function start they represent original request
1731  *  - on failure or when padding is not needed they are unchanged
1732  *  - on success when padding is needed they represent padded request
1733  */
1734 static int bdrv_pad_request(BlockDriverState *bs,
1735                             QEMUIOVector **qiov, size_t *qiov_offset,
1736                             int64_t *offset, int64_t *bytes,
1737                             bool write,
1738                             BdrvRequestPadding *pad, bool *padded,
1739                             BdrvRequestFlags *flags)
1740 {
1741     int ret;
1742     struct iovec *sliced_iov;
1743     int sliced_niov;
1744     size_t sliced_head, sliced_tail;
1745
1746     /* Should have been checked by the caller already */
1747     ret = bdrv_check_request32(*offset, *bytes, *qiov, *qiov_offset);
1748     if (ret < 0) {
1749         return ret;
1750     }
1751
1752     if (!bdrv_init_padding(bs, *offset, *bytes, write, pad)) {
1753         if (padded) {
1754             *padded = false;
1755         }
1756         return 0;
1757     }
1758
1759     sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
1760                                   &sliced_head, &sliced_tail,
1761                                   &sliced_niov);
1762
1763     /* Guaranteed by bdrv_check_request32() */
1764     assert(*bytes <= SIZE_MAX);
1765     ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
1766                                   sliced_head, *bytes);
1767     if (ret < 0) {
1768         bdrv_padding_finalize(pad);
1769         return ret;
1770     }
1771     *bytes += pad->head + pad->tail;
1772     *offset -= pad->head;
1773     *qiov = &pad->local_qiov;
1774     *qiov_offset = 0;
1775     if (padded) {
1776         *padded = true;
1777     }
1778     if (flags) {
1779         /* Can't use optimization hint with bounce buffer */
1780         *flags &= ~BDRV_REQ_REGISTERED_BUF;
1781     }
1782
1783     return 0;
1784 }
1785
1786 int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1787     int64_t offset, int64_t bytes, QEMUIOVector *qiov,
1788     BdrvRequestFlags flags)
1789 {
1790     IO_CODE();
1791     return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
1792 }
1793
1794 int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
1795     int64_t offset, int64_t bytes,
1796     QEMUIOVector *qiov, size_t qiov_offset,
1797     BdrvRequestFlags flags)
1798 {
1799     BlockDriverState *bs = child->bs;
1800     BdrvTrackedRequest req;
1801     BdrvRequestPadding pad;
1802     int ret;
1803     IO_CODE();
1804
1805     trace_bdrv_co_preadv_part(bs, offset, bytes, flags);
1806
1807     if (!bdrv_co_is_inserted(bs)) {
1808         return -ENOMEDIUM;
1809     }
1810
1811     ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
1812     if (ret < 0) {
1813         return ret;
1814     }
1815
1816     if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
1817         /*
1818          * Aligning zero request is nonsense. Even if driver has special meaning
1819          * of zero-length (like qcow2_co_pwritev_compressed_part), we can't pass
1820          * it to driver due to request_alignment.
1821          *
1822          * Still, no reason to return an error if someone do unaligned
1823          * zero-length read occasionally.
1824          */
1825         return 0;
1826     }
1827
1828     bdrv_inc_in_flight(bs);
1829
1830     /* Don't do copy-on-read if we read data before write operation */
1831     if (qatomic_read(&bs->copy_on_read)) {
1832         flags |= BDRV_REQ_COPY_ON_READ;
1833     }
1834
1835     ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, false,
1836                            &pad, NULL, &flags);
1837     if (ret < 0) {
1838         goto fail;
1839     }
1840
1841     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1842     ret = bdrv_aligned_preadv(child, &req, offset, bytes,
1843                               bs->bl.request_alignment,
1844                               qiov, qiov_offset, flags);
1845     tracked_request_end(&req);
1846     bdrv_padding_finalize(&pad);
1847
1848 fail:
1849     bdrv_dec_in_flight(bs);
1850
1851     return ret;
1852 }
1853
/*
 * Write zeroes to @bytes bytes of @bs starting at @offset.
 *
 * The range is handled in chunks: an unaligned head up to the next
 * alignment boundary, then pieces of at most max_write_zeroes bytes, with
 * the request shortened so that an unaligned tail is handled separately.
 * Each chunk is first offered to the driver's bdrv_co_pwrite_zeroes
 * callback (if there is one).  When the outcome is -ENOTSUP and
 * BDRV_REQ_NO_FALLBACK is not set, the chunk is instead written from a
 * bounce buffer through bdrv_driver_pwritev(), at most max_transfer bytes
 * at a time.
 *
 * Returns 0 on success or a negative errno.  Errors produced directly in
 * this function: -ENOMEDIUM (no driver), -ENOTSUP (BDRV_REQ_NO_FALLBACK
 * requested but missing from bs->supported_zero_flags, or a chunk could
 * not be zeroed by the driver while fallback is forbidden), -EINVAL
 * (BDRV_REQ_REGISTERED_BUF set) and -ENOMEM (bounce buffer allocation
 * failed).  If need_flush was set along the way, one bdrv_co_flush() is
 * issued at the end, provided everything else succeeded.
 */
1854 static int coroutine_fn GRAPH_RDLOCK
1855 bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
1856                          BdrvRequestFlags flags)
1857 {
1858     BlockDriver *drv = bs->drv;
1859     QEMUIOVector qiov;
1860     void *buf = NULL;
1861     int ret = 0;
1862     bool need_flush = false;
1863     int head = 0;
1864     int tail = 0;
1865
1866     int64_t max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes,
1867                                             INT64_MAX);
1868     int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1869                         bs->bl.request_alignment);
         /* Cap for a single bounce-buffer write in the fallback path */
1870     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
1871
1872     assert_bdrv_graph_readable();
1873     bdrv_check_request(offset, bytes, &error_abort);
1874
1875     if (!drv) {
1876         return -ENOMEDIUM;
1877     }
1878
         /* Caller forbids the fallback, but the driver does not honour that */
1879     if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
1880         return -ENOTSUP;
1881     }
1882
1883     /* By definition there is no user buffer so this flag doesn't make sense */
1884     if (flags & BDRV_REQ_REGISTERED_BUF) {
1885         return -EINVAL;
1886     }
1887
1888     /* Invalidate the cached block-status data range if this write overlaps */
1889     bdrv_bsc_invalidate_range(bs, offset, bytes);
1890
1891     assert(alignment % bs->bl.request_alignment == 0);
         /* Distance of the start and end of the range from an aligned boundary */
1892     head = offset % alignment;
1893     tail = (offset + bytes) % alignment;
1894     max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1895     assert(max_write_zeroes >= bs->bl.request_alignment);
1896
         /* Any non-zero ret (i.e. an error) from the previous chunk ends the loop */
1897     while (bytes > 0 && !ret) {
1898         int64_t num = bytes;
1899
1900         /* Align request.  Block drivers can expect the "bulk" of the request
1901          * to be aligned, and that unaligned requests do not cross cluster
1902          * boundaries.
1903          */
1904         if (head) {
1905             /* Make a small request up to the first aligned sector. For
1906              * convenience, limit this request to max_transfer even if
1907              * we don't need to fall back to writes.  */
1908             num = MIN(MIN(bytes, max_transfer), alignment - head);
1909             head = (head + num) % alignment;
1910             assert(num < max_write_zeroes);
1911         } else if (tail && num > alignment) {
1912             /* Shorten the request to the last aligned sector.  */
1913             num -= tail;
1914         }
1915
1916         /* limit request size */
1917         if (num > max_write_zeroes) {
1918             num = max_write_zeroes;
1919         }
1920
             /*
              * Start from "not supported" so that a driver without the
              * callback takes the same fallback path as one returning -ENOTSUP.
              */
1921         ret = -ENOTSUP;
1922         /* First try the efficient write zeroes operation */
1923         if (drv->bdrv_co_pwrite_zeroes) {
1924             ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1925                                              flags & bs->supported_zero_flags);
                 /*
                  * FUA was requested but is not among the supported zero
                  * flags, so it was masked out above: make up for it with
                  * one flush at the end.
                  */
1926             if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1927                 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1928                 need_flush = true;
1929             }
1930         } else {
1931             assert(!bs->supported_zero_flags);
1932         }
1933
1934         if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
1935             /* Fall back to bounce buffer if write zeroes is unsupported */
1936             BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1937
1938             if ((flags & BDRV_REQ_FUA) &&
1939                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1940                 /* No need for bdrv_driver_pwrite() to do a fallback
1941                  * flush on each chunk; use just one at the end */
1942                 write_flags &= ~BDRV_REQ_FUA;
1943                 need_flush = true;
1944             }
1945             num = MIN(num, max_transfer);
1946             if (buf == NULL) {
                     /*
                      * NOTE(review): the buffer is written out as-is, so
                      * qemu_try_blockalign0() is presumably expected to
                      * return zero-filled memory - confirm against its
                      * definition.
                      */
1947                 buf = qemu_try_blockalign0(bs, num);
1948                 if (buf == NULL) {
1949                     ret = -ENOMEM;
1950                     goto fail;
1951                 }
1952             }
1953             qemu_iovec_init_buf(&qiov, buf, num);
1954
1955             ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);
1956
1957             /* Keep bounce buffer around if it is big enough for
1958              * all future requests.
1959              */
1960             if (num < max_transfer) {
1961                 qemu_vfree(buf);
1962                 buf = NULL;
1963             }
1964         }
1965
             /* Step past the chunk that was just handled */
1966         offset += num;
1967         bytes -= num;
1968     }
1969
     /* Reached both on normal loop exit and on allocation failure */
1970 fail:
1971     if (ret == 0 && need_flush) {
1972         ret = bdrv_co_flush(bs);
1973     }
1974     qemu_vfree(buf);
1975     return ret;
1976 }
1977
1978 static inline int coroutine_fn GRAPH_RDLOCK
1979 bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
1980                           BdrvTrackedRequest *req, int flags)
1981 {
1982     BlockDriverState *bs = child->bs;
1983
1984     bdrv_check_request(offset, bytes, &error_abort);
1985
1986     if (bdrv_is_read_only(bs)) {
1987         return -EPERM;
1988     }
1989
1990     assert(!(bs->open_flags & BDRV_O_INACTIVE));
1991     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1992     assert(!(flags & ~BDRV_REQ_MASK));
1993     assert(!((flags & BDRV_REQ_NO_WAIT) && !(flags & BDRV_REQ_SERIALISING)));
1994
1995     if (flags & BDRV_REQ_SERIALISING) {
1996         QEMU_LOCK_GUARD(&bs->reqs_lock);
1997
1998         tracked_request_set_serialising(req, bdrv_get_cluster_size(bs));
1999
2000         if ((flags & BDRV_REQ_NO_WAIT) && bdrv_find_conflicting_request(req)) {
2001             return -EBUSY;
2002         }
2003
2004         bdrv_wait_serialising_requests_locked(req);
2005     } else {
2006         bdrv_wait_serialising_requests(req);
2007     }
2008
2009     assert(req->overlap_offset <= offset);
2010     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
2011     assert(offset + bytes <= bs->total_sectors * BDRV_SECTOR_SIZE ||
2012            child->perm & BLK_PERM_RESIZE);
2013
2014     switch (req->type) {
2015     case BDRV_TRACKED_WRITE:
2016     case BDRV_TRACKED_DISCARD:
2017         if (flags & BDRV_REQ_WRITE_UNCHANGED) {
2018             assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
2019         } else {
2020             assert(child->perm & BLK_PERM_WRITE);
2021         }
2022         bdrv_write_threshold_check_write(bs, offset, bytes);
2023         return 0;
2024     case BDRV_TRACKED_TRUNCATE:
2025         assert(child->perm & BLK_PERM_RESIZE);
2026         return 0;
2027     default:
2028         abort();
2029     }
2030 }
2031
2032 static inline void coroutine_fn GRAPH_RDLOCK
2033 bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, int64_t bytes,
2034                          BdrvTrackedRequest *req, int ret)
2035 {
2036     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
2037     BlockDriverState *bs = child->bs;
2038
2039     bdrv_check_request(offset, bytes, &error_abort);
2040
2041     qatomic_inc(&bs->write_gen);
2042
2043     /*
2044      * Discard cannot extend the image, but in error handling cases, such as
2045      * when reverting a qcow2 cluster allocation, the discarded range can pass
2046      * the end of image file, so we cannot assert about BDRV_TRACKED_DISCARD
2047      * here. Instead, just skip it, since semantically a discard request
2048      * beyond EOF cannot expand the image anyway.
2049      */
2050     if (ret == 0 &&
2051         (req->type == BDRV_TRACKED_TRUNCATE ||
2052          end_sector > bs->total_sectors) &&
2053         req->type != BDRV_TRACKED_DISCARD) {
2054         bs->total_sectors = end_sector;
2055         bdrv_parent_cb_resize(bs);
2056         bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
2057     }
2058     if (req->bytes) {
2059         switch (req->type) {
2060         case BDRV_TRACKED_WRITE:
2061             stat64_max(&bs->wr_highest_offset, offset + bytes);
2062             /* fall through, to set dirty bits */
2063         case BDRV_TRACKED_DISCARD:
2064             bdrv_set_dirty(bs, offset, bytes);
2065             break;
2066         default:
2067             break;
2068         }
2069     }
2070 }
2071
2072 /*
2073  * Forwards an already correctly aligned write request to the BlockDriver,
2074  * after possibly fragmenting it.
2075  */
2076 static int coroutine_fn GRAPH_RDLOCK
2077 bdrv_aligned_pwritev(BdrvChild *child, BdrvTrackedRequest *req,
2078                      int64_t offset, int64_t bytes, int64_t align,
2079                      QEMUIOVector *qiov, size_t qiov_offset,
2080                      BdrvRequestFlags flags)
2081 {
2082     BlockDriverState *bs = child->bs;
2083     BlockDriver *drv = bs->drv;
2084     int ret;
2085
2086     int64_t bytes_remaining = bytes;
2087     int max_transfer;
2088
2089     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
2090
2091     if (!drv) {
2092         return -ENOMEDIUM;
2093     }
2094
2095     if (bdrv_has_readonly_bitmaps(bs)) {
2096         return -EPERM;
2097     }
2098
2099     assert(is_power_of_2(align));
2100     assert((offset & (align - 1)) == 0);
2101     assert((bytes & (align - 1)) == 0);
2102     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
2103                                    align);
2104
2105     ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);
2106
2107     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
2108         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
2109         qemu_iovec_is_zero(qiov, qiov_offset, bytes)) {
2110         flags |= BDRV_REQ_ZERO_WRITE;
2111         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
2112             flags |= BDRV_REQ_MAY_UNMAP;
2113         }
2114
2115         /* Can't use optimization hint with bufferless zero write */
2116         flags &= ~BDRV_REQ_REGISTERED_BUF;
2117     }
2118
2119     if (ret < 0) {
2120         /* Do nothing, write notifier decided to fail this request */
2121     } else if (flags & BDRV_REQ_ZERO_WRITE) {
2122         bdrv_co_debug_event(bs, BLKDBG_PWRITEV_ZERO);
2123         ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
2124     } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
2125         ret = bdrv_driver_pwritev_compressed(bs, offset, bytes,
2126                                              qiov, qiov_offset);
2127     } else if (bytes <= max_transfer) {
2128         bdrv_co_debug_event(bs, BLKDBG_PWRITEV);
2129         ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags);
2130     } else {
2131         bdrv_co_debug_event(bs, BLKDBG_PWRITEV);
2132         while (bytes_remaining) {
2133             int num = MIN(bytes_remaining, max_transfer);
2134             int local_flags = flags;
2135
2136             assert(num);
2137             if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
2138                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
2139                 /* If FUA is going to be emulated by flush, we only
2140                  * need to flush on the last iteration */
2141                 local_flags &= ~BDRV_REQ_FUA;
2142             }
2143
2144             ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
2145                                       num, qiov,
2146                                       qiov_offset + bytes - bytes_remaining,
2147                                       local_flags);
2148             if (ret < 0) {
2149                 break;
2150             }
2151             bytes_remaining -= num;
2152         }
2153     }
2154     bdrv_co_debug_event(bs, BLKDBG_PWRITEV_DONE);
2155
2156     if (ret >= 0) {
2157         ret = 0;
2158     }
2159     bdrv_co_write_req_finish(child, offset, bytes, req, ret);
2160
2161     return ret;
2162 }
2163
2164 static int coroutine_fn GRAPH_RDLOCK
2165 bdrv_co_do_zero_pwritev(BdrvChild *child, int64_t offset, int64_t bytes,
2166                         BdrvRequestFlags flags, BdrvTrackedRequest *req)
2167 {
2168     BlockDriverState *bs = child->bs;
2169     QEMUIOVector local_qiov;
2170     uint64_t align = bs->bl.request_alignment;
2171     int ret = 0;
2172     bool padding;
2173     BdrvRequestPadding pad;
2174
2175     /* This flag doesn't make sense for padding or zero writes */
2176     flags &= ~BDRV_REQ_REGISTERED_BUF;
2177
2178     padding = bdrv_init_padding(bs, offset, bytes, true, &pad);
2179     if (padding) {
2180         assert(!(flags & BDRV_REQ_NO_WAIT));
2181         bdrv_make_request_serialising(req, align);
2182
2183         bdrv_padding_rmw_read(child, req, &pad, true);
2184
2185         if (pad.head || pad.merge_reads) {
2186             int64_t aligned_offset = offset & ~(align - 1);
2187             int64_t write_bytes = pad.merge_reads ? pad.buf_len : align;
2188
2189             qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes);
2190             ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes,
2191                                        align, &local_qiov, 0,
2192                                        flags & ~BDRV_REQ_ZERO_WRITE);
2193             if (ret < 0 || pad.merge_reads) {
2194                 /* Error or all work is done */
2195                 goto out;
2196             }
2197             offset += write_bytes - pad.head;
2198             bytes -= write_bytes - pad.head;
2199         }
2200     }
2201
2202     assert(!bytes || (offset & (align - 1)) == 0);
2203     if (bytes >= align) {
2204         /* Write the aligned part in the middle. */
2205         int64_t aligned_bytes = bytes & ~(align - 1);
2206         ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
2207                                    NULL, 0, flags);
2208         if (ret < 0) {
2209             goto out;
2210         }
2211         bytes -= aligned_bytes;
2212         offset += aligned_bytes;
2213     }
2214
2215     assert(!bytes || (offset & (align - 1)) == 0);
2216     if (bytes) {
2217         assert(align == pad.tail + bytes);
2218
2219         qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align);
2220         ret = bdrv_aligned_pwritev(child, req, offset, align, align,
2221                                    &local_qiov, 0,
2222                                    flags & ~BDRV_REQ_ZERO_WRITE);
2223     }
2224
2225 out:
2226     bdrv_padding_finalize(&pad);
2227
2228     return ret;
2229 }
2230
2231 /*
2232  * Handle a write request in coroutine context
2233  */
2234 int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
2235     int64_t offset, int64_t bytes, QEMUIOVector *qiov,
2236     BdrvRequestFlags flags)
2237 {
2238     IO_CODE();
2239     return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags);
2240 }
2241
2242 int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
2243     int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset,
2244     BdrvRequestFlags flags)
2245 {
2246     BlockDriverState *bs = child->bs;
2247     BdrvTrackedRequest req;
2248     uint64_t align = bs->bl.request_alignment;
2249     BdrvRequestPadding pad;
2250     int ret;
2251     bool padded = false;
2252     IO_CODE();
2253
2254     trace_bdrv_co_pwritev_part(child->bs, offset, bytes, flags);
2255
2256     if (!bdrv_co_is_inserted(bs)) {
2257         return -ENOMEDIUM;
2258     }
2259
2260     if (flags & BDRV_REQ_ZERO_WRITE) {
2261         ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
2262     } else {
2263         ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
2264     }
2265     if (ret < 0) {
2266         return ret;
2267     }
2268
2269     /* If the request is misaligned then we can't make it efficient */
2270     if ((flags & BDRV_REQ_NO_FALLBACK) &&
2271         !QEMU_IS_ALIGNED(offset | bytes, align))
2272     {
2273         return -ENOTSUP;
2274     }
2275
2276     if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
2277         /*
2278          * Aligning zero request is nonsense. Even if driver has special meaning
2279          * of zero-length (like qcow2_co_pwritev_compressed_part), we can't pass
2280          * it to driver due to request_alignment.
2281          *
2282          * Still, no reason to return an error if someone do unaligned
2283          * zero-length write occasionally.
2284          */
2285         return 0;
2286     }
2287
2288     if (!(flags & BDRV_REQ_ZERO_WRITE)) {
2289         /*
2290          * Pad request for following read-modify-write cycle.
2291          * bdrv_co_do_zero_pwritev() does aligning by itself, so, we do
2292          * alignment only if there is no ZERO flag.
2293          */
2294         ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, true,
2295                                &pad, &padded, &flags);
2296         if (ret < 0) {
2297             return ret;
2298         }
2299     }
2300
2301     bdrv_inc_in_flight(bs);
2302     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
2303
2304     if (flags & BDRV_REQ_ZERO_WRITE) {
2305         assert(!padded);
2306         ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
2307         goto out;
2308     }
2309
2310     if (padded) {
2311         /*
2312          * Request was unaligned to request_alignment and therefore
2313          * padded.  We are going to do read-modify-write, and must
2314          * serialize the request to prevent interactions of the
2315          * widened region with other transactions.
2316          */
2317         assert(!(flags & BDRV_REQ_NO_WAIT));
2318         bdrv_make_request_serialising(&req, align);
2319         bdrv_padding_rmw_read(child, &req, &pad, false);
2320     }
2321
2322     ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
2323                                qiov, qiov_offset, flags);
2324
2325     bdrv_padding_finalize(&pad);
2326
2327 out:
2328     tracked_request_end(&req);
2329     bdrv_dec_in_flight(bs);
2330
2331     return ret;
2332 }
2333
2334 int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
2335                                        int64_t bytes, BdrvRequestFlags flags)
2336 {
2337     IO_CODE();
2338     trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
2339     assert_bdrv_graph_readable();
2340
2341     if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
2342         flags &= ~BDRV_REQ_MAY_UNMAP;
2343     }
2344
2345     return bdrv_co_pwritev(child, offset, bytes, NULL,
2346                            BDRV_REQ_ZERO_WRITE | flags);
2347 }
2348
2349 /*
2350  * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not.
2351  */
2352 int bdrv_flush_all(void)
2353 {
2354     BdrvNextIterator it;
2355     BlockDriverState *bs = NULL;
2356     int result = 0;
2357
2358     GLOBAL_STATE_CODE();
2359     GRAPH_RDLOCK_GUARD_MAINLOOP();
2360
2361     /*
2362      * bdrv queue is managed by record/replay,
2363      * creating new flush request for stopping
2364      * the VM may break the determinism
2365      */
2366     if (replay_events_enabled()) {
2367         return result;
2368     }
2369
2370     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
2371         AioContext *aio_context = bdrv_get_aio_context(bs);
2372         int ret;
2373
2374         aio_context_acquire(aio_context);
2375         ret = bdrv_flush(bs);
2376         if (ret < 0 && !result) {
2377             result = ret;
2378         }
2379         aio_context_release(aio_context);
2380     }
2381
2382     return result;
2383 }
2384
2385 /*
2386  * Returns the allocation status of the specified sectors.
2387  * Drivers not implementing the functionality are assumed to not support
2388  * backing files, hence all their sectors are reported as allocated.
2389  *
2390  * If 'want_zero' is true, the caller is querying for mapping
2391  * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
2392  * _ZERO where possible; otherwise, the result favors larger 'pnum',
2393  * with a focus on accurate BDRV_BLOCK_ALLOCATED.
2394  *
2395  * If 'offset' is beyond the end of the disk image the return value is
2396  * BDRV_BLOCK_EOF and 'pnum' is set to 0.
2397  *
2398  * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
2399  * beyond the end of the disk image it will be clamped; if 'pnum' is set to
2400  * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
2401  *
2402  * 'pnum' is set to the number of bytes (including and immediately
2403  * following the specified offset) that are easily known to be in the
2404  * same allocated/unallocated state.  Note that a second call starting
2405  * at the original offset plus returned pnum may have the same status.
2406  * The returned value is non-zero on success except at end-of-file.
2407  *
2408  * Returns negative errno on failure.  Otherwise, if the
2409  * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
2410  * set to the host mapping and BDS corresponding to the guest offset.
2411  */
2412 static int coroutine_fn GRAPH_RDLOCK
2413 bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
2414                         int64_t offset, int64_t bytes,
2415                         int64_t *pnum, int64_t *map, BlockDriverState **file)
2416 {
2417     int64_t total_size;
2418     int64_t n; /* bytes */
2419     int ret;
2420     int64_t local_map = 0;
2421     BlockDriverState *local_file = NULL;
2422     int64_t aligned_offset, aligned_bytes;
2423     uint32_t align;
2424     bool has_filtered_child;
2425
2426     assert(pnum);
2427     assert_bdrv_graph_readable();
         /* Stays 0 on the early exits below unless one of them sets it */
2428     *pnum = 0;
2429     total_size = bdrv_co_getlength(bs);
2430     if (total_size < 0) {
2431         ret = total_size;
2432         goto early_out;
2433     }
2434
2435     if (offset >= total_size) {
2436         ret = BDRV_BLOCK_EOF;
2437         goto early_out;
2438     }
2439     if (!bytes) {
2440         ret = 0;
2441         goto early_out;
2442     }
2443
         /* Clamp the request so that it does not reach past the image end */
2444     n = total_size - offset;
2445     if (n < bytes) {
2446         bytes = n;
2447     }
2448
2449     /* Must be non-NULL or bdrv_co_getlength() would have failed */
2450     assert(bs->drv);
2451     has_filtered_child = bdrv_filter_child(bs);
         /*
          * No driver callback and not a filter: the whole clamped range is
          * reported as allocated data.  A host mapping (identity, on this
          * node) is only provided when the driver has a protocol_name.
          */
2452     if (!bs->drv->bdrv_co_block_status && !has_filtered_child) {
2453         *pnum = bytes;
2454         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
2455         if (offset + bytes == total_size) {
2456             ret |= BDRV_BLOCK_EOF;
2457         }
2458         if (bs->drv->protocol_name) {
2459             ret |= BDRV_BLOCK_OFFSET_VALID;
2460             local_map = offset;
2461             local_file = bs;
2462         }
2463         goto early_out;
2464     }
2465
         /*
          * From here on every exit goes through 'out', which drops the
          * in-flight reference taken on the next line.
          */
2466     bdrv_inc_in_flight(bs);
2467
2468     /* Round out to request_alignment boundaries */
2469     align = bs->bl.request_alignment;
2470     aligned_offset = QEMU_ALIGN_DOWN(offset, align);
2471     aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
2472
2473     if (bs->drv->bdrv_co_block_status) {
2474         /*
2475          * Use the block-status cache only for protocol nodes: Format
2476          * drivers are generally quick to inquire the status, but protocol
2477          * drivers often need to get information from outside of qemu, so
2478          * we do not have control over the actual implementation.  There
2479          * have been cases where inquiring the status took an unreasonably
2480          * long time, and we can do nothing in qemu to fix it.
2481          * This is especially problematic for images with large data areas,
2482          * because finding the few holes in them and giving them special
2483          * treatment does not gain much performance.  Therefore, we try to
2484          * cache the last-identified data region.
2485          *
2486          * Second, limiting ourselves to protocol nodes allows us to assume
2487          * the block status for data regions to be DATA | OFFSET_VALID, and
2488          * that the host offset is the same as the guest offset.
2489          *
2490          * Note that it is possible that external writers zero parts of
2491          * the cached regions without the cache being invalidated, and so
2492          * we may report zeroes as data.  This is not catastrophic,
2493          * however, because reporting zeroes as data is fine.
2494          */
2495         if (QLIST_EMPTY(&bs->children) &&
2496             bdrv_bsc_is_data(bs, aligned_offset, pnum))
2497         {
2498             ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
2499             local_file = bs;
2500             local_map = aligned_offset;
2501         } else {
2502             ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
2503                                                 aligned_bytes, pnum, &local_map,
2504                                                 &local_file);
2505
2506             /*
2507              * Note that checking QLIST_EMPTY(&bs->children) is also done when
2508              * the cache is queried above.  Technically, we do not need to check
2509              * it here; the worst that can happen is that we fill the cache for
2510              * non-protocol nodes, and then it is never used.  However, filling
2511              * the cache requires an RCU update, so double check here to avoid
2512              * such an update if possible.
2513              *
2514              * Check want_zero, because we only want to update the cache when we
2515              * have accurate information about what is zero and what is data.
2516              */
2517             if (want_zero &&
2518                 ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) &&
2519                 QLIST_EMPTY(&bs->children))
2520             {
2521                 /*
2522                  * When a protocol driver reports BLOCK_OFFSET_VALID, the
2523                  * returned local_map value must be the same as the offset we
2524                  * have passed (aligned_offset), and local_bs must be the node
2525                  * itself.
2526                  * Assert this, because we follow this rule when reading from
2527                  * the cache (see the `local_file = bs` and
2528                  * `local_map = aligned_offset` assignments above), and the
2529                  * result the cache delivers must be the same as the driver
2530                  * would deliver.
2531                  */
2532                 assert(local_file == bs);
2533                 assert(local_map == aligned_offset);
2534                 bdrv_bsc_fill(bs, aligned_offset, *pnum);
2535             }
2536         }
2537     } else {
2538         /* Default code for filters */
2539
2540         local_file = bdrv_filter_bs(bs);
2541         assert(local_file);
2542
2543         *pnum = aligned_bytes;
2544         local_map = aligned_offset;
2545         ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2546     }
2547     if (ret < 0) {
2548         *pnum = 0;
2549         goto out;
2550     }
2551
2552     /*
2553      * The driver's result must be a non-zero multiple of request_alignment.
2554      * Clamp pnum and adjust map to original request.
2555      */
2556     assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
2557            align > offset - aligned_offset);
2558     if (ret & BDRV_BLOCK_RECURSE) {
2559         assert(ret & BDRV_BLOCK_DATA);
2560         assert(ret & BDRV_BLOCK_OFFSET_VALID);
2561         assert(!(ret & BDRV_BLOCK_ZERO));
2562     }
2563
         /* Drop the bytes in front of @offset that rounding down had added */
2564     *pnum -= offset - aligned_offset;
2565     if (*pnum > bytes) {
2566         *pnum = bytes;
2567     }
2568     if (ret & BDRV_BLOCK_OFFSET_VALID) {
2569         local_map += offset - aligned_offset;
2570     }
2571
         /*
          * BDRV_BLOCK_RAW: the status is taken from local_file at local_map.
          * Query that node for the clamped range and use its answer as ours.
          */
2572     if (ret & BDRV_BLOCK_RAW) {
2573         assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
2574         ret = bdrv_co_do_block_status(local_file, want_zero, local_map,
2575                                       *pnum, pnum, &local_map, &local_file);
2576         goto out;
2577     }
2578
2579     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
2580         ret |= BDRV_BLOCK_ALLOCATED;
2581     } else if (bs->drv->supports_backing) {
             /*
              * Neither data nor zero at this layer, and the driver supports
              * backing files: the range is marked zero if there is no COW
              * child, or (only when want_zero) if @offset lies at or beyond
              * the COW child's length.
              */
2582         BlockDriverState *cow_bs = bdrv_cow_bs(bs);
2583
2584         if (!cow_bs) {
2585             ret |= BDRV_BLOCK_ZERO;
2586         } else if (want_zero) {
2587             int64_t size2 = bdrv_co_getlength(cow_bs);
2588
2589             if (size2 >= 0 && offset >= size2) {
2590                 ret |= BDRV_BLOCK_ZERO;
2591             }
2592         }
2593     }
2594
         /*
          * Optional refinement: when the driver asked for recursion and
          * pointed at a different node, ask that node whether the data range
          * actually reads as zero there.
          */
2595     if (want_zero && ret & BDRV_BLOCK_RECURSE &&
2596         local_file && local_file != bs &&
2597         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
2598         (ret & BDRV_BLOCK_OFFSET_VALID)) {
2599         int64_t file_pnum;
2600         int ret2;
2601
2602         ret2 = bdrv_co_do_block_status(local_file, want_zero, local_map,
2603                                        *pnum, &file_pnum, NULL, NULL);
2604         if (ret2 >= 0) {
2605             /* Ignore errors.  This is just providing extra information, it
2606              * is useful but not necessary.
2607              */
2608             if (ret2 & BDRV_BLOCK_EOF &&
2609                 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
2610                 /*
2611                  * It is valid for the format block driver to read
2612                  * beyond the end of the underlying file's current
2613                  * size; such areas read as zero.
2614                  */
2615                 ret |= BDRV_BLOCK_ZERO;
2616             } else {
2617                 /* Limit request to the range reported by the protocol driver */
2618                 *pnum = file_pnum;
2619                 ret |= (ret2 & BDRV_BLOCK_ZERO);
2620             }
2621         }
2622     }
2623
     /* 'out': exits taken after bdrv_inc_in_flight(); EOF is added here */
2624 out:
2625     bdrv_dec_in_flight(bs);
2626     if (ret >= 0 && offset + *pnum == total_size) {
2627         ret |= BDRV_BLOCK_EOF;
2628     }
     /* 'early_out': exits taken before the in-flight counter was raised */
2629 early_out:
2630     if (file) {
2631         *file = local_file;
2632     }
2633     if (map) {
2634         *map = local_map;
2635     }
2636     return ret;
2637 }
2638
/*
 * Common implementation behind the block-status / is-allocated helpers.
 *
 * Queries the status of [@offset, @offset + @bytes) on @bs with
 * bdrv_co_do_block_status() and, as long as the range is reported as not
 * allocated, keeps descending through bdrv_filter_or_cow_bs() until a layer
 * reports BDRV_BLOCK_ALLOCATED, @base is reached, or an error occurs.
 *
 * @include_base: whether @base itself is queried; requires a non-NULL @base.
 * @want_zero:    passed through unchanged to bdrv_co_do_block_status().
 * @pnum:         must be non-NULL; receives the length of the described range.
 *                Each deeper layer is only asked about the (possibly shorter)
 *                range reported by the layer above it.
 * @map, @file:   passed through to bdrv_co_do_block_status(); @file may be
 *                NULL.
 * @depth:        optional; receives the number of layers that were queried
 *                (0 if none, 1 if only @bs, and so on).
 *
 * Returns a negative value on error, otherwise a combination of
 * BDRV_BLOCK_* flags.
 */
int coroutine_fn
bdrv_co_common_block_status_above(BlockDriverState *bs,
                                  BlockDriverState *base,
                                  bool include_base,
                                  bool want_zero,
                                  int64_t offset,
                                  int64_t bytes,
                                  int64_t *pnum,
                                  int64_t *map,
                                  BlockDriverState **file,
                                  int *depth)
{
    int ret;
    BlockDriverState *p;
    int64_t eof = 0;
    int dummy;
    IO_CODE();

    assert(!include_base || base); /* Can't include NULL base */
    assert_bdrv_graph_readable();

    /* Let the rest of the function update *depth unconditionally */
    if (!depth) {
        depth = &dummy;
    }
    *depth = 0;

    /* Empty chain: nothing to query, the whole range is "not allocated" */
    if (!include_base && bs == base) {
        *pnum = bytes;
        return 0;
    }

    ret = bdrv_co_do_block_status(bs, want_zero, offset, bytes, pnum,
                                  map, file);
    ++*depth;
    if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) {
        return ret;
    }

    /*
     * Remember where the top layer ends, so that BDRV_BLOCK_EOF can be
     * restored at the end if the final range still reaches that point.
     */
    if (ret & BDRV_BLOCK_EOF) {
        eof = offset + *pnum;
    }

    /* Deeper layers are only asked about what this layer left undecided */
    assert(*pnum <= bytes);
    bytes = *pnum;

    for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base;
         p = bdrv_filter_or_cow_bs(p))
    {
        ret = bdrv_co_do_block_status(p, want_zero, offset, bytes, pnum,
                                      map, file);
        ++*depth;
        if (ret < 0) {
            return ret;
        }
        if (*pnum == 0) {
            /*
             * The top layer deferred to this layer, and because this layer is
             * short, any zeroes that we synthesize beyond EOF behave as if they
             * were allocated at this layer.
             *
             * We don't include BDRV_BLOCK_EOF into ret, as upper layer may be
             * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see
             * below.
             */
            assert(ret & BDRV_BLOCK_EOF);
            *pnum = bytes;
            if (file) {
                *file = p;
            }
            ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_ALLOCATED;
            break;
        }
        if (ret & BDRV_BLOCK_ALLOCATED) {
            /*
             * We've found the node and the status, we must break.
             *
             * Drop BDRV_BLOCK_EOF, as it's not for upper layer, which may be
             * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see
             * below.
             */
            ret &= ~BDRV_BLOCK_EOF;
            break;
        }

        /* @base was the last layer to look at; report its status as is */
        if (p == base) {
            assert(include_base);
            break;
        }

        /*
         * OK, [offset, offset + *pnum) region is unallocated on this layer,
         * let's continue the diving.
         */
        assert(*pnum <= bytes);
        bytes = *pnum;
    }

    /* The range ends exactly where the top layer ends: report EOF again */
    if (offset + *pnum == eof) {
        ret |= BDRV_BLOCK_EOF;
    }

    return ret;
}
2742
2743 int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
2744                                             BlockDriverState *base,
2745                                             int64_t offset, int64_t bytes,
2746                                             int64_t *pnum, int64_t *map,
2747                                             BlockDriverState **file)
2748 {
2749     IO_CODE();
2750     return bdrv_co_common_block_status_above(bs, base, false, true, offset,
2751                                              bytes, pnum, map, file, NULL);
2752 }
2753
2754 int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, int64_t offset,
2755                                       int64_t bytes, int64_t *pnum,
2756                                       int64_t *map, BlockDriverState **file)
2757 {
2758     IO_CODE();
2759     return bdrv_co_block_status_above(bs, bdrv_filter_or_cow_bs(bs),
2760                                       offset, bytes, pnum, map, file);
2761 }
2762
2763 /*
2764  * Check @bs (and its backing chain) to see if the range defined
2765  * by @offset and @bytes is known to read as zeroes.
2766  * Return 1 if that is the case, 0 otherwise and -errno on error.
2767  * This test is meant to be fast rather than accurate so returning 0
2768  * does not guarantee non-zero data.
2769  */
2770 int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
2771                                       int64_t bytes)
2772 {
2773     int ret;
2774     int64_t pnum = bytes;
2775     IO_CODE();
2776
2777     if (!bytes) {
2778         return 1;
2779     }
2780
2781     ret = bdrv_co_common_block_status_above(bs, NULL, false, false, offset,
2782                                             bytes, &pnum, NULL, NULL, NULL);
2783
2784     if (ret < 0) {
2785         return ret;
2786     }
2787
2788     return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO);
2789 }
2790
2791 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset,
2792                                       int64_t bytes, int64_t *pnum)
2793 {
2794     int ret;
2795     int64_t dummy;
2796     IO_CODE();
2797
2798     ret = bdrv_co_common_block_status_above(bs, bs, true, false, offset,
2799                                             bytes, pnum ? pnum : &dummy, NULL,
2800                                             NULL, NULL);
2801     if (ret < 0) {
2802         return ret;
2803     }
2804     return !!(ret & BDRV_BLOCK_ALLOCATED);
2805 }
2806
2807 /*
2808  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2809  *
2810  * Return a positive depth if (a prefix of) the given range is allocated
2811  * in any image between BASE and TOP (BASE is only included if include_base
2812  * is set).  Depth 1 is TOP, 2 is the first backing layer, and so forth.
2813  * BASE can be NULL to check if the given offset is allocated in any
2814  * image of the chain.  Return 0 otherwise, or negative errno on
2815  * failure.
2816  *
2817  * 'pnum' is set to the number of bytes (including and immediately
2818  * following the specified offset) that are known to be in the same
2819  * allocated/unallocated state.  Note that a subsequent call starting
2820  * at 'offset + *pnum' may return the same allocation status (in other
2821  * words, the result is not necessarily the maximum possible range);
2822  * but 'pnum' will only be 0 when end of file is reached.
2823  */
2824 int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *bs,
2825                                             BlockDriverState *base,
2826                                             bool include_base, int64_t offset,
2827                                             int64_t bytes, int64_t *pnum)
2828 {
2829     int depth;
2830     int ret;
2831     IO_CODE();
2832
2833     ret = bdrv_co_common_block_status_above(bs, base, include_base, false,
2834                                             offset, bytes, pnum, NULL, NULL,
2835                                             &depth);
2836     if (ret < 0) {
2837         return ret;
2838     }
2839
2840     if (ret & BDRV_BLOCK_ALLOCATED) {
2841         return depth;
2842     }
2843     return 0;
2844 }
2845
2846 int coroutine_fn
2847 bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2848 {
2849     BlockDriver *drv = bs->drv;
2850     BlockDriverState *child_bs = bdrv_primary_bs(bs);
2851     int ret;
2852     IO_CODE();
2853     assert_bdrv_graph_readable();
2854
2855     ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
2856     if (ret < 0) {
2857         return ret;
2858     }
2859
2860     if (!drv) {
2861         return -ENOMEDIUM;
2862     }
2863
2864     bdrv_inc_in_flight(bs);
2865
2866     if (drv->bdrv_co_load_vmstate) {
2867         ret = drv->bdrv_co_load_vmstate(bs, qiov, pos);
2868     } else if (child_bs) {
2869         ret = bdrv_co_readv_vmstate(child_bs, qiov, pos);
2870     } else {
2871         ret = -ENOTSUP;
2872     }
2873
2874     bdrv_dec_in_flight(bs);
2875
2876     return ret;
2877 }
2878
2879 int coroutine_fn
2880 bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2881 {
2882     BlockDriver *drv = bs->drv;
2883     BlockDriverState *child_bs = bdrv_primary_bs(bs);
2884     int ret;
2885     IO_CODE();
2886     assert_bdrv_graph_readable();
2887
2888     ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
2889     if (ret < 0) {
2890         return ret;
2891     }
2892
2893     if (!drv) {
2894         return -ENOMEDIUM;
2895     }
2896
2897     bdrv_inc_in_flight(bs);
2898
2899     if (drv->bdrv_co_save_vmstate) {
2900         ret = drv->bdrv_co_save_vmstate(bs, qiov, pos);
2901     } else if (child_bs) {
2902         ret = bdrv_co_writev_vmstate(child_bs, qiov, pos);
2903     } else {
2904         ret = -ENOTSUP;
2905     }
2906
2907     bdrv_dec_in_flight(bs);
2908
2909     return ret;
2910 }
2911
2912 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2913                       int64_t pos, int size)
2914 {
2915     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2916     int ret = bdrv_writev_vmstate(bs, &qiov, pos);
2917     IO_CODE();
2918
2919     return ret < 0 ? ret : size;
2920 }
2921
2922 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2923                       int64_t pos, int size)
2924 {
2925     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2926     int ret = bdrv_readv_vmstate(bs, &qiov, pos);
2927     IO_CODE();
2928
2929     return ret < 0 ? ret : size;
2930 }
2931
2932 /**************************************************************/
2933 /* async I/Os */
2934
/**
 * Synchronously cancels an acb. Must be called with the BQL held and the acb
 * must be processed with the BQL held too (IOThreads are not allowed).
 *
 * Use bdrv_aio_cancel_async() instead when possible.
 *
 * The function requests cancellation through bdrv_aio_cancel_async() and then
 * waits until the reference taken here is the only one left on @acb.
 */
void bdrv_aio_cancel(BlockAIOCB *acb)
{
    GLOBAL_STATE_CODE();
    /* Hold our own reference for the duration of the wait below */
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    /* Wait until every reference other than ours has been dropped */
    AIO_WAIT_WHILE_UNLOCKED(NULL, acb->refcnt > 1);
    qemu_aio_unref(acb);
}
2949
2950 /* Async version of aio cancel. The caller is not blocked if the acb implements
2951  * cancel_async, otherwise we do nothing and let the request normally complete.
2952  * In either case the completion callback must be called. */
2953 void bdrv_aio_cancel_async(BlockAIOCB *acb)
2954 {
2955     IO_CODE();
2956     if (acb->aiocb_info->cancel_async) {
2957         acb->aiocb_info->cancel_async(acb);
2958     }
2959 }
2960
2961 /**************************************************************/
2962 /* Coroutine block device emulation */
2963
/*
 * Flush @bs and, afterwards, every child of @bs that is attached with
 * BLK_PERM_WRITE or BLK_PERM_WRITE_UNCHANGED.
 *
 * Nothing is done (and 0 is returned) if @bs is not inserted, is read-only
 * or is an sg node.  Only one flush is active per node at a time: later
 * callers wait on bs->flush_queue until bs->active_flush_req is cleared.
 *
 * If the driver implements bdrv_co_flush, that single callback is
 * responsible for everything and the children are not flushed from here.
 * Otherwise bdrv_co_flush_to_os is called first, followed by
 * bdrv_co_flush_to_disk or bdrv_aio_flush; the latter step is skipped with
 * BDRV_O_NO_FLUSH or when bs->flushed_gen already equals the write
 * generation sampled at entry.
 *
 * Returns 0 on success or a negative errno value.  On success
 * bs->flushed_gen is updated to the sampled write generation.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    BdrvChild *primary_child = bdrv_primary_child(bs);
    BdrvChild *child;
    int current_gen;
    int ret = 0;
    IO_CODE();

    assert_bdrv_graph_readable();
    bdrv_inc_in_flight(bs);

    if (!bdrv_co_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        goto early_exit;
    }

    qemu_mutex_lock(&bs->reqs_lock);
    /* Sample the write generation that this flush is going to cover */
    current_gen = qatomic_read(&bs->write_gen);

    /* Wait until any previous flushes are completed */
    while (bs->active_flush_req) {
        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
    }

    /* Flushes reach this point in nondecreasing current_gen order.  */
    bs->active_flush_req = true;
    qemu_mutex_unlock(&bs->reqs_lock);

    /* Write back all layers by calling one driver function */
    if (bs->drv->bdrv_co_flush) {
        ret = bs->drv->bdrv_co_flush(bs);
        goto out;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_CO_EVENT(primary_child, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_children;
    }

    /* Check if we really need to flush anything */
    if (bs->flushed_gen == current_gen) {
        goto flush_children;
    }

    BLKDBG_CO_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK);
    if (!bs->drv) {
        /* bs->drv->bdrv_co_flush() might have ejected the BDS
         * (even in case of apparent success) */
        ret = -ENOMEDIUM;
        goto out;
    }
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        /*
         * AIO-style driver: submit with bdrv_co_io_em_complete as the
         * completion callback and yield; the result is then read from
         * co.ret.
         */
        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }

    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_children:
    ret = 0;
    QLIST_FOREACH(child, &bs->children, next) {
        if (child->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) {
            int this_child_ret = bdrv_co_flush(child->bs);
            /* Keep flushing the other children; report the first error */
            if (!ret) {
                ret = this_child_ret;
            }
        }
    }

out:
    /* Notify any pending flushes that we have completed */
    if (ret == 0) {
        bs->flushed_gen = current_gen;
    }

    qemu_mutex_lock(&bs->reqs_lock);
    bs->active_flush_req = false;
    /* Return value is ignored - it's ok if wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);
    qemu_mutex_unlock(&bs->reqs_lock);

early_exit:
    /* Reached directly only when no flush was started (ret is still 0) */
    bdrv_dec_in_flight(bs);
    return ret;
}
3088
/*
 * Discard @bytes bytes starting at @offset on the node behind @child.
 *
 * The request is cut into fragments so that the unaligned head and tail are
 * passed to the driver separately from the aligned middle, and so that no
 * fragment exceeds the driver's maximum discard size.  A fragment for which
 * the driver returns -ENOTSUP is skipped without failing the request.
 *
 * Returns:
 *   0           on success, and also (without doing anything) if
 *               BDRV_O_UNMAP is not set or the driver implements neither
 *               bdrv_co_pdiscard nor bdrv_aio_pdiscard;
 *   -ENOMEDIUM  if there is no node, no driver or no medium;
 *   -EPERM      if the node has read-only bitmaps;
 *   other negative values from bdrv_check_request(),
 *   bdrv_co_write_req_prepare() or the driver.
 */
int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
                                  int64_t bytes)
{
    BdrvTrackedRequest req;
    int ret;
    int64_t max_pdiscard;
    int head, tail, align;
    BlockDriverState *bs = child->bs;
    IO_CODE();
    assert_bdrv_graph_readable();

    if (!bs || !bs->drv || !bdrv_co_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    ret = bdrv_check_request(offset, bytes, NULL);
    if (ret < 0) {
        return ret;
    }

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    /* No driver support: silently succeed */
    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
        return 0;
    }

    /* Invalidate the cached block-status data range if this discard overlaps */
    bdrv_bsc_invalidate_range(bs, offset, bytes);

    /* Discard is advisory, but some devices track and coalesce
     * unaligned requests, so we must pass everything down rather than
     * round here.  Still, most devices will just silently ignore
     * unaligned requests (by returning -ENOTSUP), so we must fragment
     * the request accordingly.  */
    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
    assert(align % bs->bl.request_alignment == 0);
    /* Misalignment of the start and of the end of the request */
    head = offset % align;
    tail = (offset + bytes) % align;

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);

    ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
    if (ret < 0) {
        goto out;
    }

    /* Largest fragment size: the driver limit (if any) rounded down to align */
    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT64_MAX),
                                   align);
    assert(max_pdiscard >= bs->bl.request_alignment);

    while (bytes > 0) {
        int64_t num = bytes;

        if (head) {
            /* Make small requests to get to alignment boundaries. */
            num = MIN(bytes, align - head);
            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
                num %= bs->bl.request_alignment;
            }
            head = (head + num) % align;
            assert(num < max_pdiscard);
        } else if (tail) {
            if (num > align) {
                /* Shorten the request to the last aligned cluster.  */
                num -= tail;
            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
                       tail > bs->bl.request_alignment) {
                /*
                 * Only the tail is left: split off its part that is not
                 * aligned to request_alignment into a fragment of its own.
                 */
                tail %= bs->bl.request_alignment;
                num -= tail;
            }
        }
        /* limit request size */
        if (num > max_pdiscard) {
            num = max_pdiscard;
        }

        /* Re-checked on every iteration before the driver is used */
        if (!bs->drv) {
            ret = -ENOMEDIUM;
            goto out;
        }
        if (bs->drv->bdrv_co_pdiscard) {
            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /*
             * AIO-style driver: submit with bdrv_co_io_em_complete as the
             * completion callback and yield; the result is then read from
             * co.ret.
             */
            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
                                             bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        /* -ENOTSUP for a single fragment is tolerated, see above */
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        offset += num;
        bytes -= num;
    }
    ret = 0;
out:
    /*
     * offset and bytes have been advanced by the loop, so the range stored
     * in the tracked request is used here instead.
     */
    bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}
3209
3210 int coroutine_fn bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
3211 {
3212     BlockDriver *drv = bs->drv;
3213     CoroutineIOCompletion co = {
3214         .coroutine = qemu_coroutine_self(),
3215     };
3216     BlockAIOCB *acb;
3217     IO_CODE();
3218     assert_bdrv_graph_readable();
3219
3220     bdrv_inc_in_flight(bs);
3221     if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
3222         co.ret = -ENOTSUP;
3223         goto out;
3224     }
3225
3226     if (drv->bdrv_co_ioctl) {
3227         co.ret = drv->bdrv_co_ioctl(bs, req, buf);
3228     } else {
3229         acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
3230         if (!acb) {
3231             co.ret = -ENOTSUP;
3232             goto out;
3233         }
3234         qemu_coroutine_yield();
3235     }
3236 out:
3237     bdrv_dec_in_flight(bs);
3238     return co.ret;
3239 }
3240
3241 int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
3242                         unsigned int *nr_zones,
3243                         BlockZoneDescriptor *zones)
3244 {
3245     BlockDriver *drv = bs->drv;
3246     CoroutineIOCompletion co = {
3247             .coroutine = qemu_coroutine_self(),
3248     };
3249     IO_CODE();
3250
3251     bdrv_inc_in_flight(bs);
3252     if (!drv || !drv->bdrv_co_zone_report || bs->bl.zoned == BLK_Z_NONE) {
3253         co.ret = -ENOTSUP;
3254         goto out;
3255     }
3256     co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
3257 out:
3258     bdrv_dec_in_flight(bs);
3259     return co.ret;
3260 }
3261
3262 int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
3263         int64_t offset, int64_t len)
3264 {
3265     BlockDriver *drv = bs->drv;
3266     CoroutineIOCompletion co = {
3267             .coroutine = qemu_coroutine_self(),
3268     };
3269     IO_CODE();
3270
3271     bdrv_inc_in_flight(bs);
3272     if (!drv || !drv->bdrv_co_zone_mgmt || bs->bl.zoned == BLK_Z_NONE) {
3273         co.ret = -ENOTSUP;
3274         goto out;
3275     }
3276     co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
3277 out:
3278     bdrv_dec_in_flight(bs);
3279     return co.ret;
3280 }
3281
3282 int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
3283                         QEMUIOVector *qiov,
3284                         BdrvRequestFlags flags)
3285 {
3286     int ret;
3287     BlockDriver *drv = bs->drv;
3288     CoroutineIOCompletion co = {
3289             .coroutine = qemu_coroutine_self(),
3290     };
3291     IO_CODE();
3292
3293     ret = bdrv_check_qiov_request(*offset, qiov->size, qiov, 0, NULL);
3294     if (ret < 0) {
3295         return ret;
3296     }
3297
3298     bdrv_inc_in_flight(bs);
3299     if (!drv || !drv->bdrv_co_zone_append || bs->bl.zoned == BLK_Z_NONE) {
3300         co.ret = -ENOTSUP;
3301         goto out;
3302     }
3303     co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
3304 out:
3305     bdrv_dec_in_flight(bs);
3306     return co.ret;
3307 }
3308
3309 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3310 {
3311     IO_CODE();
3312     return qemu_memalign(bdrv_opt_mem_align(bs), size);
3313 }
3314
3315 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
3316 {
3317     IO_CODE();
3318     return memset(qemu_blockalign(bs, size), 0, size);
3319 }
3320
3321 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
3322 {
3323     size_t align = bdrv_opt_mem_align(bs);
3324     IO_CODE();
3325
3326     /* Ensure that NULL is never returned on success */
3327     assert(align > 0);
3328     if (size == 0) {
3329         size = align;
3330     }
3331
3332     return qemu_try_memalign(align, size);
3333 }
3334
3335 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
3336 {
3337     void *mem = qemu_try_blockalign(bs, size);
3338     IO_CODE();
3339
3340     if (mem) {
3341         memset(mem, 0, size);
3342     }
3343
3344     return mem;
3345 }
3346
3347 /* Helper that undoes bdrv_register_buf() when it fails partway through */
3348 static void GRAPH_RDLOCK
3349 bdrv_register_buf_rollback(BlockDriverState *bs, void *host, size_t size,
3350                            BdrvChild *final_child)
3351 {
3352     BdrvChild *child;
3353
3354     GLOBAL_STATE_CODE();
3355     assert_bdrv_graph_readable();
3356
3357     QLIST_FOREACH(child, &bs->children, next) {
3358         if (child == final_child) {
3359             break;
3360         }
3361
3362         bdrv_unregister_buf(child->bs, host, size);
3363     }
3364
3365     if (bs->drv && bs->drv->bdrv_unregister_buf) {
3366         bs->drv->bdrv_unregister_buf(bs, host, size);
3367     }
3368 }
3369
3370 bool bdrv_register_buf(BlockDriverState *bs, void *host, size_t size,
3371                        Error **errp)
3372 {
3373     BdrvChild *child;
3374
3375     GLOBAL_STATE_CODE();
3376     GRAPH_RDLOCK_GUARD_MAINLOOP();
3377
3378     if (bs->drv && bs->drv->bdrv_register_buf) {
3379         if (!bs->drv->bdrv_register_buf(bs, host, size, errp)) {
3380             return false;
3381         }
3382     }
3383     QLIST_FOREACH(child, &bs->children, next) {
3384         if (!bdrv_register_buf(child->bs, host, size, errp)) {
3385             bdrv_register_buf_rollback(bs, host, size, child);
3386             return false;
3387         }
3388     }
3389     return true;
3390 }
3391
3392 void bdrv_unregister_buf(BlockDriverState *bs, void *host, size_t size)
3393 {
3394     BdrvChild *child;
3395
3396     GLOBAL_STATE_CODE();
3397     GRAPH_RDLOCK_GUARD_MAINLOOP();
3398
3399     if (bs->drv && bs->drv->bdrv_unregister_buf) {
3400         bs->drv->bdrv_unregister_buf(bs, host, size);
3401     }
3402     QLIST_FOREACH(child, &bs->children, next) {
3403         bdrv_unregister_buf(child->bs, host, size);
3404     }
3405 }
3406
/*
 * Common implementation of bdrv_co_copy_range_from() and
 * bdrv_co_copy_range_to().
 *
 * @recurse_src selects which side is driven: if true, the request is tracked
 * as a read on @src and the source driver's bdrv_co_copy_range_from callback
 * is called; if false, it is tracked as a write on @dst and the destination
 * driver's bdrv_co_copy_range_to callback is called.
 *
 * If @write_flags contains BDRV_REQ_ZERO_WRITE, the request is turned into
 * bdrv_co_pwrite_zeroes() on @dst and @src is not looked at.
 *
 * Returns:
 *   -ENOMEDIUM  if @dst (or, unless zero-writing, @src) is missing or has
 *               no medium inserted;
 *   -ENOTSUP    if the source driver lacks bdrv_co_copy_range_from, the
 *               destination driver lacks bdrv_co_copy_range_to, or either
 *               node is encrypted;
 *   otherwise the result of bdrv_check_request32(),
 *   bdrv_co_write_req_prepare() or the driver callback.
 */
static int coroutine_fn GRAPH_RDLOCK bdrv_co_copy_range_internal(
        BdrvChild *src, int64_t src_offset, BdrvChild *dst,
        int64_t dst_offset, int64_t bytes,
        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
        bool recurse_src)
{
    BdrvTrackedRequest req;
    int ret;
    assert_bdrv_graph_readable();

    /* TODO We can support BDRV_REQ_NO_FALLBACK here */
    assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
    assert(!(write_flags & BDRV_REQ_NO_FALLBACK));
    assert(!(read_flags & BDRV_REQ_NO_WAIT));
    assert(!(write_flags & BDRV_REQ_NO_WAIT));

    /* The destination is validated first; zero writes need nothing else */
    if (!dst || !dst->bs || !bdrv_co_is_inserted(dst->bs)) {
        return -ENOMEDIUM;
    }
    ret = bdrv_check_request32(dst_offset, bytes, NULL, 0);
    if (ret) {
        return ret;
    }
    if (write_flags & BDRV_REQ_ZERO_WRITE) {
        return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
    }

    if (!src || !src->bs || !bdrv_co_is_inserted(src->bs)) {
        return -ENOMEDIUM;
    }
    ret = bdrv_check_request32(src_offset, bytes, NULL, 0);
    if (ret) {
        return ret;
    }

    /* Both callbacks are required, whichever of them is called below */
    if (!src->bs->drv->bdrv_co_copy_range_from
        || !dst->bs->drv->bdrv_co_copy_range_to
        || src->bs->encrypted || dst->bs->encrypted) {
        return -ENOTSUP;
    }

    if (recurse_src) {
        /* Source side: tracked as a read request on src->bs */
        bdrv_inc_in_flight(src->bs);
        tracked_request_begin(&req, src->bs, src_offset, bytes,
                              BDRV_TRACKED_READ);

        /* BDRV_REQ_SERIALISING is only for write operation */
        assert(!(read_flags & BDRV_REQ_SERIALISING));
        bdrv_wait_serialising_requests(&req);

        ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
                                                    src, src_offset,
                                                    dst, dst_offset,
                                                    bytes,
                                                    read_flags, write_flags);

        tracked_request_end(&req);
        bdrv_dec_in_flight(src->bs);
    } else {
        /* Destination side: tracked as a write request on dst->bs */
        bdrv_inc_in_flight(dst->bs);
        tracked_request_begin(&req, dst->bs, dst_offset, bytes,
                              BDRV_TRACKED_WRITE);
        ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
                                        write_flags);
        if (!ret) {
            ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
                                                      src, src_offset,
                                                      dst, dst_offset,
                                                      bytes,
                                                      read_flags, write_flags);
        }
        /* Called even if the prepare step failed, with its error in ret */
        bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
        tracked_request_end(&req);
        bdrv_dec_in_flight(dst->bs);
    }

    return ret;
}
3485
3486 /* Copy range from @src to @dst.
3487  *
3488  * See the comment of bdrv_co_copy_range for the parameter and return value
3489  * semantics. */
3490 int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset,
3491                                          BdrvChild *dst, int64_t dst_offset,
3492                                          int64_t bytes,
3493                                          BdrvRequestFlags read_flags,
3494                                          BdrvRequestFlags write_flags)
3495 {
3496     IO_CODE();
3497     assert_bdrv_graph_readable();
3498     trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
3499                                   read_flags, write_flags);
3500     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3501                                        bytes, read_flags, write_flags, true);
3502 }
3503
3504 /* Copy range from @src to @dst.
3505  *
3506  * See the comment of bdrv_co_copy_range for the parameter and return value
3507  * semantics. */
3508 int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset,
3509                                        BdrvChild *dst, int64_t dst_offset,
3510                                        int64_t bytes,
3511                                        BdrvRequestFlags read_flags,
3512                                        BdrvRequestFlags write_flags)
3513 {
3514     IO_CODE();
3515     assert_bdrv_graph_readable();
3516     trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3517                                 read_flags, write_flags);
3518     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3519                                        bytes, read_flags, write_flags, false);
3520 }
3521
3522 int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset,
3523                                     BdrvChild *dst, int64_t dst_offset,
3524                                     int64_t bytes, BdrvRequestFlags read_flags,
3525                                     BdrvRequestFlags write_flags)
3526 {
3527     IO_CODE();
3528     assert_bdrv_graph_readable();
3529
3530     return bdrv_co_copy_range_from(src, src_offset,
3531                                    dst, dst_offset,
3532                                    bytes, read_flags, write_flags);
3533 }
3534
3535 static void coroutine_fn GRAPH_RDLOCK
3536 bdrv_parent_cb_resize(BlockDriverState *bs)
3537 {
3538     BdrvChild *c;
3539
3540     assert_bdrv_graph_readable();
3541
3542     QLIST_FOREACH(c, &bs->parents, next_parent) {
3543         if (c->klass->resize) {
3544             c->klass->resize(c);
3545         }
3546     }
3547 }
3548
3549 /**
3550  * Truncate file to 'offset' bytes (needed only for file protocols)
3551  *
3552  * If 'exact' is true, the file must be resized to exactly the given
3553  * 'offset'.  Otherwise, it is sufficient for the node to be at least
3554  * 'offset' bytes in length.
3555  */
3556 int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
3557                                   PreallocMode prealloc, BdrvRequestFlags flags,
3558                                   Error **errp)
3559 {
3560     BlockDriverState *bs = child->bs;
3561     BdrvChild *filtered, *backing;
3562     BlockDriver *drv = bs->drv;
3563     BdrvTrackedRequest req;
3564     int64_t old_size, new_bytes;
3565     int ret;
3566     IO_CODE();
3567     assert_bdrv_graph_readable();
3568
3569     /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
3570     if (!drv) {
3571         error_setg(errp, "No medium inserted");
3572         return -ENOMEDIUM;
3573     }
3574     if (offset < 0) {
3575         error_setg(errp, "Image size cannot be negative");
3576         return -EINVAL;
3577     }
3578
3579     ret = bdrv_check_request(offset, 0, errp);
3580     if (ret < 0) {
3581         return ret;
3582     }
3583
3584     old_size = bdrv_co_getlength(bs);
3585     if (old_size < 0) {
3586         error_setg_errno(errp, -old_size, "Failed to get old image size");
3587         return old_size;
3588     }
3589
3590     if (bdrv_is_read_only(bs)) {
3591         error_setg(errp, "Image is read-only");
3592         return -EACCES;
3593     }
3594
3595     if (offset > old_size) {
3596         new_bytes = offset - old_size;
3597     } else {
3598         new_bytes = 0;
3599     }
3600
3601     bdrv_inc_in_flight(bs);
3602     tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
3603                           BDRV_TRACKED_TRUNCATE);
3604
3605     /* If we are growing the image and potentially using preallocation for the
3606      * new area, we need to make sure that no write requests are made to it
3607      * concurrently or they might be overwritten by preallocation. */
3608     if (new_bytes) {
3609         bdrv_make_request_serialising(&req, 1);
3610     }
3611     ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
3612                                     0);
3613     if (ret < 0) {
3614         error_setg_errno(errp, -ret,
3615                          "Failed to prepare request for truncation");
3616         goto out;
3617     }
3618
3619     filtered = bdrv_filter_child(bs);
3620     backing = bdrv_cow_child(bs);
3621
3622     /*
3623      * If the image has a backing file that is large enough that it would
3624      * provide data for the new area, we cannot leave it unallocated because
3625      * then the backing file content would become visible. Instead, zero-fill
3626      * the new area.
3627      *
3628      * Note that if the image has a backing file, but was opened without the
3629      * backing file, taking care of keeping things consistent with that backing
3630      * file is the user's responsibility.
3631      */
3632     if (new_bytes && backing) {
3633         int64_t backing_len;
3634
3635         backing_len = bdrv_co_getlength(backing->bs);
3636         if (backing_len < 0) {
3637             ret = backing_len;
3638             error_setg_errno(errp, -ret, "Could not get backing file size");
3639             goto out;
3640         }
3641
3642         if (backing_len > old_size) {
3643             flags |= BDRV_REQ_ZERO_WRITE;
3644         }
3645     }
3646
3647     if (drv->bdrv_co_truncate) {
3648         if (flags & ~bs->supported_truncate_flags) {
3649             error_setg(errp, "Block driver does not support requested flags");
3650             ret = -ENOTSUP;
3651             goto out;
3652         }
3653         ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
3654     } else if (filtered) {
3655         ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp);
3656     } else {
3657         error_setg(errp, "Image format driver does not support resize");
3658         ret = -ENOTSUP;
3659         goto out;
3660     }
3661     if (ret < 0) {
3662         goto out;
3663     }
3664
3665     ret = bdrv_co_refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3666     if (ret < 0) {
3667         error_setg_errno(errp, -ret, "Could not refresh total sector count");
3668     } else {
3669         offset = bs->total_sectors * BDRV_SECTOR_SIZE;
3670     }
3671     /*
3672      * It's possible that truncation succeeded but bdrv_refresh_total_sectors
3673      * failed, but the latter doesn't affect how we should finish the request.
3674      * Pass 0 as the last parameter so that dirty bitmaps etc. are handled.
3675      */
3676     bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);
3677
3678 out:
3679     tracked_request_end(&req);
3680     bdrv_dec_in_flight(bs);
3681
3682     return ret;
3683 }
3684
3685 void bdrv_cancel_in_flight(BlockDriverState *bs)
3686 {
3687     GLOBAL_STATE_CODE();
3688     GRAPH_RDLOCK_GUARD_MAINLOOP();
3689
3690     if (!bs || !bs->drv) {
3691         return;
3692     }
3693
3694     if (bs->drv->bdrv_cancel_in_flight) {
3695         bs->drv->bdrv_cancel_in_flight(bs);
3696     }
3697 }
3698
3699 int coroutine_fn
3700 bdrv_co_preadv_snapshot(BdrvChild *child, int64_t offset, int64_t bytes,
3701                         QEMUIOVector *qiov, size_t qiov_offset)
3702 {
3703     BlockDriverState *bs = child->bs;
3704     BlockDriver *drv = bs->drv;
3705     int ret;
3706     IO_CODE();
3707     assert_bdrv_graph_readable();
3708
3709     if (!drv) {
3710         return -ENOMEDIUM;
3711     }
3712
3713     if (!drv->bdrv_co_preadv_snapshot) {
3714         return -ENOTSUP;
3715     }
3716
3717     bdrv_inc_in_flight(bs);
3718     ret = drv->bdrv_co_preadv_snapshot(bs, offset, bytes, qiov, qiov_offset);
3719     bdrv_dec_in_flight(bs);
3720
3721     return ret;
3722 }
3723
3724 int coroutine_fn
3725 bdrv_co_snapshot_block_status(BlockDriverState *bs,
3726                               bool want_zero, int64_t offset, int64_t bytes,
3727                               int64_t *pnum, int64_t *map,
3728                               BlockDriverState **file)
3729 {
3730     BlockDriver *drv = bs->drv;
3731     int ret;
3732     IO_CODE();
3733     assert_bdrv_graph_readable();
3734
3735     if (!drv) {
3736         return -ENOMEDIUM;
3737     }
3738
3739     if (!drv->bdrv_co_snapshot_block_status) {
3740         return -ENOTSUP;
3741     }
3742
3743     bdrv_inc_in_flight(bs);
3744     ret = drv->bdrv_co_snapshot_block_status(bs, want_zero, offset, bytes,
3745                                              pnum, map, file);
3746     bdrv_dec_in_flight(bs);
3747
3748     return ret;
3749 }
3750
3751 int coroutine_fn
3752 bdrv_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes)
3753 {
3754     BlockDriver *drv = bs->drv;
3755     int ret;
3756     IO_CODE();
3757     assert_bdrv_graph_readable();
3758
3759     if (!drv) {
3760         return -ENOMEDIUM;
3761     }
3762
3763     if (!drv->bdrv_co_pdiscard_snapshot) {
3764         return -ENOTSUP;
3765     }
3766
3767     bdrv_inc_in_flight(bs);
3768     ret = drv->bdrv_co_pdiscard_snapshot(bs, offset, bytes);
3769     bdrv_dec_in_flight(bs);
3770
3771     return ret;
3772 }