block/io.c

   1 /*
   2  * Block layer I/O functions
   3  *
   4  * Copyright (c) 2003 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "trace.h"
  27 #include "sysemu/block-backend.h"
  28 #include "block/aio-wait.h"
  29 #include "block/blockjob.h"
  30 #include "block/blockjob_int.h"
  31 #include "block/block_int.h"
  32 #include "qemu/cutils.h"
  33 #include "qapi/error.h"
  34 #include "qemu/error-report.h"
  35
/*
 * Sentinel stored in RwCo.ret by bdrv_prwv_co() until the request coroutine
 * overwrites it with the real result.
 */
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

/*
 * Wait object for bdrv_drain_all_begin(): it is kicked from bdrv_wakeup()
 * and polled on by AIO_WAIT_WHILE() in bdrv_drain_all_begin().
 */
static AioWait drain_all_aio_wait;

/* Forward declarations of static helpers used before their definition */
static void bdrv_parent_cb_resize(BlockDriverState *bs);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);
  46
  47 void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
  48                                bool ignore_bds_parents)
  49 {
  50     BdrvChild *c, *next;
  51
  52     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
  53         if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
  54             continue;
  55         }
  56         bdrv_parent_drained_begin_single(c, false);
  57     }
  58 }
  59
  60 void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
  61                              bool ignore_bds_parents)
  62 {
  63     BdrvChild *c, *next;
  64
  65     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
  66         if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
  67             continue;
  68         }
  69         if (c->role->drained_end) {
  70             c->role->drained_end(c);
  71         }
  72     }
  73 }
  74
  75 static bool bdrv_parent_drained_poll_single(BdrvChild *c)
  76 {
  77     if (c->role->drained_poll) {
  78         return c->role->drained_poll(c);
  79     }
  80     return false;
  81 }
  82
  83 static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
  84                                      bool ignore_bds_parents)
  85 {
  86     BdrvChild *c, *next;
  87     bool busy = false;
  88
  89     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
  90         if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
  91             continue;
  92         }
  93         busy |= bdrv_parent_drained_poll_single(c);
  94     }
  95
  96     return busy;
  97 }
  98
  99 void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
 100 {
 101     if (c->role->drained_begin) {
 102         c->role->drained_begin(c);
 103     }
 104     if (poll) {
 105         BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
 106     }
 107 }
 108
 109 static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
 110 {
 111     dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
 112     dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
 113     dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
 114                                  src->opt_mem_alignment);
 115     dst->min_mem_alignment = MAX(dst->min_mem_alignment,
 116                                  src->min_mem_alignment);
 117     dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
 118 }
 119
 120 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
 121 {
 122     BlockDriver *drv = bs->drv;
 123     Error *local_err = NULL;
 124
 125     memset(&bs->bl, 0, sizeof(bs->bl));
 126
 127     if (!drv) {
 128         return;
 129     }
 130
 131     /* Default alignment based on whether driver has byte interface */
 132     bs->bl.request_alignment = (drv->bdrv_co_preadv ||
 133                                 drv->bdrv_aio_preadv) ? 1 : 512;
 134
 135     /* Take some limits from the children as a default */
 136     if (bs->file) {
 137         bdrv_refresh_limits(bs->file->bs, &local_err);
 138         if (local_err) {
 139             error_propagate(errp, local_err);
 140             return;
 141         }
 142         bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
 143     } else {
 144         bs->bl.min_mem_alignment = 512;
 145         bs->bl.opt_mem_alignment = getpagesize();
 146
 147         /* Safe default since most protocols use readv()/writev()/etc */
 148         bs->bl.max_iov = IOV_MAX;
 149     }
 150
 151     if (bs->backing) {
 152         bdrv_refresh_limits(bs->backing->bs, &local_err);
 153         if (local_err) {
 154             error_propagate(errp, local_err);
 155             return;
 156         }
 157         bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
 158     }
 159
 160     /* Then let the driver override it */
 161     if (drv->bdrv_refresh_limits) {
 162         drv->bdrv_refresh_limits(bs, errp);
 163     }
 164 }
 165
 166 /**
 167  * The copy-on-read flag is actually a reference count so multiple users may
 168  * use the feature without worrying about clobbering its previous state.
 169  * Copy-on-read stays enabled until all users have called to disable it.
 170  */
 171 void bdrv_enable_copy_on_read(BlockDriverState *bs)
 172 {
 173     atomic_inc(&bs->copy_on_read);
 174 }
 175
 176 void bdrv_disable_copy_on_read(BlockDriverState *bs)
 177 {
 178     int old = atomic_fetch_dec(&bs->copy_on_read);
 179     assert(old >= 1);
 180 }
 181
/*
 * Parameters and completion state of a drain operation that is handed to a
 * coroutine (bdrv_drain_invoke) or to a bottom half (bdrv_co_yield_to_drain).
 */
typedef struct {
    Coroutine *co;              /* coroutine running the callback, or the
                                 * coroutine to wake when the BH is done */
    BlockDriverState *bs;       /* node to drain; NULL requests drain-all
                                 * in bdrv_co_drain_bh_cb() */
    bool done;                  /* set once the operation has completed */
    bool begin;                 /* true: drained_begin, false: drained_end */
    bool recursive;             /* forwarded to bdrv_do_drained_begin/end */
    bool poll;                  /* forwarded to bdrv_do_drained_begin */
    BdrvChild *parent;          /* forwarded to bdrv_do_drained_begin/end */
    bool ignore_bds_parents;    /* forwarded to bdrv_do_drained_begin/end */
} BdrvCoDrainData;
 192
 193 static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
 194 {
 195     BdrvCoDrainData *data = opaque;
 196     BlockDriverState *bs = data->bs;
 197
 198     if (data->begin) {
 199         bs->drv->bdrv_co_drain_begin(bs);
 200     } else {
 201         bs->drv->bdrv_co_drain_end(bs);
 202     }
 203
 204     /* Set data->done before reading bs->wakeup.  */
 205     atomic_mb_set(&data->done, true);
 206     bdrv_dec_in_flight(bs);
 207
 208     if (data->begin) {
 209         g_free(data);
 210     }
 211 }
 212
/*
 * Call the BlockDriver.bdrv_co_drain_begin (@begin == true) or
 * bdrv_co_drain_end (@begin == false) callback of @bs.
 *
 * The callback runs in a new coroutine scheduled in the AioContext of @bs.
 * Nothing happens if @bs has no driver or the driver lacks the callback.
 * For drain_begin the function returns right after scheduling; for
 * drain_end it polls until the coroutine has finished.
 */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
{
    BdrvCoDrainData *data;

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    /* Heap-allocated because for drain_begin it outlives this function */
    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin
    };

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);

    /* drain_end: wait here and free; drain_begin: the coroutine frees data */
    if (!begin) {
        BDRV_POLL_WHILE(bs, !data->done);
        g_free(data);
    }
}
 241
 242 /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
 243 bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
 244                      BdrvChild *ignore_parent, bool ignore_bds_parents)
 245 {
 246     BdrvChild *child, *next;
 247
 248     if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
 249         return true;
 250     }
 251
 252     if (atomic_read(&bs->in_flight)) {
 253         return true;
 254     }
 255
 256     if (recursive) {
 257         assert(!ignore_bds_parents);
 258         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
 259             if (bdrv_drain_poll(child->bs, recursive, child, false)) {
 260                 return true;
 261             }
 262         }
 263     }
 264
 265     return false;
 266 }
 267
 268 static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
 269                                       BdrvChild *ignore_parent)
 270 {
 271     return bdrv_drain_poll(bs, recursive, ignore_parent, false);
 272 }
 273
/*
 * Forward declarations: bdrv_co_drain_bh_cb() below calls these before
 * their definitions appear.
 */
static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents);
 279
/*
 * Bottom half scheduled by bdrv_co_yield_to_drain().
 *
 * Performs the drain operation described by the BdrvCoDrainData in @opaque
 * (begin or end on data->bs, or drain-all begin when data->bs is NULL),
 * then marks the request as done and wakes the coroutine that yielded.
 */
static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        AioContext *co_ctx = qemu_coroutine_get_aio_context(co);

        /*
         * When the coroutine yielded, the lock for its home context was
         * released, so we need to re-acquire it here. If it explicitly
         * acquired a different context, the lock is still held and we don't
         * want to lock it a second time (or AIO_WAIT_WHILE() would hang).
         */
        if (ctx == co_ctx) {
            aio_context_acquire(ctx);
        }
        /* Pairs with bdrv_inc_in_flight() in bdrv_co_yield_to_drain() */
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents);
        }
        if (ctx == co_ctx) {
            aio_context_release(ctx);
        }
    } else {
        /* No node given: this is a drain-all request, begin direction only */
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    /* Checked by the assertion in bdrv_co_yield_to_drain() after wake-up */
    data->done = true;
    aio_co_wake(co);
}
 318
/*
 * Run a drain operation on behalf of the calling coroutine.
 *
 * The request is packed into a BdrvCoDrainData on the coroutine stack and
 * executed by the bottom half bdrv_co_drain_bh_cb(); the coroutine yields
 * until that bottom half wakes it again. @bs may be NULL, which the bottom
 * half treats as a drain-all request. All other arguments are forwarded to
 * bdrv_do_drained_begin() / bdrv_do_drained_end() according to @begin.
 */
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
    };
    /* Released again by the bottom half before it starts the drain */
    if (bs) {
        bdrv_inc_in_flight(bs);
    }
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);

    /* @data stays valid: this stack frame lives until we are resumed */
    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}
 352
 353 void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
 354                                    BdrvChild *parent, bool ignore_bds_parents)
 355 {
 356     assert(!qemu_in_coroutine());
 357
 358     /* Stop things in parent-to-child order */
 359     if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
 360         aio_disable_external(bdrv_get_aio_context(bs));
 361     }
 362
 363     bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
 364     bdrv_drain_invoke(bs, true);
 365 }
 366
/*
 * Enter a drained section on @bs.
 *
 * @recursive:          also drain every node below @bs
 * @parent:             parent link that is skipped when notifying parents
 * @ignore_bds_parents: skip parents whose role has parent_is_bds set; must
 *                      be false when @recursive or @poll is true
 * @poll:               wait until the drain poll condition becomes false
 *
 * When called from a coroutine, the whole operation is delegated to
 * bdrv_co_yield_to_drain().
 */
static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        /* Remembered so bdrv_apply_subtree_drain() can replay it */
        bs->recursive_quiesce_counter++;
        /* Children are quiesced without polling; see the comment below */
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}
 404
 405 void bdrv_drained_begin(BlockDriverState *bs)
 406 {
 407     bdrv_do_drained_begin(bs, false, NULL, false, true);
 408 }
 409
 410 void bdrv_subtree_drained_begin(BlockDriverState *bs)
 411 {
 412     bdrv_do_drained_begin(bs, true, NULL, false, true);
 413 }
 414
/*
 * Leave a drained section on @bs; the counterpart of bdrv_do_drained_begin().
 *
 * @recursive:          also end the section on every node below @bs
 * @parent:             parent link that is skipped when notifying parents
 * @ignore_bds_parents: skip parents whose role has parent_is_bds set; must
 *                      be false when @recursive is true
 *
 * When called from a coroutine, the whole operation is delegated to
 * bdrv_co_yield_to_drain().
 */
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;
    int old_quiesce_counter;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false);
        return;
    }
    /* An end without a matching begin is a caller bug */
    assert(bs->quiesce_counter > 0);
    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents);
    /* Last section on this node gone: undo aio_disable_external() */
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents);
        }
    }
}
 444
 445 void bdrv_drained_end(BlockDriverState *bs)
 446 {
 447     bdrv_do_drained_end(bs, false, NULL, false);
 448 }
 449
 450 void bdrv_subtree_drained_end(BlockDriverState *bs)
 451 {
 452     bdrv_do_drained_end(bs, true, NULL, false);
 453 }
 454
 455 void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
 456 {
 457     int i;
 458
 459     for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
 460         bdrv_do_drained_begin(child->bs, true, child, false, true);
 461     }
 462 }
 463
 464 void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
 465 {
 466     int i;
 467
 468     for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
 469         bdrv_do_drained_end(child->bs, true, child, false);
 470     }
 471 }
 472
/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
 *
 * Coroutine-only variant of bdrv_drain(): it asserts that it runs in
 * coroutine context and then opens and immediately closes a drained
 * section on @bs.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}
 486
/*
 * Drain @bs once: open a drained section with bdrv_drained_begin(), which
 * waits for the node to become quiescent, and close it again right away.
 */
void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}
 492
 493 static void bdrv_drain_assert_idle(BlockDriverState *bs)
 494 {
 495     BdrvChild *child, *next;
 496
 497     assert(atomic_read(&bs->in_flight) == 0);
 498     QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
 499         bdrv_drain_assert_idle(child->bs);
 500     }
 501 }
 502
/*
 * Nesting depth of drain-all sections: incremented by bdrv_drain_all_begin()
 * and decremented by bdrv_drain_all_end().
 */
unsigned int bdrv_drain_all_count = 0;
 504
 505 static bool bdrv_drain_all_poll(void)
 506 {
 507     BlockDriverState *bs = NULL;
 508     bool result = false;
 509
 510     /* bdrv_drain_poll() can't make changes to the graph and we are holding the
 511      * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
 512     while ((bs = bdrv_next_all_states(bs))) {
 513         AioContext *aio_context = bdrv_get_aio_context(bs);
 514         aio_context_acquire(aio_context);
 515         result |= bdrv_drain_poll(bs, false, NULL, true);
 516         aio_context_release(aio_context);
 517     }
 518
 519     return result;
 520 }
 521
 522 /*
 523  * Wait for pending requests to complete across all BlockDriverStates
 524  *
 525  * This function does not flush data to disk, use bdrv_flush_all() for that
 526  * after calling this function.
 527  *
 528  * This pauses all block jobs and disables external clients. It must
 529  * be paired with bdrv_drain_all_end().
 530  *
 531  * NOTE: no new block jobs or BlockDriverStates can be created between
 532  * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 533  */
 534 void bdrv_drain_all_begin(void)
 535 {
 536     BlockDriverState *bs = NULL;
 537
 538     if (qemu_in_coroutine()) {
 539         bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
 540         return;
 541     }
 542
 543     /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
 544      * loop AioContext, so make sure we're in the main context. */
 545     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 546     assert(bdrv_drain_all_count < INT_MAX);
 547     bdrv_drain_all_count++;
 548
 549     /* Quiesce all nodes, without polling in-flight requests yet. The graph
 550      * cannot change during this loop. */
 551     while ((bs = bdrv_next_all_states(bs))) {
 552         AioContext *aio_context = bdrv_get_aio_context(bs);
 553
 554         aio_context_acquire(aio_context);
 555         bdrv_do_drained_begin(bs, false, NULL, true, false);
 556         aio_context_release(aio_context);
 557     }
 558
 559     /* Now poll the in-flight requests */
 560     AIO_WAIT_WHILE(&drain_all_aio_wait, NULL, bdrv_drain_all_poll());
 561
 562     while ((bs = bdrv_next_all_states(bs))) {
 563         bdrv_drain_assert_idle(bs);
 564     }
 565 }
 566
 567 void bdrv_drain_all_end(void)
 568 {
 569     BlockDriverState *bs = NULL;
 570
 571     while ((bs = bdrv_next_all_states(bs))) {
 572         AioContext *aio_context = bdrv_get_aio_context(bs);
 573
 574         aio_context_acquire(aio_context);
 575         bdrv_do_drained_end(bs, false, NULL, true);
 576         aio_context_release(aio_context);
 577     }
 578
 579     assert(bdrv_drain_all_count > 0);
 580     bdrv_drain_all_count--;
 581 }
 582
/*
 * Drain all nodes once: open a drain-all section with
 * bdrv_drain_all_begin() and close it again right away.
 */
void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}
 588
 589 /**
 590  * Remove an active request from the tracked requests list
 591  *
 592  * This function should be called when a tracked request is completing.
 593  */
 594 static void tracked_request_end(BdrvTrackedRequest *req)
 595 {
 596     if (req->serialising) {
 597         atomic_dec(&req->bs->serialising_in_flight);
 598     }
 599
 600     qemu_co_mutex_lock(&req->bs->reqs_lock);
 601     QLIST_REMOVE(req, list);
 602     qemu_co_queue_restart_all(&req->wait_queue);
 603     qemu_co_mutex_unlock(&req->bs->reqs_lock);
 604 }
 605
 606 /**
 607  * Add an active request to the tracked requests list
 608  */
 609 static void tracked_request_begin(BdrvTrackedRequest *req,
 610                                   BlockDriverState *bs,
 611                                   int64_t offset,
 612                                   uint64_t bytes,
 613                                   enum BdrvTrackedRequestType type)
 614 {
 615     assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);
 616
 617     *req = (BdrvTrackedRequest){
 618         .bs = bs,
 619         .offset         = offset,
 620         .bytes          = bytes,
 621         .type           = type,
 622         .co             = qemu_coroutine_self(),
 623         .serialising    = false,
 624         .overlap_offset = offset,
 625         .overlap_bytes  = bytes,
 626     };
 627
 628     qemu_co_queue_init(&req->wait_queue);
 629
 630     qemu_co_mutex_lock(&bs->reqs_lock);
 631     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
 632     qemu_co_mutex_unlock(&bs->reqs_lock);
 633 }
 634
 635 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
 636 {
 637     int64_t overlap_offset = req->offset & ~(align - 1);
 638     uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
 639                                - overlap_offset;
 640
 641     if (!req->serialising) {
 642         atomic_inc(&req->bs->serialising_in_flight);
 643         req->serialising = true;
 644     }
 645
 646     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
 647     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
 648 }
 649
 650 static bool is_request_serialising_and_aligned(BdrvTrackedRequest *req)
 651 {
 652     /*
 653      * If the request is serialising, overlap_offset and overlap_bytes are set,
 654      * so we can check if the request is aligned. Otherwise, don't care and
 655      * return false.
 656      */
 657
 658     return req->serialising && (req->offset == req->overlap_offset) &&
 659            (req->bytes == req->overlap_bytes);
 660 }
 661
 662 /**
 663  * Round a region to cluster boundaries
 664  */
 665 void bdrv_round_to_clusters(BlockDriverState *bs,
 666                             int64_t offset, int64_t bytes,
 667                             int64_t *cluster_offset,
 668                             int64_t *cluster_bytes)
 669 {
 670     BlockDriverInfo bdi;
 671
 672     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
 673         *cluster_offset = offset;
 674         *cluster_bytes = bytes;
 675     } else {
 676         int64_t c = bdi.cluster_size;
 677         *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
 678         *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
 679     }
 680 }
 681
 682 static int bdrv_get_cluster_size(BlockDriverState *bs)
 683 {
 684     BlockDriverInfo bdi;
 685     int ret;
 686
 687     ret = bdrv_get_info(bs, &bdi);
 688     if (ret < 0 || bdi.cluster_size == 0) {
 689         return bs->bl.request_alignment;
 690     } else {
 691         return bdi.cluster_size;
 692     }
 693 }
 694
 695 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
 696                                      int64_t offset, uint64_t bytes)
 697 {
 698     /*        aaaa   bbbb */
 699     if (offset >= req->overlap_offset + req->overlap_bytes) {
 700         return false;
 701     }
 702     /* bbbb   aaaa        */
 703     if (req->overlap_offset >= offset + bytes) {
 704         return false;
 705     }
 706     return true;
 707 }
 708
 709 void bdrv_inc_in_flight(BlockDriverState *bs)
 710 {
 711     atomic_inc(&bs->in_flight);
 712 }
 713
/*
 * Kick anyone who may be polling on @bs: both the node's own AioWait and
 * the global one that bdrv_drain_all_begin() waits on.
 */
void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick(bdrv_get_aio_wait(bs));
    aio_wait_kick(&drain_all_aio_wait);
}
 719
 720 void bdrv_dec_in_flight(BlockDriverState *bs)
 721 {
 722     atomic_dec(&bs->in_flight);
 723     bdrv_wakeup(bs);
 724 }
 725
/*
 * Wait until no tracked request on self->bs conflicts with @self.
 *
 * Two requests conflict when at least one of them is serialising and the
 * overlap window of @self intersects the overlap window of the other one.
 * Returns true if the coroutine had to wait at least once, false otherwise
 * (including the fast path where the node has no serialising request in
 * flight at all).
 */
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    /* Fast path: nothing on this node is serialising */
    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    do {
        retry = false;
        qemu_co_mutex_lock(&bs->reqs_lock);
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            /* Skip ourselves and pairs where neither side serialises */
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    /* The list may have changed while waiting: rescan it */
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
        qemu_co_mutex_unlock(&bs->reqs_lock);
    } while (retry);

    return waited;
}
 771
 772 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 773                                    size_t size)
 774 {
 775     if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
 776         return -EIO;
 777     }
 778
 779     if (!bdrv_is_inserted(bs)) {
 780         return -ENOMEDIUM;
 781     }
 782
 783     if (offset < 0) {
 784         return -EIO;
 785     }
 786
 787     return 0;
 788 }
 789
/*
 * Arguments and result of a synchronous vectored request that
 * bdrv_prwv_co() hands to bdrv_rw_co_entry().
 */
typedef struct RwCo {
    BdrvChild *child;       /* child the request is issued on */
    int64_t offset;         /* start of the request, in bytes */
    QEMUIOVector *qiov;     /* data buffers; qiov->size is the length */
    bool is_write;          /* true: bdrv_co_pwritev, false: bdrv_co_preadv */
    int ret;                /* NOT_DONE until the coroutine stores a result */
    BdrvRequestFlags flags; /* passed through to the coroutine function */
} RwCo;
 798
 799 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
 800 {
 801     RwCo *rwco = opaque;
 802
 803     if (!rwco->is_write) {
 804         rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
 805                                    rwco->qiov->size, rwco->qiov,
 806                                    rwco->flags);
 807     } else {
 808         rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
 809                                     rwco->qiov->size, rwco->qiov,
 810                                     rwco->flags);
 811     }
 812 }
 813
 814 /*
 815  * Process a vectored synchronous request using coroutines
 816  */
 817 static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
 818                         QEMUIOVector *qiov, bool is_write,
 819                         BdrvRequestFlags flags)
 820 {
 821     Coroutine *co;
 822     RwCo rwco = {
 823         .child = child,
 824         .offset = offset,
 825         .qiov = qiov,
 826         .is_write = is_write,
 827         .ret = NOT_DONE,
 828         .flags = flags,
 829     };
 830
 831     if (qemu_in_coroutine()) {
 832         /* Fast-path if already in coroutine context */
 833         bdrv_rw_co_entry(&rwco);
 834     } else {
 835         co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
 836         bdrv_coroutine_enter(child->bs, co);
 837         BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
 838     }
 839     return rwco.ret;
 840 }
 841
 842 /*
 843  * Process a synchronous request using coroutines
 844  */
 845 static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
 846                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
 847 {
 848     QEMUIOVector qiov;
 849     struct iovec iov = {
 850         .iov_base = (void *)buf,
 851         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
 852     };
 853
 854     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
 855         return -EINVAL;
 856     }
 857
 858     qemu_iovec_init_external(&qiov, &iov, 1);
 859     return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
 860                         &qiov, is_write, flags);
 861 }
 862
 863 /* return < 0 if error. See bdrv_write() for the return codes */
 864 int bdrv_read(BdrvChild *child, int64_t sector_num,
 865               uint8_t *buf, int nb_sectors)
 866 {
 867     return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
 868 }
 869
 870 /* Return < 0 if error. Important errors are:
 871   -EIO         generic I/O error (may happen for all errors)
 872   -ENOMEDIUM   No media inserted.
 873   -EINVAL      Invalid sector number or nb_sectors
 874   -EACCES      Trying to write a read-only device
 875 */
 876 int bdrv_write(BdrvChild *child, int64_t sector_num,
 877                const uint8_t *buf, int nb_sectors)
 878 {
 879     return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
 880 }
 881
 882 int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
 883                        int bytes, BdrvRequestFlags flags)
 884 {
 885     QEMUIOVector qiov;
 886     struct iovec iov = {
 887         .iov_base = NULL,
 888         .iov_len = bytes,
 889     };
 890
 891     qemu_iovec_init_external(&qiov, &iov, 1);
 892     return bdrv_prwv_co(child, offset, &qiov, true,
 893                         BDRV_REQ_ZERO_WRITE | flags);
 894 }
 895
 896 /*
 897  * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 898  * The operation is sped up by checking the block status and only writing
 899  * zeroes to the device if they currently do not return zeroes. Optional
 900  * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 901  * BDRV_REQ_FUA).
 902  *
 903  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 904  */
 905 int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
 906 {
 907     int ret;
 908     int64_t target_size, bytes, offset = 0;
 909     BlockDriverState *bs = child->bs;
 910
 911     target_size = bdrv_getlength(bs);
 912     if (target_size < 0) {
 913         return target_size;
 914     }
 915
 916     for (;;) {
 917         bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
 918         if (bytes <= 0) {
 919             return 0;
 920         }
 921         ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
 922         if (ret < 0) {
 923             error_report("error getting block status at offset %" PRId64 ": %s",
 924                          offset, strerror(-ret));
 925             return ret;
 926         }
 927         if (ret & BDRV_BLOCK_ZERO) {
 928             offset += bytes;
 929             continue;
 930         }
 931         ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
 932         if (ret < 0) {
 933             error_report("error writing zeroes at offset %" PRId64 ": %s",
 934                          offset, strerror(-ret));
 935             return ret;
 936         }
 937         offset += bytes;
 938     }
 939 }
 940
 941 int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
 942 {
 943     int ret;
 944
 945     ret = bdrv_prwv_co(child, offset, qiov, false, 0);
 946     if (ret < 0) {
 947         return ret;
 948     }
 949
 950     return qiov->size;
 951 }
 952
 953 int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
 954 {
 955     QEMUIOVector qiov;
 956     struct iovec iov = {
 957         .iov_base = (void *)buf,
 958         .iov_len = bytes,
 959     };
 960
 961     if (bytes < 0) {
 962         return -EINVAL;
 963     }
 964
 965     qemu_iovec_init_external(&qiov, &iov, 1);
 966     return bdrv_preadv(child, offset, &qiov);
 967 }
 968
 969 int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
 970 {
 971     int ret;
 972
 973     ret = bdrv_prwv_co(child, offset, qiov, true, 0);
 974     if (ret < 0) {
 975         return ret;
 976     }
 977
 978     return qiov->size;
 979 }
 980
 981 int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
 982 {
 983     QEMUIOVector qiov;
 984     struct iovec iov = {
 985         .iov_base   = (void *) buf,
 986         .iov_len    = bytes,
 987     };
 988
 989     if (bytes < 0) {
 990         return -EINVAL;
 991     }
 992
 993     qemu_iovec_init_external(&qiov, &iov, 1);
 994     return bdrv_pwritev(child, offset, &qiov);
 995 }
 996
 997 /*
 998  * Writes to the file and ensures that no writes are reordered across this
 999  * request (acts as a barrier)
1000  *
1001  * Returns 0 on success, -errno in error cases.
1002  */
1003 int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
1004                      const void *buf, int count)
1005 {
1006     int ret;
1007
1008     ret = bdrv_pwrite(child, offset, buf, count);
1009     if (ret < 0) {
1010         return ret;
1011     }
1012
1013     ret = bdrv_flush(child->bs);
1014     if (ret < 0) {
1015         return ret;
1016     }
1017
1018     return 0;
1019 }
1020
/*
 * Completion record shared between a coroutine that issued an AIO request
 * and bdrv_co_io_em_complete(), which stores the result and wakes the
 * coroutine again.
 */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine to wake when the request completes */
    int ret;                /* result handed to the completion callback */
} CoroutineIOCompletion;
1025
1026 static void bdrv_co_io_em_complete(void *opaque, int ret)
1027 {
1028     CoroutineIOCompletion *co = opaque;
1029
1030     co->ret = ret;
1031     aio_co_wake(co->coroutine);
1032 }
1033
1034 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
1035                                            uint64_t offset, uint64_t bytes,
1036                                            QEMUIOVector *qiov, int flags)
1037 {
1038     BlockDriver *drv = bs->drv;
1039     int64_t sector_num;
1040     unsigned int nb_sectors;
1041
1042     assert(!(flags & ~BDRV_REQ_MASK));
1043
1044     if (!drv) {
1045         return -ENOMEDIUM;
1046     }
1047
1048     if (drv->bdrv_co_preadv) {
1049         return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
1050     }
1051
1052     if (drv->bdrv_aio_preadv) {
1053         BlockAIOCB *acb;
1054         CoroutineIOCompletion co = {
1055             .coroutine = qemu_coroutine_self(),
1056         };
1057
1058         acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
1059                                    bdrv_co_io_em_complete, &co);
1060         if (acb == NULL) {
1061             return -EIO;
1062         } else {
1063             qemu_coroutine_yield();
1064             return co.ret;
1065         }
1066     }
1067
1068     sector_num = offset >> BDRV_SECTOR_BITS;
1069     nb_sectors = bytes >> BDRV_SECTOR_BITS;
1070
1071     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
1072     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
1073     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
1074     assert(drv->bdrv_co_readv);
1075
1076     return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1077 }
1078
1079 static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
1080                                             uint64_t offset, uint64_t bytes,
1081                                             QEMUIOVector *qiov, int flags)
1082 {
1083     BlockDriver *drv = bs->drv;
1084     int64_t sector_num;
1085     unsigned int nb_sectors;
1086     int ret;
1087
1088     assert(!(flags & ~BDRV_REQ_MASK));
1089
1090     if (!drv) {
1091         return -ENOMEDIUM;
1092     }
1093
1094     if (drv->bdrv_co_pwritev) {
1095         ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
1096                                    flags & bs->supported_write_flags);
1097         flags &= ~bs->supported_write_flags;
1098         goto emulate_flags;
1099     }
1100
1101     if (drv->bdrv_aio_pwritev) {
1102         BlockAIOCB *acb;
1103         CoroutineIOCompletion co = {
1104             .coroutine = qemu_coroutine_self(),
1105         };
1106
1107         acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
1108                                     flags & bs->supported_write_flags,
1109                                     bdrv_co_io_em_complete, &co);
1110         flags &= ~bs->supported_write_flags;
1111         if (acb == NULL) {
1112             ret = -EIO;
1113         } else {
1114             qemu_coroutine_yield();
1115             ret = co.ret;
1116         }
1117         goto emulate_flags;
1118     }
1119
1120     sector_num = offset >> BDRV_SECTOR_BITS;
1121     nb_sectors = bytes >> BDRV_SECTOR_BITS;
1122
1123     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
1124     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
1125     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
1126
1127     assert(drv->bdrv_co_writev);
1128     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
1129                               flags & bs->supported_write_flags);
1130     flags &= ~bs->supported_write_flags;
1131
1132 emulate_flags:
1133     if (ret == 0 && (flags & BDRV_REQ_FUA)) {
1134         ret = bdrv_co_flush(bs);
1135     }
1136
1137     return ret;
1138 }
1139
1140 static int coroutine_fn
1141 bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
1142                                uint64_t bytes, QEMUIOVector *qiov)
1143 {
1144     BlockDriver *drv = bs->drv;
1145
1146     if (!drv) {
1147         return -ENOMEDIUM;
1148     }
1149
1150     if (!drv->bdrv_co_pwritev_compressed) {
1151         return -ENOTSUP;
1152     }
1153
1154     return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
1155 }
1156
/*
 * Serve a copy-on-read request: read @bytes bytes at @offset into @qiov and,
 * for every piece that bdrv_is_allocated() does not report as allocated
 * (including when the query itself fails), write the data that was read
 * back to @bs.
 *
 * The range is first widened to cluster boundaries with
 * bdrv_round_to_clusters() and then processed in a loop, in pieces limited
 * by max_transfer and, for the bounce-buffer path, by MAX_BOUNCE_BUFFER.
 *
 * Returns 0 on success, -ENOMEDIUM if @bs has no driver, -ENOMEM if the
 * bounce buffer cannot be allocated, or the negative result of a failed
 * driver read or write.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector local_qiov;
    int64_t cluster_offset;
    int64_t cluster_bytes;
    size_t skip_bytes;
    int ret;
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    /* Number of bytes already delivered to the caller's qiov */
    unsigned int progress = 0;

    if (!drv) {
        return -ENOMEDIUM;
    }

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * the copy-on-read code doesn't have its own BdrvChild, however, for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.  Note that this value may exceed
     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
     * is one reason we loop rather than doing it all at once.
     */
    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
    /* Bytes between the start of the rounded range and the caller's offset;
     * they are read and written back but not copied into qiov. */
    skip_bytes = offset - cluster_offset;

    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   cluster_offset, cluster_bytes);

    /* The bounce buffer never needs to be larger than one loop iteration */
    bounce_buffer = qemu_try_blockalign(bs,
                                        MIN(MIN(max_transfer, cluster_bytes),
                                            MAX_BOUNCE_BUFFER));
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    while (cluster_bytes) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, cluster_offset,
                                MIN(cluster_bytes, max_transfer), &pnum);
        if (ret < 0) {
            /* Safe to treat errors in querying allocation as if
             * unallocated; we'll probably fail again soon on the
             * read, but at least that will set a decent errno.
             */
            pnum = MIN(cluster_bytes, max_transfer);
        }

        /* Stop at EOF if the image ends in the middle of the cluster */
        if (ret == 0 && pnum == 0) {
            assert(progress >= bytes);
            break;
        }

        /* The piece must extend past the head that is being skipped */
        assert(skip_bytes < pnum);

        if (ret <= 0) {
            /* Must copy-on-read; use the bounce buffer */
            iov.iov_base = bounce_buffer;
            iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
            qemu_iovec_init_external(&local_qiov, &iov, 1);

            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
                                     &local_qiov, 0);
            if (ret < 0) {
                goto err;
            }

            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
            if (drv->bdrv_co_pwrite_zeroes &&
                buffer_is_zero(bounce_buffer, pnum)) {
                /* FIXME: Should we (perhaps conditionally) be setting
                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                 * that still correctly reads as zero? */
                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
                                               BDRV_REQ_WRITE_UNCHANGED);
            } else {
                /* This does not change the data on the disk, it is not
                 * necessary to flush even in cache=writethrough mode.
                 */
                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
                                          &local_qiov,
                                          BDRV_REQ_WRITE_UNCHANGED);
            }

            if (ret < 0) {
                /* It might be okay to ignore write errors for guest
                 * requests.  If this is a deliberate copy-on-read
                 * then we don't want to ignore the error.  Simply
                 * report it in all cases.
                 */
                goto err;
            }

            /* Hand the requested part of the bounce buffer to the caller */
            qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
                                pnum - skip_bytes);
        } else {
            /* Read directly into the destination */
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
            ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
                                     &local_qiov, 0);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                goto err;
            }
        }

        cluster_offset += pnum;
        cluster_bytes -= pnum;
        progress += pnum - skip_bytes;
        /* Only the first piece has a head to skip */
        skip_bytes = 0;
    }
    ret = 0;

err:
    /* Also reached on success; releases the bounce buffer in every case */
    qemu_vfree(bounce_buffer);
    return ret;
}
1296
/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
 *
 * @align must be a power of two, and @offset and @bytes must both be
 * multiples of it (asserted below).  Only BDRV_REQ_NO_SERIALISING and
 * BDRV_REQ_COPY_ON_READ are accepted in @flags.
 *
 * Returns 0 on success, or a negative error code.
 */
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    int64_t total_bytes, max_bytes;
    int ret = 0;
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    /* Largest single driver request, rounded down to a multiple of align */
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    /* TODO: We would need a per-BDS .supported_read_flags and
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers.  For now, there aren't any
     * passthrough flags.  */
    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    /* BDRV_REQ_SERIALISING is only for write operation */
    assert(!(flags & BDRV_REQ_SERIALISING));

    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
        wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
        if (ret < 0) {
            goto out;
        }

        /* Take the copy-on-read path unless the whole range is allocated */
        if (!ret || pnum != bytes) {
            ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver, possibly fragmenting it */
    total_bytes = bdrv_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
    }

    /* Bytes from offset up to the end of the image, rounded up to align;
     * zero when offset is at or beyond the end. */
    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
    if (bytes <= max_bytes && bytes <= max_transfer) {
        /* Fast path: fits in one driver request and does not cross EOF */
        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
        goto out;
    }

    while (bytes_remaining) {
        int num;

        if (max_bytes) {
            QEMUIOVector local_qiov;

            /* Read the next fragment that still lies before EOF */
            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            assert(num);
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
                                     num, &local_qiov, 0);
            max_bytes -= num;
            qemu_iovec_destroy(&local_qiov);
        } else {
            /* Nothing left before EOF: fill the rest of qiov with zeroes */
            num = bytes_remaining;
            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
                                    bytes_remaining);
        }
        if (ret < 0) {
            goto out;
        }
        bytes_remaining -= num;
    }

out:
    /* Non-negative intermediate results are normalised to 0 */
    return ret < 0 ? ret : 0;
}
1399
1400 /*
1401  * Handle a read request in coroutine context
1402  */
1403 int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1404     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1405     BdrvRequestFlags flags)
1406 {
1407     BlockDriverState *bs = child->bs;
1408     BlockDriver *drv = bs->drv;
1409     BdrvTrackedRequest req;
1410
1411     uint64_t align = bs->bl.request_alignment;
1412     uint8_t *head_buf = NULL;
1413     uint8_t *tail_buf = NULL;
1414     QEMUIOVector local_qiov;
1415     bool use_local_qiov = false;
1416     int ret;
1417
1418     trace_bdrv_co_preadv(child->bs, offset, bytes, flags);
1419
1420     if (!drv) {
1421         return -ENOMEDIUM;
1422     }
1423
1424     ret = bdrv_check_byte_request(bs, offset, bytes);
1425     if (ret < 0) {
1426         return ret;
1427     }
1428
1429     bdrv_inc_in_flight(bs);
1430
1431     /* Don't do copy-on-read if we read data before write operation */
1432     if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
1433         flags |= BDRV_REQ_COPY_ON_READ;
1434     }
1435
1436     /* Align read if necessary by padding qiov */
1437     if (offset & (align - 1)) {
1438         head_buf = qemu_blockalign(bs, align);
1439         qemu_iovec_init(&local_qiov, qiov->niov + 2);
1440         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1441         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1442         use_local_qiov = true;
1443
1444         bytes += offset & (align - 1);
1445         offset = offset & ~(align - 1);
1446     }
1447
1448     if ((offset + bytes) & (align - 1)) {
1449         if (!use_local_qiov) {
1450             qemu_iovec_init(&local_qiov, qiov->niov + 1);
1451             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1452             use_local_qiov = true;
1453         }
1454         tail_buf = qemu_blockalign(bs, align);
1455         qemu_iovec_add(&local_qiov, tail_buf,
1456                        align - ((offset + bytes) & (align - 1)));
1457
1458         bytes = ROUND_UP(bytes, align);
1459     }
1460
1461     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1462     ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
1463                               use_local_qiov ? &local_qiov : qiov,
1464                               flags);
1465     tracked_request_end(&req);
1466     bdrv_dec_in_flight(bs);
1467
1468     if (use_local_qiov) {
1469         qemu_iovec_destroy(&local_qiov);
1470         qemu_vfree(head_buf);
1471         qemu_vfree(tail_buf);
1472     }
1473
1474     return ret;
1475 }
1476
/*
 * Write @bytes zero bytes at @offset of @bs.
 *
 * Each loop iteration first tries the driver's bdrv_co_pwrite_zeroes
 * callback, if there is one.  When the callback is missing or returns
 * -ENOTSUP, the chunk is written through bdrv_driver_pwritev() from a
 * zero-filled bounce buffer instead.  When BDRV_REQ_FUA is requested but
 * not supported for the path taken, a single bdrv_co_flush() is issued at
 * the end instead.
 *
 * Returns 0 on success, -ENOMEDIUM if @bs has no driver, -ENOMEM if the
 * bounce buffer cannot be allocated, otherwise the first non-zero result
 * from the driver or from the final flush.
 */
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;
    bool need_flush = false;
    int head = 0;
    int tail = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);

    if (!drv) {
        return -ENOMEDIUM;
    }

    assert(alignment % bs->bl.request_alignment == 0);
    /* Misalignment of the request's start and end relative to alignment */
    head = offset % alignment;
    tail = (offset + bytes) % alignment;
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);

    while (bytes > 0 && !ret) {
        int num = bytes;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
         */
        if (head) {
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes.  */
            num = MIN(MIN(bytes, max_transfer), alignment - head);
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector.  */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            /* FUA was requested but is not a supported zero flag: remember
             * to flush once at the end */
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwritev() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            iov.iov_len = num;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);

            /* Keep bounce buffer around if it is big enough for
             * all future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        offset += num;
        bytes -= num;
    }

fail:
    /* Also reached on success: do the deferred flush, free the buffer */
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(iov.iov_base);
    return ret;
}
1585
1586 static inline int coroutine_fn
1587 bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
1588                           BdrvTrackedRequest *req, int flags)
1589 {
1590     BlockDriverState *bs = child->bs;
1591     bool waited;
1592     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1593
1594     if (bs->read_only) {
1595         return -EPERM;
1596     }
1597
1598     /* BDRV_REQ_NO_SERIALISING is only for read operation */
1599     assert(!(flags & BDRV_REQ_NO_SERIALISING));
1600     assert(!(bs->open_flags & BDRV_O_INACTIVE));
1601     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1602     assert(!(flags & ~BDRV_REQ_MASK));
1603
1604     if (flags & BDRV_REQ_SERIALISING) {
1605         mark_request_serialising(req, bdrv_get_cluster_size(bs));
1606     }
1607
1608     waited = wait_serialising_requests(req);
1609
1610     assert(!waited || !req->serialising ||
1611            is_request_serialising_and_aligned(req));
1612     assert(req->overlap_offset <= offset);
1613     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1614     assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
1615
1616     switch (req->type) {
1617     case BDRV_TRACKED_WRITE:
1618     case BDRV_TRACKED_DISCARD:
1619         if (flags & BDRV_REQ_WRITE_UNCHANGED) {
1620             assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1621         } else {
1622             assert(child->perm & BLK_PERM_WRITE);
1623         }
1624         return notifier_with_return_list_notify(&bs->before_write_notifiers,
1625                                                 req);
1626     case BDRV_TRACKED_TRUNCATE:
1627         assert(child->perm & BLK_PERM_RESIZE);
1628         return 0;
1629     default:
1630         abort();
1631     }
1632 }
1633
1634 static inline void coroutine_fn
1635 bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes,
1636                          BdrvTrackedRequest *req, int ret)
1637 {
1638     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1639     BlockDriverState *bs = child->bs;
1640
1641     atomic_inc(&bs->write_gen);
1642
1643     /*
1644      * Discard cannot extend the image, but in error handling cases, such as
1645      * when reverting a qcow2 cluster allocation, the discarded range can pass
1646      * the end of image file, so we cannot assert about BDRV_TRACKED_DISCARD
1647      * here. Instead, just skip it, since semantically a discard request
1648      * beyond EOF cannot expand the image anyway.
1649      */
1650     if (ret == 0 &&
1651         (req->type == BDRV_TRACKED_TRUNCATE ||
1652          end_sector > bs->total_sectors) &&
1653         req->type != BDRV_TRACKED_DISCARD) {
1654         bs->total_sectors = end_sector;
1655         bdrv_parent_cb_resize(bs);
1656         bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
1657     }
1658     if (req->bytes) {
1659         switch (req->type) {
1660         case BDRV_TRACKED_WRITE:
1661             stat64_max(&bs->wr_highest_offset, offset + bytes);
1662             /* fall through, to set dirty bits */
1663         case BDRV_TRACKED_DISCARD:
1664             bdrv_set_dirty(bs, offset, bytes);
1665             break;
1666         default:
1667             break;
1668         }
1669     }
1670 }
1671
/*
 * Forwards an already correctly aligned write request to the BlockDriver,
 * after possibly fragmenting it.
 *
 * @align must be a power of two, and @offset and @bytes must both be
 * multiples of it (asserted below).  @qiov may be NULL only for requests
 * that carry BDRV_REQ_ZERO_WRITE.
 *
 * Returns 0 on success, -ENOMEDIUM if the node has no driver, -EPERM if it
 * has read-only bitmaps, or another negative error code from request
 * preparation or from the driver.
 */
static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    int ret;

    uint64_t bytes_remaining = bytes;
    int max_transfer;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    /* Largest single driver request, rounded down to a multiple of align */
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);

    /* Zero detection: turn an all-zero data write into a zero write when
     * detect-zeroes is enabled and the driver has a write-zeroes callback */
    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
    } else if (bytes <= max_transfer) {
        /* Fits in a single driver request */
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
    } else {
        /* Too large for one request: split into max_transfer-sized pieces */
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        while (bytes_remaining) {
            int num = MIN(bytes_remaining, max_transfer);
            QEMUIOVector local_qiov;
            int local_flags = flags;

            assert(num);
            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* If FUA is going to be emulated by flush, we only
                 * need to flush on the last iteration */
                local_flags &= ~BDRV_REQ_FUA;
            }
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
                                      num, &local_qiov, local_flags);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                break;
            }
            bytes_remaining -= num;
        }
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    /* Normalise any non-negative result to 0 before the bookkeeping */
    if (ret >= 0) {
        ret = 0;
    }
    bdrv_co_write_req_finish(child, offset, bytes, req, ret);

    return ret;
}
1758
1759 static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
1760                                                 int64_t offset,
1761                                                 unsigned int bytes,
1762                                                 BdrvRequestFlags flags,
1763                                                 BdrvTrackedRequest *req)
1764 {
1765     BlockDriverState *bs = child->bs;
1766     uint8_t *buf = NULL;
1767     QEMUIOVector local_qiov;
1768     struct iovec iov;
1769     uint64_t align = bs->bl.request_alignment;
1770     unsigned int head_padding_bytes, tail_padding_bytes;
1771     int ret = 0;
1772
1773     head_padding_bytes = offset & (align - 1);
1774     tail_padding_bytes = (align - (offset + bytes)) & (align - 1);
1775
1776
1777     assert(flags & BDRV_REQ_ZERO_WRITE);
1778     if (head_padding_bytes || tail_padding_bytes) {
1779         buf = qemu_blockalign(bs, align);
1780         iov = (struct iovec) {
1781             .iov_base   = buf,
1782             .iov_len    = align,
1783         };
1784         qemu_iovec_init_external(&local_qiov, &iov, 1);
1785     }
1786     if (head_padding_bytes) {
1787         uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1788
1789         /* RMW the unaligned part before head. */
1790         mark_request_serialising(req, align);
1791         wait_serialising_requests(req);
1792         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1793         ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
1794                                   align, &local_qiov, 0);
1795         if (ret < 0) {
1796             goto fail;
1797         }
1798         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1799
1800         memset(buf + head_padding_bytes, 0, zero_bytes);
1801         ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
1802                                    align, &local_qiov,
1803                                    flags & ~BDRV_REQ_ZERO_WRITE);
1804         if (ret < 0) {
1805             goto fail;
1806         }
1807         offset += zero_bytes;
1808         bytes -= zero_bytes;
1809     }
1810
1811     assert(!bytes || (offset & (align - 1)) == 0);
1812     if (bytes >= align) {
1813         /* Write the aligned part in the middle. */
1814         uint64_t aligned_bytes = bytes & ~(align - 1);
1815         ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
1816                                    NULL, flags);
1817         if (ret < 0) {
1818             goto fail;
1819         }
1820         bytes -= aligned_bytes;
1821         offset += aligned_bytes;
1822     }
1823
1824     assert(!bytes || (offset & (align - 1)) == 0);
1825     if (bytes) {
1826         assert(align == tail_padding_bytes + bytes);
1827         /* RMW the unaligned part after tail. */
1828         mark_request_serialising(req, align);
1829         wait_serialising_requests(req);
1830         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1831         ret = bdrv_aligned_preadv(child, req, offset, align,
1832                                   align, &local_qiov, 0);
1833         if (ret < 0) {
1834             goto fail;
1835         }
1836         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1837
1838         memset(buf, 0, bytes);
1839         ret = bdrv_aligned_pwritev(child, req, offset, align, align,
1840                                    &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1841     }
1842 fail:
1843     qemu_vfree(buf);
1844     return ret;
1845
1846 }
1847
1848 /*
1849  * Handle a write request in coroutine context
1850  */
1851 int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
1852     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1853     BdrvRequestFlags flags)
1854 {
1855     BlockDriverState *bs = child->bs;
1856     BdrvTrackedRequest req;
1857     uint64_t align = bs->bl.request_alignment;
1858     uint8_t *head_buf = NULL;
1859     uint8_t *tail_buf = NULL;
1860     QEMUIOVector local_qiov;
1861     bool use_local_qiov = false;
1862     int ret;
1863
1864     trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);
1865
1866     if (!bs->drv) {
1867         return -ENOMEDIUM;
1868     }
1869
1870     ret = bdrv_check_byte_request(bs, offset, bytes);
1871     if (ret < 0) {
1872         return ret;
1873     }
1874
1875     bdrv_inc_in_flight(bs);
1876     /*
1877      * Align write if necessary by performing a read-modify-write cycle.
1878      * Pad qiov with the read parts and be sure to have a tracked request not
1879      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1880      */
1881     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
1882
1883     if (flags & BDRV_REQ_ZERO_WRITE) {
1884         ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
1885         goto out;
1886     }
1887
1888     if (offset & (align - 1)) {
1889         QEMUIOVector head_qiov;
1890         struct iovec head_iov;
1891
1892         mark_request_serialising(&req, align);
1893         wait_serialising_requests(&req);
1894
1895         head_buf = qemu_blockalign(bs, align);
1896         head_iov = (struct iovec) {
1897             .iov_base   = head_buf,
1898             .iov_len    = align,
1899         };
1900         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1901
1902         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1903         ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
1904                                   align, &head_qiov, 0);
1905         if (ret < 0) {
1906             goto fail;
1907         }
1908         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1909
1910         qemu_iovec_init(&local_qiov, qiov->niov + 2);
1911         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1912         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1913         use_local_qiov = true;
1914
1915         bytes += offset & (align - 1);
1916         offset = offset & ~(align - 1);
1917
1918         /* We have read the tail already if the request is smaller
1919          * than one aligned block.
1920          */
1921         if (bytes < align) {
1922             qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
1923             bytes = align;
1924         }
1925     }
1926
1927     if ((offset + bytes) & (align - 1)) {
1928         QEMUIOVector tail_qiov;
1929         struct iovec tail_iov;
1930         size_t tail_bytes;
1931         bool waited;
1932
1933         mark_request_serialising(&req, align);
1934         waited = wait_serialising_requests(&req);
1935         assert(!waited || !use_local_qiov);
1936
1937         tail_buf = qemu_blockalign(bs, align);
1938         tail_iov = (struct iovec) {
1939             .iov_base   = tail_buf,
1940             .iov_len    = align,
1941         };
1942         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1943
1944         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1945         ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
1946                                   align, align, &tail_qiov, 0);
1947         if (ret < 0) {
1948             goto fail;
1949         }
1950         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1951
1952         if (!use_local_qiov) {
1953             qemu_iovec_init(&local_qiov, qiov->niov + 1);
1954             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1955             use_local_qiov = true;
1956         }
1957
1958         tail_bytes = (offset + bytes) & (align - 1);
1959         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1960
1961         bytes = ROUND_UP(bytes, align);
1962     }
1963
1964     ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
1965                                use_local_qiov ? &local_qiov : qiov,
1966                                flags);
1967
1968 fail:
1969
1970     if (use_local_qiov) {
1971         qemu_iovec_destroy(&local_qiov);
1972     }
1973     qemu_vfree(head_buf);
1974     qemu_vfree(tail_buf);
1975 out:
1976     tracked_request_end(&req);
1977     bdrv_dec_in_flight(bs);
1978     return ret;
1979 }
1980
1981 int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
1982                                        int bytes, BdrvRequestFlags flags)
1983 {
1984     trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
1985
1986     if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
1987         flags &= ~BDRV_REQ_MAY_UNMAP;
1988     }
1989
1990     return bdrv_co_pwritev(child, offset, bytes, NULL,
1991                            BDRV_REQ_ZERO_WRITE | flags);
1992 }
1993
1994 /*
1995  * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not.
1996  */
1997 int bdrv_flush_all(void)
1998 {
1999     BdrvNextIterator it;
2000     BlockDriverState *bs = NULL;
2001     int result = 0;
2002
2003     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
2004         AioContext *aio_context = bdrv_get_aio_context(bs);
2005         int ret;
2006
2007         aio_context_acquire(aio_context);
2008         ret = bdrv_flush(bs);
2009         if (ret < 0 && !result) {
2010             result = ret;
2011         }
2012         aio_context_release(aio_context);
2013     }
2014
2015     return result;
2016 }
2017
2018
2019 typedef struct BdrvCoBlockStatusData {
2020     BlockDriverState *bs;
2021     BlockDriverState *base;
2022     bool want_zero;
2023     int64_t offset;
2024     int64_t bytes;
2025     int64_t *pnum;
2026     int64_t *map;
2027     BlockDriverState **file;
2028     int ret;
2029     bool done;
2030 } BdrvCoBlockStatusData;
2031
2032 int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
2033                                                 bool want_zero,
2034                                                 int64_t offset,
2035                                                 int64_t bytes,
2036                                                 int64_t *pnum,
2037                                                 int64_t *map,
2038                                                 BlockDriverState **file)
2039 {
2040     assert(bs->file && bs->file->bs);
2041     *pnum = bytes;
2042     *map = offset;
2043     *file = bs->file->bs;
2044     return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2045 }
2046
2047 int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
2048                                                    bool want_zero,
2049                                                    int64_t offset,
2050                                                    int64_t bytes,
2051                                                    int64_t *pnum,
2052                                                    int64_t *map,
2053                                                    BlockDriverState **file)
2054 {
2055     assert(bs->backing && bs->backing->bs);
2056     *pnum = bytes;
2057     *map = offset;
2058     *file = bs->backing->bs;
2059     return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2060 }
2061
2062 /*
2063  * Returns the allocation status of the specified sectors.
2064  * Drivers not implementing the functionality are assumed to not support
2065  * backing files, hence all their sectors are reported as allocated.
2066  *
2067  * If 'want_zero' is true, the caller is querying for mapping
2068  * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
2069  * _ZERO where possible; otherwise, the result favors larger 'pnum',
2070  * with a focus on accurate BDRV_BLOCK_ALLOCATED.
2071  *
2072  * If 'offset' is beyond the end of the disk image the return value is
2073  * BDRV_BLOCK_EOF and 'pnum' is set to 0.
2074  *
2075  * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
2076  * beyond the end of the disk image it will be clamped; if 'pnum' is set to
2077  * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
2078  *
2079  * 'pnum' is set to the number of bytes (including and immediately
2080  * following the specified offset) that are easily known to be in the
2081  * same allocated/unallocated state.  Note that a second call starting
2082  * at the original offset plus returned pnum may have the same status.
2083  * The returned value is non-zero on success except at end-of-file.
2084  *
2085  * Returns negative errno on failure.  Otherwise, if the
2086  * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
2087  * set to the host mapping and BDS corresponding to the guest offset.
2088  */
2089 static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
2090                                              bool want_zero,
2091                                              int64_t offset, int64_t bytes,
2092                                              int64_t *pnum, int64_t *map,
2093                                              BlockDriverState **file)
2094 {
2095     int64_t total_size;
2096     int64_t n; /* bytes */
2097     int ret;
2098     int64_t local_map = 0;
2099     BlockDriverState *local_file = NULL;
2100     int64_t aligned_offset, aligned_bytes;
2101     uint32_t align;
2102
2103     assert(pnum);
2104     *pnum = 0;
2105     total_size = bdrv_getlength(bs);
2106     if (total_size < 0) {
2107         ret = total_size;
2108         goto early_out;
2109     }
2110
2111     if (offset >= total_size) {
2112         ret = BDRV_BLOCK_EOF;
2113         goto early_out;
2114     }
2115     if (!bytes) {
2116         ret = 0;
2117         goto early_out;
2118     }
2119
2120     n = total_size - offset;
2121     if (n < bytes) {
2122         bytes = n;
2123     }
2124
2125     /* Must be non-NULL or bdrv_getlength() would have failed */
2126     assert(bs->drv);
2127     if (!bs->drv->bdrv_co_block_status) {
2128         *pnum = bytes;
2129         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
2130         if (offset + bytes == total_size) {
2131             ret |= BDRV_BLOCK_EOF;
2132         }
2133         if (bs->drv->protocol_name) {
2134             ret |= BDRV_BLOCK_OFFSET_VALID;
2135             local_map = offset;
2136             local_file = bs;
2137         }
2138         goto early_out;
2139     }
2140
2141     bdrv_inc_in_flight(bs);
2142
2143     /* Round out to request_alignment boundaries */
2144     align = bs->bl.request_alignment;
2145     aligned_offset = QEMU_ALIGN_DOWN(offset, align);
2146     aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
2147
2148     ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
2149                                         aligned_bytes, pnum, &local_map,
2150                                         &local_file);
2151     if (ret < 0) {
2152         *pnum = 0;
2153         goto out;
2154     }
2155
2156     /*
2157      * The driver's result must be a non-zero multiple of request_alignment.
2158      * Clamp pnum and adjust map to original request.
2159      */
2160     assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
2161            align > offset - aligned_offset);
2162     *pnum -= offset - aligned_offset;
2163     if (*pnum > bytes) {
2164         *pnum = bytes;
2165     }
2166     if (ret & BDRV_BLOCK_OFFSET_VALID) {
2167         local_map += offset - aligned_offset;
2168     }
2169
2170     if (ret & BDRV_BLOCK_RAW) {
2171         assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
2172         ret = bdrv_co_block_status(local_file, want_zero, local_map,
2173                                    *pnum, pnum, &local_map, &local_file);
2174         goto out;
2175     }
2176
2177     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
2178         ret |= BDRV_BLOCK_ALLOCATED;
2179     } else if (want_zero) {
2180         if (bdrv_unallocated_blocks_are_zero(bs)) {
2181             ret |= BDRV_BLOCK_ZERO;
2182         } else if (bs->backing) {
2183             BlockDriverState *bs2 = bs->backing->bs;
2184             int64_t size2 = bdrv_getlength(bs2);
2185
2186             if (size2 >= 0 && offset >= size2) {
2187                 ret |= BDRV_BLOCK_ZERO;
2188             }
2189         }
2190     }
2191
2192     if (want_zero && local_file && local_file != bs &&
2193         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
2194         (ret & BDRV_BLOCK_OFFSET_VALID)) {
2195         int64_t file_pnum;
2196         int ret2;
2197
2198         ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
2199                                     *pnum, &file_pnum, NULL, NULL);
2200         if (ret2 >= 0) {
2201             /* Ignore errors.  This is just providing extra information, it
2202              * is useful but not necessary.
2203              */
2204             if (ret2 & BDRV_BLOCK_EOF &&
2205                 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
2206                 /*
2207                  * It is valid for the format block driver to read
2208                  * beyond the end of the underlying file's current
2209                  * size; such areas read as zero.
2210                  */
2211                 ret |= BDRV_BLOCK_ZERO;
2212             } else {
2213                 /* Limit request to the range reported by the protocol driver */
2214                 *pnum = file_pnum;
2215                 ret |= (ret2 & BDRV_BLOCK_ZERO);
2216             }
2217         }
2218     }
2219
2220 out:
2221     bdrv_dec_in_flight(bs);
2222     if (ret >= 0 && offset + *pnum == total_size) {
2223         ret |= BDRV_BLOCK_EOF;
2224     }
2225 early_out:
2226     if (file) {
2227         *file = local_file;
2228     }
2229     if (map) {
2230         *map = local_map;
2231     }
2232     return ret;
2233 }
2234
2235 static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
2236                                                    BlockDriverState *base,
2237                                                    bool want_zero,
2238                                                    int64_t offset,
2239                                                    int64_t bytes,
2240                                                    int64_t *pnum,
2241                                                    int64_t *map,
2242                                                    BlockDriverState **file)
2243 {
2244     BlockDriverState *p;
2245     int ret = 0;
2246     bool first = true;
2247
2248     assert(bs != base);
2249     for (p = bs; p != base; p = backing_bs(p)) {
2250         ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
2251                                    file);
2252         if (ret < 0) {
2253             break;
2254         }
2255         if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
2256             /*
2257              * Reading beyond the end of the file continues to read
2258              * zeroes, but we can only widen the result to the
2259              * unallocated length we learned from an earlier
2260              * iteration.
2261              */
2262             *pnum = bytes;
2263         }
2264         if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
2265             break;
2266         }
2267         /* [offset, pnum] unallocated on this layer, which could be only
2268          * the first part of [offset, bytes].  */
2269         bytes = MIN(bytes, *pnum);
2270         first = false;
2271     }
2272     return ret;
2273 }
2274
2275 /* Coroutine wrapper for bdrv_block_status_above() */
2276 static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
2277 {
2278     BdrvCoBlockStatusData *data = opaque;
2279
2280     data->ret = bdrv_co_block_status_above(data->bs, data->base,
2281                                            data->want_zero,
2282                                            data->offset, data->bytes,
2283                                            data->pnum, data->map, data->file);
2284     data->done = true;
2285 }
2286
2287 /*
2288  * Synchronous wrapper around bdrv_co_block_status_above().
2289  *
2290  * See bdrv_co_block_status_above() for details.
2291  */
2292 static int bdrv_common_block_status_above(BlockDriverState *bs,
2293                                           BlockDriverState *base,
2294                                           bool want_zero, int64_t offset,
2295                                           int64_t bytes, int64_t *pnum,
2296                                           int64_t *map,
2297                                           BlockDriverState **file)
2298 {
2299     Coroutine *co;
2300     BdrvCoBlockStatusData data = {
2301         .bs = bs,
2302         .base = base,
2303         .want_zero = want_zero,
2304         .offset = offset,
2305         .bytes = bytes,
2306         .pnum = pnum,
2307         .map = map,
2308         .file = file,
2309         .done = false,
2310     };
2311
2312     if (qemu_in_coroutine()) {
2313         /* Fast-path if already in coroutine context */
2314         bdrv_block_status_above_co_entry(&data);
2315     } else {
2316         co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
2317         bdrv_coroutine_enter(bs, co);
2318         BDRV_POLL_WHILE(bs, !data.done);
2319     }
2320     return data.ret;
2321 }
2322
2323 int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
2324                             int64_t offset, int64_t bytes, int64_t *pnum,
2325                             int64_t *map, BlockDriverState **file)
2326 {
2327     return bdrv_common_block_status_above(bs, base, true, offset, bytes,
2328                                           pnum, map, file);
2329 }
2330
2331 int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
2332                       int64_t *pnum, int64_t *map, BlockDriverState **file)
2333 {
2334     return bdrv_block_status_above(bs, backing_bs(bs),
2335                                    offset, bytes, pnum, map, file);
2336 }
2337
2338 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
2339                                    int64_t bytes, int64_t *pnum)
2340 {
2341     int ret;
2342     int64_t dummy;
2343
2344     ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
2345                                          bytes, pnum ? pnum : &dummy, NULL,
2346                                          NULL);
2347     if (ret < 0) {
2348         return ret;
2349     }
2350     return !!(ret & BDRV_BLOCK_ALLOCATED);
2351 }
2352
2353 /*
2354  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2355  *
2356  * Return true if (a prefix of) the given range is allocated in any image
2357  * between BASE and TOP (inclusive).  BASE can be NULL to check if the given
2358  * offset is allocated in any image of the chain.  Return false otherwise,
2359  * or negative errno on failure.
2360  *
2361  * 'pnum' is set to the number of bytes (including and immediately
2362  * following the specified offset) that are known to be in the same
2363  * allocated/unallocated state.  Note that a subsequent call starting
2364  * at 'offset + *pnum' may return the same allocation status (in other
2365  * words, the result is not necessarily the maximum possible range);
2366  * but 'pnum' will only be 0 when end of file is reached.
2367  *
2368  */
2369 int bdrv_is_allocated_above(BlockDriverState *top,
2370                             BlockDriverState *base,
2371                             int64_t offset, int64_t bytes, int64_t *pnum)
2372 {
2373     BlockDriverState *intermediate;
2374     int ret;
2375     int64_t n = bytes;
2376
2377     intermediate = top;
2378     while (intermediate && intermediate != base) {
2379         int64_t pnum_inter;
2380         int64_t size_inter;
2381
2382         ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
2383         if (ret < 0) {
2384             return ret;
2385         }
2386         if (ret) {
2387             *pnum = pnum_inter;
2388             return 1;
2389         }
2390
2391         size_inter = bdrv_getlength(intermediate);
2392         if (size_inter < 0) {
2393             return size_inter;
2394         }
2395         if (n > pnum_inter &&
2396             (intermediate == top || offset + pnum_inter < size_inter)) {
2397             n = pnum_inter;
2398         }
2399
2400         intermediate = backing_bs(intermediate);
2401     }
2402
2403     *pnum = n;
2404     return 0;
2405 }
2406
2407 typedef struct BdrvVmstateCo {
2408     BlockDriverState    *bs;
2409     QEMUIOVector        *qiov;
2410     int64_t             pos;
2411     bool                is_read;
2412     int                 ret;
2413 } BdrvVmstateCo;
2414
2415 static int coroutine_fn
2416 bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2417                    bool is_read)
2418 {
2419     BlockDriver *drv = bs->drv;
2420     int ret = -ENOTSUP;
2421
2422     bdrv_inc_in_flight(bs);
2423
2424     if (!drv) {
2425         ret = -ENOMEDIUM;
2426     } else if (drv->bdrv_load_vmstate) {
2427         if (is_read) {
2428             ret = drv->bdrv_load_vmstate(bs, qiov, pos);
2429         } else {
2430             ret = drv->bdrv_save_vmstate(bs, qiov, pos);
2431         }
2432     } else if (bs->file) {
2433         ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
2434     }
2435
2436     bdrv_dec_in_flight(bs);
2437     return ret;
2438 }
2439
2440 static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
2441 {
2442     BdrvVmstateCo *co = opaque;
2443     co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
2444 }
2445
2446 static inline int
2447 bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2448                 bool is_read)
2449 {
2450     if (qemu_in_coroutine()) {
2451         return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
2452     } else {
2453         BdrvVmstateCo data = {
2454             .bs         = bs,
2455             .qiov       = qiov,
2456             .pos        = pos,
2457             .is_read    = is_read,
2458             .ret        = -EINPROGRESS,
2459         };
2460         Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
2461
2462         bdrv_coroutine_enter(bs, co);
2463         BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
2464         return data.ret;
2465     }
2466 }
2467
2468 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2469                       int64_t pos, int size)
2470 {
2471     QEMUIOVector qiov;
2472     struct iovec iov = {
2473         .iov_base   = (void *) buf,
2474         .iov_len    = size,
2475     };
2476     int ret;
2477
2478     qemu_iovec_init_external(&qiov, &iov, 1);
2479
2480     ret = bdrv_writev_vmstate(bs, &qiov, pos);
2481     if (ret < 0) {
2482         return ret;
2483     }
2484
2485     return size;
2486 }
2487
2488 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2489 {
2490     return bdrv_rw_vmstate(bs, qiov, pos, false);
2491 }
2492
2493 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2494                       int64_t pos, int size)
2495 {
2496     QEMUIOVector qiov;
2497     struct iovec iov = {
2498         .iov_base   = buf,
2499         .iov_len    = size,
2500     };
2501     int ret;
2502
2503     qemu_iovec_init_external(&qiov, &iov, 1);
2504     ret = bdrv_readv_vmstate(bs, &qiov, pos);
2505     if (ret < 0) {
2506         return ret;
2507     }
2508
2509     return size;
2510 }
2511
2512 int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2513 {
2514     return bdrv_rw_vmstate(bs, qiov, pos, true);
2515 }
2516
2517 /**************************************************************/
2518 /* async I/Os */
2519
2520 void bdrv_aio_cancel(BlockAIOCB *acb)
2521 {
2522     qemu_aio_ref(acb);
2523     bdrv_aio_cancel_async(acb);
2524     while (acb->refcnt > 1) {
2525         if (acb->aiocb_info->get_aio_context) {
2526             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2527         } else if (acb->bs) {
2528             /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
2529              * assert that we're not using an I/O thread.  Thread-safe
2530              * code should use bdrv_aio_cancel_async exclusively.
2531              */
2532             assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
2533             aio_poll(bdrv_get_aio_context(acb->bs), true);
2534         } else {
2535             abort();
2536         }
2537     }
2538     qemu_aio_unref(acb);
2539 }
2540
2541 /* Async version of aio cancel. The caller is not blocked if the acb implements
2542  * cancel_async, otherwise we do nothing and let the request normally complete.
2543  * In either case the completion callback must be called. */
2544 void bdrv_aio_cancel_async(BlockAIOCB *acb)
2545 {
2546     if (acb->aiocb_info->cancel_async) {
2547         acb->aiocb_info->cancel_async(acb);
2548     }
2549 }
2550
2551 /**************************************************************/
2552 /* Coroutine block device emulation */
2553
2554 typedef struct FlushCo {
2555     BlockDriverState *bs;
2556     int ret;
2557 } FlushCo;
2558
2559
2560 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2561 {
2562     FlushCo *rwco = opaque;
2563
2564     rwco->ret = bdrv_co_flush(rwco->bs);
2565 }
2566
2567 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2568 {
2569     int current_gen;
2570     int ret = 0;
2571
2572     bdrv_inc_in_flight(bs);
2573
2574     if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2575         bdrv_is_sg(bs)) {
2576         goto early_exit;
2577     }
2578
2579     qemu_co_mutex_lock(&bs->reqs_lock);
2580     current_gen = atomic_read(&bs->write_gen);
2581
2582     /* Wait until any previous flushes are completed */
2583     while (bs->active_flush_req) {
2584         qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
2585     }
2586
2587     /* Flushes reach this point in nondecreasing current_gen order.  */
2588     bs->active_flush_req = true;
2589     qemu_co_mutex_unlock(&bs->reqs_lock);
2590
2591     /* Write back all layers by calling one driver function */
2592     if (bs->drv->bdrv_co_flush) {
2593         ret = bs->drv->bdrv_co_flush(bs);
2594         goto out;
2595     }
2596
2597     /* Write back cached data to the OS even with cache=unsafe */
2598     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2599     if (bs->drv->bdrv_co_flush_to_os) {
2600         ret = bs->drv->bdrv_co_flush_to_os(bs);
2601         if (ret < 0) {
2602             goto out;
2603         }
2604     }
2605
2606     /* But don't actually force it to the disk with cache=unsafe */
2607     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2608         goto flush_parent;
2609     }
2610
2611     /* Check if we really need to flush anything */
2612     if (bs->flushed_gen == current_gen) {
2613         goto flush_parent;
2614     }
2615
2616     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2617     if (!bs->drv) {
2618         /* bs->drv->bdrv_co_flush() might have ejected the BDS
2619          * (even in case of apparent success) */
2620         ret = -ENOMEDIUM;
2621         goto out;
2622     }
2623     if (bs->drv->bdrv_co_flush_to_disk) {
2624         ret = bs->drv->bdrv_co_flush_to_disk(bs);
2625     } else if (bs->drv->bdrv_aio_flush) {
2626         BlockAIOCB *acb;
2627         CoroutineIOCompletion co = {
2628             .coroutine = qemu_coroutine_self(),
2629         };
2630
2631         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2632         if (acb == NULL) {
2633             ret = -EIO;
2634         } else {
2635             qemu_coroutine_yield();
2636             ret = co.ret;
2637         }
2638     } else {
2639         /*
2640          * Some block drivers always operate in either writethrough or unsafe
2641          * mode and don't support bdrv_flush therefore. Usually qemu doesn't
2642          * know how the server works (because the behaviour is hardcoded or
2643          * depends on server-side configuration), so we can't ensure that
2644          * everything is safe on disk. Returning an error doesn't work because
2645          * that would break guests even if the server operates in writethrough
2646          * mode.
2647          *
2648          * Let's hope the user knows what he's doing.
2649          */
2650         ret = 0;
2651     }
2652
2653     if (ret < 0) {
2654         goto out;
2655     }
2656
2657     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2658      * in the case of cache=unsafe, so there are no useless flushes.
2659      */
2660 flush_parent:
2661     ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2662 out:
2663     /* Notify any pending flushes that we have completed */
2664     if (ret == 0) {
2665         bs->flushed_gen = current_gen;
2666     }
2667
2668     qemu_co_mutex_lock(&bs->reqs_lock);
2669     bs->active_flush_req = false;
2670     /* Return value is ignored - it's ok if wait queue is empty */
2671     qemu_co_queue_next(&bs->flush_queue);
2672     qemu_co_mutex_unlock(&bs->reqs_lock);
2673
2674 early_exit:
2675     bdrv_dec_in_flight(bs);
2676     return ret;
2677 }
2678
2679 int bdrv_flush(BlockDriverState *bs)
2680 {
2681     Coroutine *co;
2682     FlushCo flush_co = {
2683         .bs = bs,
2684         .ret = NOT_DONE,
2685     };
2686
2687     if (qemu_in_coroutine()) {
2688         /* Fast-path if already in coroutine context */
2689         bdrv_flush_co_entry(&flush_co);
2690     } else {
2691         co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
2692         bdrv_coroutine_enter(bs, co);
2693         BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
2694     }
2695
2696     return flush_co.ret;
2697 }
2698
2699 typedef struct DiscardCo {
2700     BdrvChild *child;
2701     int64_t offset;
2702     int bytes;
2703     int ret;
2704 } DiscardCo;
2705 static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
2706 {
2707     DiscardCo *rwco = opaque;
2708
2709     rwco->ret = bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes);
2710 }
2711
2712 int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes)
2713 {
2714     BdrvTrackedRequest req;
2715     int max_pdiscard, ret;
2716     int head, tail, align;
2717     BlockDriverState *bs = child->bs;
2718
2719     if (!bs || !bs->drv) {
2720         return -ENOMEDIUM;
2721     }
2722
2723     if (bdrv_has_readonly_bitmaps(bs)) {
2724         return -EPERM;
2725     }
2726
2727     ret = bdrv_check_byte_request(bs, offset, bytes);
2728     if (ret < 0) {
2729         return ret;
2730     }
2731
2732     /* Do nothing if disabled.  */
2733     if (!(bs->open_flags & BDRV_O_UNMAP)) {
2734         return 0;
2735     }
2736
2737     if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
2738         return 0;
2739     }
2740
2741     /* Discard is advisory, but some devices track and coalesce
2742      * unaligned requests, so we must pass everything down rather than
2743      * round here.  Still, most devices will just silently ignore
2744      * unaligned requests (by returning -ENOTSUP), so we must fragment
2745      * the request accordingly.  */
2746     align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
2747     assert(align % bs->bl.request_alignment == 0);
2748     head = offset % align;
2749     tail = (offset + bytes) % align;
2750
2751     bdrv_inc_in_flight(bs);
2752     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
2753
2754     ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
2755     if (ret < 0) {
2756         goto out;
2757     }
2758
2759     max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
2760                                    align);
2761     assert(max_pdiscard >= bs->bl.request_alignment);
2762
2763     while (bytes > 0) {
2764         int num = bytes;
2765
2766         if (head) {
2767             /* Make small requests to get to alignment boundaries. */
2768             num = MIN(bytes, align - head);
2769             if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
2770                 num %= bs->bl.request_alignment;
2771             }
2772             head = (head + num) % align;
2773             assert(num < max_pdiscard);
2774         } else if (tail) {
2775             if (num > align) {
2776                 /* Shorten the request to the last aligned cluster.  */
2777                 num -= tail;
2778             } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
2779                        tail > bs->bl.request_alignment) {
2780                 tail %= bs->bl.request_alignment;
2781                 num -= tail;
2782             }
2783         }
2784         /* limit request size */
2785         if (num > max_pdiscard) {
2786             num = max_pdiscard;
2787         }
2788
2789         if (!bs->drv) {
2790             ret = -ENOMEDIUM;
2791             goto out;
2792         }
2793         if (bs->drv->bdrv_co_pdiscard) {
2794             ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
2795         } else {
2796             BlockAIOCB *acb;
2797             CoroutineIOCompletion co = {
2798                 .coroutine = qemu_coroutine_self(),
2799             };
2800
2801             acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
2802                                              bdrv_co_io_em_complete, &co);
2803             if (acb == NULL) {
2804                 ret = -EIO;
2805                 goto out;
2806             } else {
2807                 qemu_coroutine_yield();
2808                 ret = co.ret;
2809             }
2810         }
2811         if (ret && ret != -ENOTSUP) {
2812             goto out;
2813         }
2814
2815         offset += num;
2816         bytes -= num;
2817     }
2818     ret = 0;
2819 out:
2820     bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
2821     tracked_request_end(&req);
2822     bdrv_dec_in_flight(bs);
2823     return ret;
2824 }
2825
2826 int bdrv_pdiscard(BdrvChild *child, int64_t offset, int bytes)
2827 {
2828     Coroutine *co;
2829     DiscardCo rwco = {
2830         .child = child,
2831         .offset = offset,
2832         .bytes = bytes,
2833         .ret = NOT_DONE,
2834     };
2835
2836     if (qemu_in_coroutine()) {
2837         /* Fast-path if already in coroutine context */
2838         bdrv_pdiscard_co_entry(&rwco);
2839     } else {
2840         co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
2841         bdrv_coroutine_enter(child->bs, co);
2842         BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
2843     }
2844
2845     return rwco.ret;
2846 }
2847
2848 int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
2849 {
2850     BlockDriver *drv = bs->drv;
2851     CoroutineIOCompletion co = {
2852         .coroutine = qemu_coroutine_self(),
2853     };
2854     BlockAIOCB *acb;
2855
2856     bdrv_inc_in_flight(bs);
2857     if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
2858         co.ret = -ENOTSUP;
2859         goto out;
2860     }
2861
2862     if (drv->bdrv_co_ioctl) {
2863         co.ret = drv->bdrv_co_ioctl(bs, req, buf);
2864     } else {
2865         acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2866         if (!acb) {
2867             co.ret = -ENOTSUP;
2868             goto out;
2869         }
2870         qemu_coroutine_yield();
2871     }
2872 out:
2873     bdrv_dec_in_flight(bs);
2874     return co.ret;
2875 }
2876
2877 void *qemu_blockalign(BlockDriverState *bs, size_t size)
2878 {
2879     return qemu_memalign(bdrv_opt_mem_align(bs), size);
2880 }
2881
2882 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2883 {
2884     return memset(qemu_blockalign(bs, size), 0, size);
2885 }
2886
2887 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2888 {
2889     size_t align = bdrv_opt_mem_align(bs);
2890
2891     /* Ensure that NULL is never returned on success */
2892     assert(align > 0);
2893     if (size == 0) {
2894         size = align;
2895     }
2896
2897     return qemu_try_memalign(align, size);
2898 }
2899
2900 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2901 {
2902     void *mem = qemu_try_blockalign(bs, size);
2903
2904     if (mem) {
2905         memset(mem, 0, size);
2906     }
2907
2908     return mem;
2909 }
2910
2911 /*
2912  * Check if all memory in this vector is sector aligned.
2913  */
2914 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2915 {
2916     int i;
2917     size_t alignment = bdrv_min_mem_align(bs);
2918
2919     for (i = 0; i < qiov->niov; i++) {
2920         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2921             return false;
2922         }
2923         if (qiov->iov[i].iov_len % alignment) {
2924             return false;
2925         }
2926     }
2927
2928     return true;
2929 }
2930
2931 void bdrv_add_before_write_notifier(BlockDriverState *bs,
2932                                     NotifierWithReturn *notifier)
2933 {
2934     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2935 }
2936
2937 void bdrv_io_plug(BlockDriverState *bs)
2938 {
2939     BdrvChild *child;
2940
2941     QLIST_FOREACH(child, &bs->children, next) {
2942         bdrv_io_plug(child->bs);
2943     }
2944
2945     if (atomic_fetch_inc(&bs->io_plugged) == 0) {
2946         BlockDriver *drv = bs->drv;
2947         if (drv && drv->bdrv_io_plug) {
2948             drv->bdrv_io_plug(bs);
2949         }
2950     }
2951 }
2952
2953 void bdrv_io_unplug(BlockDriverState *bs)
2954 {
2955     BdrvChild *child;
2956
2957     assert(bs->io_plugged);
2958     if (atomic_fetch_dec(&bs->io_plugged) == 1) {
2959         BlockDriver *drv = bs->drv;
2960         if (drv && drv->bdrv_io_unplug) {
2961             drv->bdrv_io_unplug(bs);
2962         }
2963     }
2964
2965     QLIST_FOREACH(child, &bs->children, next) {
2966         bdrv_io_unplug(child->bs);
2967     }
2968 }
2969
2970 void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
2971 {
2972     BdrvChild *child;
2973
2974     if (bs->drv && bs->drv->bdrv_register_buf) {
2975         bs->drv->bdrv_register_buf(bs, host, size);
2976     }
2977     QLIST_FOREACH(child, &bs->children, next) {
2978         bdrv_register_buf(child->bs, host, size);
2979     }
2980 }
2981
2982 void bdrv_unregister_buf(BlockDriverState *bs, void *host)
2983 {
2984     BdrvChild *child;
2985
2986     if (bs->drv && bs->drv->bdrv_unregister_buf) {
2987         bs->drv->bdrv_unregister_buf(bs, host);
2988     }
2989     QLIST_FOREACH(child, &bs->children, next) {
2990         bdrv_unregister_buf(child->bs, host);
2991     }
2992 }
2993
2994 static int coroutine_fn bdrv_co_copy_range_internal(
2995         BdrvChild *src, uint64_t src_offset, BdrvChild *dst,
2996         uint64_t dst_offset, uint64_t bytes,
2997         BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
2998         bool recurse_src)
2999 {
3000     BdrvTrackedRequest req;
3001     int ret;
3002
3003     if (!dst || !dst->bs) {
3004         return -ENOMEDIUM;
3005     }
3006     ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
3007     if (ret) {
3008         return ret;
3009     }
3010     if (write_flags & BDRV_REQ_ZERO_WRITE) {
3011         return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
3012     }
3013
3014     if (!src || !src->bs) {
3015         return -ENOMEDIUM;
3016     }
3017     ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
3018     if (ret) {
3019         return ret;
3020     }
3021
3022     if (!src->bs->drv->bdrv_co_copy_range_from
3023         || !dst->bs->drv->bdrv_co_copy_range_to
3024         || src->bs->encrypted || dst->bs->encrypted) {
3025         return -ENOTSUP;
3026     }
3027
3028     if (recurse_src) {
3029         bdrv_inc_in_flight(src->bs);
3030         tracked_request_begin(&req, src->bs, src_offset, bytes,
3031                               BDRV_TRACKED_READ);
3032
3033         /* BDRV_REQ_SERIALISING is only for write operation */
3034         assert(!(read_flags & BDRV_REQ_SERIALISING));
3035         if (!(read_flags & BDRV_REQ_NO_SERIALISING)) {
3036             wait_serialising_requests(&req);
3037         }
3038
3039         ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
3040                                                     src, src_offset,
3041                                                     dst, dst_offset,
3042                                                     bytes,
3043                                                     read_flags, write_flags);
3044
3045         tracked_request_end(&req);
3046         bdrv_dec_in_flight(src->bs);
3047     } else {
3048         bdrv_inc_in_flight(dst->bs);
3049         tracked_request_begin(&req, dst->bs, dst_offset, bytes,
3050                               BDRV_TRACKED_WRITE);
3051         ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
3052                                         write_flags);
3053         if (!ret) {
3054             ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
3055                                                       src, src_offset,
3056                                                       dst, dst_offset,
3057                                                       bytes,
3058                                                       read_flags, write_flags);
3059         }
3060         bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
3061         tracked_request_end(&req);
3062         bdrv_dec_in_flight(dst->bs);
3063     }
3064
3065     return ret;
3066 }
3067
3068 /* Copy range from @src to @dst.
3069  *
3070  * See the comment of bdrv_co_copy_range for the parameter and return value
3071  * semantics. */
3072 int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
3073                                          BdrvChild *dst, uint64_t dst_offset,
3074                                          uint64_t bytes,
3075                                          BdrvRequestFlags read_flags,
3076                                          BdrvRequestFlags write_flags)
3077 {
3078     trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
3079                                   read_flags, write_flags);
3080     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3081                                        bytes, read_flags, write_flags, true);
3082 }
3083
3084 /* Copy range from @src to @dst.
3085  *
3086  * See the comment of bdrv_co_copy_range for the parameter and return value
3087  * semantics. */
3088 int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
3089                                        BdrvChild *dst, uint64_t dst_offset,
3090                                        uint64_t bytes,
3091                                        BdrvRequestFlags read_flags,
3092                                        BdrvRequestFlags write_flags)
3093 {
3094     trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3095                                 read_flags, write_flags);
3096     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3097                                        bytes, read_flags, write_flags, false);
3098 }
3099
3100 int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
3101                                     BdrvChild *dst, uint64_t dst_offset,
3102                                     uint64_t bytes, BdrvRequestFlags read_flags,
3103                                     BdrvRequestFlags write_flags)
3104 {
3105     return bdrv_co_copy_range_from(src, src_offset,
3106                                    dst, dst_offset,
3107                                    bytes, read_flags, write_flags);
3108 }
3109
3110 static void bdrv_parent_cb_resize(BlockDriverState *bs)
3111 {
3112     BdrvChild *c;
3113     QLIST_FOREACH(c, &bs->parents, next_parent) {
3114         if (c->role->resize) {
3115             c->role->resize(c);
3116         }
3117     }
3118 }
3119
3120 /**
3121  * Truncate file to 'offset' bytes (needed only for file protocols)
3122  */
3123 int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
3124                                   PreallocMode prealloc, Error **errp)
3125 {
3126     BlockDriverState *bs = child->bs;
3127     BlockDriver *drv = bs->drv;
3128     BdrvTrackedRequest req;
3129     int64_t old_size, new_bytes;
3130     int ret;
3131
3132
3133     /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
3134     if (!drv) {
3135         error_setg(errp, "No medium inserted");
3136         return -ENOMEDIUM;
3137     }
3138     if (offset < 0) {
3139         error_setg(errp, "Image size cannot be negative");
3140         return -EINVAL;
3141     }
3142
3143     old_size = bdrv_getlength(bs);
3144     if (old_size < 0) {
3145         error_setg_errno(errp, -old_size, "Failed to get old image size");
3146         return old_size;
3147     }
3148
3149     if (offset > old_size) {
3150         new_bytes = offset - old_size;
3151     } else {
3152         new_bytes = 0;
3153     }
3154
3155     bdrv_inc_in_flight(bs);
3156     tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
3157                           BDRV_TRACKED_TRUNCATE);
3158
3159     /* If we are growing the image and potentially using preallocation for the
3160      * new area, we need to make sure that no write requests are made to it
3161      * concurrently or they might be overwritten by preallocation. */
3162     if (new_bytes) {
3163         mark_request_serialising(&req, 1);
3164     }
3165     if (bs->read_only) {
3166         error_setg(errp, "Image is read-only");
3167         ret = -EACCES;
3168         goto out;
3169     }
3170     ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
3171                                     0);
3172     if (ret < 0) {
3173         error_setg_errno(errp, -ret,
3174                          "Failed to prepare request for truncation");
3175         goto out;
3176     }
3177
3178     if (!drv->bdrv_co_truncate) {
3179         if (bs->file && drv->is_filter) {
3180             ret = bdrv_co_truncate(bs->file, offset, prealloc, errp);
3181             goto out;
3182         }
3183         error_setg(errp, "Image format driver does not support resize");
3184         ret = -ENOTSUP;
3185         goto out;
3186     }
3187
3188     ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp);
3189     if (ret < 0) {
3190         goto out;
3191     }
3192     ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3193     if (ret < 0) {
3194         error_setg_errno(errp, -ret, "Could not refresh total sector count");
3195     } else {
3196         offset = bs->total_sectors * BDRV_SECTOR_SIZE;
3197     }
3198     /* It's possible that truncation succeeded but refresh_total_sectors
3199      * failed, but the latter doesn't affect how we should finish the request.
3200      * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */
3201     bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);
3202
3203 out:
3204     tracked_request_end(&req);
3205     bdrv_dec_in_flight(bs);
3206
3207     return ret;
3208 }
3209
3210 typedef struct TruncateCo {
3211     BdrvChild *child;
3212     int64_t offset;
3213     PreallocMode prealloc;
3214     Error **errp;
3215     int ret;
3216 } TruncateCo;
3217
3218 static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
3219 {
3220     TruncateCo *tco = opaque;
3221     tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->prealloc,
3222                                 tco->errp);
3223 }
3224
3225 int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc,
3226                   Error **errp)
3227 {
3228     Coroutine *co;
3229     TruncateCo tco = {
3230         .child      = child,
3231         .offset     = offset,
3232         .prealloc   = prealloc,
3233         .errp       = errp,
3234         .ret        = NOT_DONE,
3235     };
3236
3237     if (qemu_in_coroutine()) {
3238         /* Fast-path if already in coroutine context */
3239         bdrv_truncate_co_entry(&tco);
3240     } else {
3241         co = qemu_coroutine_create(bdrv_truncate_co_entry, &tco);
3242         qemu_coroutine_enter(co);
3243         BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE);
3244     }
3245
3246     return tco.ret;
3247 }