block/io.c

   1 /*
   2  * Block layer I/O functions
   3  *
   4  * Copyright (c) 2003 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "trace.h"
  26 #include "sysemu/qtest.h"
  27 #include "block/blockjob.h"
  28 #include "block/block_int.h"
  29
  30 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
  31
  32 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
  33         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
  34         BlockCompletionFunc *cb, void *opaque);
  35 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
  36         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
  37         BlockCompletionFunc *cb, void *opaque);
  38 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
  39                                          int64_t sector_num, int nb_sectors,
  40                                          QEMUIOVector *iov);
  41 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
  42                                          int64_t sector_num, int nb_sectors,
  43                                          QEMUIOVector *iov);
  44 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
  45     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
  46     BdrvRequestFlags flags);
  47 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
  48     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
  49     BdrvRequestFlags flags);
  50 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
  51                                          int64_t sector_num,
  52                                          QEMUIOVector *qiov,
  53                                          int nb_sectors,
  54                                          BdrvRequestFlags flags,
  55                                          BlockCompletionFunc *cb,
  56                                          void *opaque,
  57                                          bool is_write);
  58 static void coroutine_fn bdrv_co_do_rw(void *opaque);
  59 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
  60     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
  61
  62 /* throttling disk I/O limits */
  63 void bdrv_set_io_limits(BlockDriverState *bs,
  64                         ThrottleConfig *cfg)
  65 {
  66     int i;
  67
  68     throttle_config(&bs->throttle_state, cfg);
  69
  70     for (i = 0; i < 2; i++) {
  71         qemu_co_enter_next(&bs->throttled_reqs[i]);
  72     }
  73 }
  74
  75 /* this function drain all the throttled IOs */
  76 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
  77 {
  78     bool drained = false;
  79     bool enabled = bs->io_limits_enabled;
  80     int i;
  81
  82     bs->io_limits_enabled = false;
  83
  84     for (i = 0; i < 2; i++) {
  85         while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
  86             drained = true;
  87         }
  88     }
  89
  90     bs->io_limits_enabled = enabled;
  91
  92     return drained;
  93 }
  94
  95 void bdrv_io_limits_disable(BlockDriverState *bs)
  96 {
  97     bs->io_limits_enabled = false;
  98
  99     bdrv_start_throttled_reqs(bs);
 100
 101     throttle_destroy(&bs->throttle_state);
 102 }
 103
 104 static void bdrv_throttle_read_timer_cb(void *opaque)
 105 {
 106     BlockDriverState *bs = opaque;
 107     qemu_co_enter_next(&bs->throttled_reqs[0]);
 108 }
 109
 110 static void bdrv_throttle_write_timer_cb(void *opaque)
 111 {
 112     BlockDriverState *bs = opaque;
 113     qemu_co_enter_next(&bs->throttled_reqs[1]);
 114 }
 115
 116 /* should be called before bdrv_set_io_limits if a limit is set */
 117 void bdrv_io_limits_enable(BlockDriverState *bs)
 118 {
 119     int clock_type = QEMU_CLOCK_REALTIME;
 120
 121     if (qtest_enabled()) {
 122         /* For testing block IO throttling only */
 123         clock_type = QEMU_CLOCK_VIRTUAL;
 124     }
 125     assert(!bs->io_limits_enabled);
 126     throttle_init(&bs->throttle_state,
 127                   bdrv_get_aio_context(bs),
 128                   clock_type,
 129                   bdrv_throttle_read_timer_cb,
 130                   bdrv_throttle_write_timer_cb,
 131                   bs);
 132     bs->io_limits_enabled = true;
 133 }
 134
 135 /* This function makes an IO wait if needed
 136  *
 137  * @nb_sectors: the number of sectors of the IO
 138  * @is_write:   is the IO a write
 139  */
 140 static void bdrv_io_limits_intercept(BlockDriverState *bs,
 141                                      unsigned int bytes,
 142                                      bool is_write)
 143 {
 144     /* does this io must wait */
 145     bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
 146
 147     /* if must wait or any request of this type throttled queue the IO */
 148     if (must_wait ||
 149         !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
 150         qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
 151     }
 152
 153     /* the IO will be executed, do the accounting */
 154     throttle_account(&bs->throttle_state, is_write, bytes);
 155
 156
 157     /* if the next request must wait -> do nothing */
 158     if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
 159         return;
 160     }
 161
 162     /* else queue next request for execution */
 163     qemu_co_queue_next(&bs->throttled_reqs[is_write]);
 164 }
 165
 166 void bdrv_setup_io_funcs(BlockDriver *bdrv)
 167 {
 168     /* Block drivers without coroutine functions need emulation */
 169     if (!bdrv->bdrv_co_readv) {
 170         bdrv->bdrv_co_readv = bdrv_co_readv_em;
 171         bdrv->bdrv_co_writev = bdrv_co_writev_em;
 172
 173         /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
 174          * the block driver lacks aio we need to emulate that too.
 175          */
 176         if (!bdrv->bdrv_aio_readv) {
 177             /* add AIO emulation layer */
 178             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
 179             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
 180         }
 181     }
 182 }
 183
 184 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
 185 {
 186     BlockDriver *drv = bs->drv;
 187     Error *local_err = NULL;
 188
 189     memset(&bs->bl, 0, sizeof(bs->bl));
 190
 191     if (!drv) {
 192         return;
 193     }
 194
 195     /* Take some limits from the children as a default */
 196     if (bs->file) {
 197         bdrv_refresh_limits(bs->file, &local_err);
 198         if (local_err) {
 199             error_propagate(errp, local_err);
 200             return;
 201         }
 202         bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
 203         bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
 204         bs->bl.min_mem_alignment = bs->file->bl.min_mem_alignment;
 205         bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
 206     } else {
 207         bs->bl.min_mem_alignment = 512;
 208         bs->bl.opt_mem_alignment = getpagesize();
 209     }
 210
 211     if (bs->backing_hd) {
 212         bdrv_refresh_limits(bs->backing_hd, &local_err);
 213         if (local_err) {
 214             error_propagate(errp, local_err);
 215             return;
 216         }
 217         bs->bl.opt_transfer_length =
 218             MAX(bs->bl.opt_transfer_length,
 219                 bs->backing_hd->bl.opt_transfer_length);
 220         bs->bl.max_transfer_length =
 221             MIN_NON_ZERO(bs->bl.max_transfer_length,
 222                          bs->backing_hd->bl.max_transfer_length);
 223         bs->bl.opt_mem_alignment =
 224             MAX(bs->bl.opt_mem_alignment,
 225                 bs->backing_hd->bl.opt_mem_alignment);
 226         bs->bl.min_mem_alignment =
 227             MAX(bs->bl.min_mem_alignment,
 228                 bs->backing_hd->bl.min_mem_alignment);
 229     }
 230
 231     /* Then let the driver override it */
 232     if (drv->bdrv_refresh_limits) {
 233         drv->bdrv_refresh_limits(bs, errp);
 234     }
 235 }
 236
 237 /**
 238  * The copy-on-read flag is actually a reference count so multiple users may
 239  * use the feature without worrying about clobbering its previous state.
 240  * Copy-on-read stays enabled until all users have called to disable it.
 241  */
 242 void bdrv_enable_copy_on_read(BlockDriverState *bs)
 243 {
 244     bs->copy_on_read++;
 245 }
 246
 247 void bdrv_disable_copy_on_read(BlockDriverState *bs)
 248 {
 249     assert(bs->copy_on_read > 0);
 250     bs->copy_on_read--;
 251 }
 252
 253 /* Check if any requests are in-flight (including throttled requests) */
 254 static bool bdrv_requests_pending(BlockDriverState *bs)
 255 {
 256     if (!QLIST_EMPTY(&bs->tracked_requests)) {
 257         return true;
 258     }
 259     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
 260         return true;
 261     }
 262     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
 263         return true;
 264     }
 265     if (bs->file && bdrv_requests_pending(bs->file)) {
 266         return true;
 267     }
 268     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
 269         return true;
 270     }
 271     return false;
 272 }
 273
 274 static bool bdrv_drain_one(BlockDriverState *bs)
 275 {
 276     bool bs_busy;
 277
 278     bdrv_flush_io_queue(bs);
 279     bdrv_start_throttled_reqs(bs);
 280     bs_busy = bdrv_requests_pending(bs);
 281     bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
 282     return bs_busy;
 283 }
 284
 285 /*
 286  * Wait for pending requests to complete on a single BlockDriverState subtree
 287  *
 288  * See the warning in bdrv_drain_all().  This function can only be called if
 289  * you are sure nothing can generate I/O because you have op blockers
 290  * installed.
 291  *
 292  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 293  * AioContext.
 294  */
 295 void bdrv_drain(BlockDriverState *bs)
 296 {
 297     while (bdrv_drain_one(bs)) {
 298         /* Keep iterating */
 299     }
 300 }
 301
 302 /*
 303  * Wait for pending requests to complete across all BlockDriverStates
 304  *
 305  * This function does not flush data to disk, use bdrv_flush_all() for that
 306  * after calling this function.
 307  *
 308  * Note that completion of an asynchronous I/O operation can trigger any
 309  * number of other I/O operations on other devices---for example a coroutine
 310  * can be arbitrarily complex and a constant flow of I/O can come until the
 311  * coroutine is complete.  Because of this, it is not possible to have a
 312  * function to drain a single device's I/O queue.
 313  */
 314 void bdrv_drain_all(void)
 315 {
 316     /* Always run first iteration so any pending completion BHs run */
 317     bool busy = true;
 318     BlockDriverState *bs = NULL;
 319
 320     while ((bs = bdrv_next(bs))) {
 321         AioContext *aio_context = bdrv_get_aio_context(bs);
 322
 323         aio_context_acquire(aio_context);
 324         if (bs->job) {
 325             block_job_pause(bs->job);
 326         }
 327         aio_context_release(aio_context);
 328     }
 329
 330     while (busy) {
 331         busy = false;
 332         bs = NULL;
 333
 334         while ((bs = bdrv_next(bs))) {
 335             AioContext *aio_context = bdrv_get_aio_context(bs);
 336
 337             aio_context_acquire(aio_context);
 338             busy |= bdrv_drain_one(bs);
 339             aio_context_release(aio_context);
 340         }
 341     }
 342
 343     bs = NULL;
 344     while ((bs = bdrv_next(bs))) {
 345         AioContext *aio_context = bdrv_get_aio_context(bs);
 346
 347         aio_context_acquire(aio_context);
 348         if (bs->job) {
 349             block_job_resume(bs->job);
 350         }
 351         aio_context_release(aio_context);
 352     }
 353 }
 354
 355 /**
 356  * Remove an active request from the tracked requests list
 357  *
 358  * This function should be called when a tracked request is completing.
 359  */
 360 static void tracked_request_end(BdrvTrackedRequest *req)
 361 {
 362     if (req->serialising) {
 363         req->bs->serialising_in_flight--;
 364     }
 365
 366     QLIST_REMOVE(req, list);
 367     qemu_co_queue_restart_all(&req->wait_queue);
 368 }
 369
 370 /**
 371  * Add an active request to the tracked requests list
 372  */
 373 static void tracked_request_begin(BdrvTrackedRequest *req,
 374                                   BlockDriverState *bs,
 375                                   int64_t offset,
 376                                   unsigned int bytes, bool is_write)
 377 {
 378     *req = (BdrvTrackedRequest){
 379         .bs = bs,
 380         .offset         = offset,
 381         .bytes          = bytes,
 382         .is_write       = is_write,
 383         .co             = qemu_coroutine_self(),
 384         .serialising    = false,
 385         .overlap_offset = offset,
 386         .overlap_bytes  = bytes,
 387     };
 388
 389     qemu_co_queue_init(&req->wait_queue);
 390
 391     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
 392 }
 393
 394 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
 395 {
 396     int64_t overlap_offset = req->offset & ~(align - 1);
 397     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
 398                                - overlap_offset;
 399
 400     if (!req->serialising) {
 401         req->bs->serialising_in_flight++;
 402         req->serialising = true;
 403     }
 404
 405     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
 406     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
 407 }
 408
 409 /**
 410  * Round a region to cluster boundaries
 411  */
 412 void bdrv_round_to_clusters(BlockDriverState *bs,
 413                             int64_t sector_num, int nb_sectors,
 414                             int64_t *cluster_sector_num,
 415                             int *cluster_nb_sectors)
 416 {
 417     BlockDriverInfo bdi;
 418
 419     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
 420         *cluster_sector_num = sector_num;
 421         *cluster_nb_sectors = nb_sectors;
 422     } else {
 423         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
 424         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
 425         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
 426                                             nb_sectors, c);
 427     }
 428 }
 429
 430 static int bdrv_get_cluster_size(BlockDriverState *bs)
 431 {
 432     BlockDriverInfo bdi;
 433     int ret;
 434
 435     ret = bdrv_get_info(bs, &bdi);
 436     if (ret < 0 || bdi.cluster_size == 0) {
 437         return bs->request_alignment;
 438     } else {
 439         return bdi.cluster_size;
 440     }
 441 }
 442
 443 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
 444                                      int64_t offset, unsigned int bytes)
 445 {
 446     /*        aaaa   bbbb */
 447     if (offset >= req->overlap_offset + req->overlap_bytes) {
 448         return false;
 449     }
 450     /* bbbb   aaaa        */
 451     if (req->overlap_offset >= offset + bytes) {
 452         return false;
 453     }
 454     return true;
 455 }
 456
 457 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
 458 {
 459     BlockDriverState *bs = self->bs;
 460     BdrvTrackedRequest *req;
 461     bool retry;
 462     bool waited = false;
 463
 464     if (!bs->serialising_in_flight) {
 465         return false;
 466     }
 467
 468     do {
 469         retry = false;
 470         QLIST_FOREACH(req, &bs->tracked_requests, list) {
 471             if (req == self || (!req->serialising && !self->serialising)) {
 472                 continue;
 473             }
 474             if (tracked_request_overlaps(req, self->overlap_offset,
 475                                          self->overlap_bytes))
 476             {
 477                 /* Hitting this means there was a reentrant request, for
 478                  * example, a block driver issuing nested requests.  This must
 479                  * never happen since it means deadlock.
 480                  */
 481                 assert(qemu_coroutine_self() != req->co);
 482
 483                 /* If the request is already (indirectly) waiting for us, or
 484                  * will wait for us as soon as it wakes up, then just go on
 485                  * (instead of producing a deadlock in the former case). */
 486                 if (!req->waiting_for) {
 487                     self->waiting_for = req;
 488                     qemu_co_queue_wait(&req->wait_queue);
 489                     self->waiting_for = NULL;
 490                     retry = true;
 491                     waited = true;
 492                     break;
 493                 }
 494             }
 495         }
 496     } while (retry);
 497
 498     return waited;
 499 }
 500
 501 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 502                                    size_t size)
 503 {
 504     if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
 505         return -EIO;
 506     }
 507
 508     if (!bdrv_is_inserted(bs)) {
 509         return -ENOMEDIUM;
 510     }
 511
 512     if (offset < 0) {
 513         return -EIO;
 514     }
 515
 516     return 0;
 517 }
 518
 519 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
 520                               int nb_sectors)
 521 {
 522     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
 523         return -EIO;
 524     }
 525
 526     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
 527                                    nb_sectors * BDRV_SECTOR_SIZE);
 528 }
 529
/*
 * Arguments and result of a synchronous vectored read or write that is
 * executed by bdrv_rw_co_entry().
 */
typedef struct RwCo {
    BlockDriverState *bs;   /* device the request is issued on */
    int64_t offset;         /* start of the request; passed as the byte offset */
    QEMUIOVector *qiov;     /* data buffers; qiov->size is the request length */
    bool is_write;          /* true: write request, false: read request */
    int ret;                /* result; set to NOT_DONE until the request ends */
    BdrvRequestFlags flags; /* passed through to the coroutine read/write */
} RwCo;
 538
 539 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
 540 {
 541     RwCo *rwco = opaque;
 542
 543     if (!rwco->is_write) {
 544         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
 545                                       rwco->qiov->size, rwco->qiov,
 546                                       rwco->flags);
 547     } else {
 548         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
 549                                        rwco->qiov->size, rwco->qiov,
 550                                        rwco->flags);
 551     }
 552 }
 553
 554 /*
 555  * Process a vectored synchronous request using coroutines
 556  */
 557 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
 558                         QEMUIOVector *qiov, bool is_write,
 559                         BdrvRequestFlags flags)
 560 {
 561     Coroutine *co;
 562     RwCo rwco = {
 563         .bs = bs,
 564         .offset = offset,
 565         .qiov = qiov,
 566         .is_write = is_write,
 567         .ret = NOT_DONE,
 568         .flags = flags,
 569     };
 570
 571     /**
 572      * In sync call context, when the vcpu is blocked, this throttling timer
 573      * will not fire; so the I/O throttling function has to be disabled here
 574      * if it has been enabled.
 575      */
 576     if (bs->io_limits_enabled) {
 577         fprintf(stderr, "Disabling I/O throttling on '%s' due "
 578                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
 579         bdrv_io_limits_disable(bs);
 580     }
 581
 582     if (qemu_in_coroutine()) {
 583         /* Fast-path if already in coroutine context */
 584         bdrv_rw_co_entry(&rwco);
 585     } else {
 586         AioContext *aio_context = bdrv_get_aio_context(bs);
 587
 588         co = qemu_coroutine_create(bdrv_rw_co_entry);
 589         qemu_coroutine_enter(co, &rwco);
 590         while (rwco.ret == NOT_DONE) {
 591             aio_poll(aio_context, true);
 592         }
 593     }
 594     return rwco.ret;
 595 }
 596
 597 /*
 598  * Process a synchronous request using coroutines
 599  */
 600 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
 601                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
 602 {
 603     QEMUIOVector qiov;
 604     struct iovec iov = {
 605         .iov_base = (void *)buf,
 606         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
 607     };
 608
 609     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
 610         return -EINVAL;
 611     }
 612
 613     qemu_iovec_init_external(&qiov, &iov, 1);
 614     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
 615                         &qiov, is_write, flags);
 616 }
 617
 618 /* return < 0 if error. See bdrv_write() for the return codes */
 619 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
 620               uint8_t *buf, int nb_sectors)
 621 {
 622     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
 623 }
 624
 625 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
 626 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
 627                           uint8_t *buf, int nb_sectors)
 628 {
 629     bool enabled;
 630     int ret;
 631
 632     enabled = bs->io_limits_enabled;
 633     bs->io_limits_enabled = false;
 634     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
 635     bs->io_limits_enabled = enabled;
 636     return ret;
 637 }
 638
 639 /* Return < 0 if error. Important errors are:
 640   -EIO         generic I/O error (may happen for all errors)
 641   -ENOMEDIUM   No media inserted.
 642   -EINVAL      Invalid sector number or nb_sectors
 643   -EACCES      Trying to write a read-only device
 644 */
 645 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
 646                const uint8_t *buf, int nb_sectors)
 647 {
 648     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
 649 }
 650
 651 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
 652                       int nb_sectors, BdrvRequestFlags flags)
 653 {
 654     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
 655                       BDRV_REQ_ZERO_WRITE | flags);
 656 }
 657
 658 /*
 659  * Completely zero out a block device with the help of bdrv_write_zeroes.
 660  * The operation is sped up by checking the block status and only writing
 661  * zeroes to the device if they currently do not return zeroes. Optional
 662  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 663  *
 664  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 665  */
 666 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
 667 {
 668     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
 669     int n;
 670
 671     target_sectors = bdrv_nb_sectors(bs);
 672     if (target_sectors < 0) {
 673         return target_sectors;
 674     }
 675
 676     for (;;) {
 677         nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
 678         if (nb_sectors <= 0) {
 679             return 0;
 680         }
 681         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
 682         if (ret < 0) {
 683             error_report("error getting block status at sector %" PRId64 ": %s",
 684                          sector_num, strerror(-ret));
 685             return ret;
 686         }
 687         if (ret & BDRV_BLOCK_ZERO) {
 688             sector_num += n;
 689             continue;
 690         }
 691         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
 692         if (ret < 0) {
 693             error_report("error writing zeroes at sector %" PRId64 ": %s",
 694                          sector_num, strerror(-ret));
 695             return ret;
 696         }
 697         sector_num += n;
 698     }
 699 }
 700
 701 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
 702 {
 703     QEMUIOVector qiov;
 704     struct iovec iov = {
 705         .iov_base = (void *)buf,
 706         .iov_len = bytes,
 707     };
 708     int ret;
 709
 710     if (bytes < 0) {
 711         return -EINVAL;
 712     }
 713
 714     qemu_iovec_init_external(&qiov, &iov, 1);
 715     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
 716     if (ret < 0) {
 717         return ret;
 718     }
 719
 720     return bytes;
 721 }
 722
 723 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
 724 {
 725     int ret;
 726
 727     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
 728     if (ret < 0) {
 729         return ret;
 730     }
 731
 732     return qiov->size;
 733 }
 734
 735 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
 736                 const void *buf, int bytes)
 737 {
 738     QEMUIOVector qiov;
 739     struct iovec iov = {
 740         .iov_base   = (void *) buf,
 741         .iov_len    = bytes,
 742     };
 743
 744     if (bytes < 0) {
 745         return -EINVAL;
 746     }
 747
 748     qemu_iovec_init_external(&qiov, &iov, 1);
 749     return bdrv_pwritev(bs, offset, &qiov);
 750 }
 751
 752 /*
 753  * Writes to the file and ensures that no writes are reordered across this
 754  * request (acts as a barrier)
 755  *
 756  * Returns 0 on success, -errno in error cases.
 757  */
 758 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
 759     const void *buf, int count)
 760 {
 761     int ret;
 762
 763     ret = bdrv_pwrite(bs, offset, buf, count);
 764     if (ret < 0) {
 765         return ret;
 766     }
 767
 768     /* No flush needed for cache modes that already do it */
 769     if (bs->enable_write_cache) {
 770         bdrv_flush(bs);
 771     }
 772
 773     return 0;
 774 }
 775
 776 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
 777         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 778 {
 779     /* Perform I/O through a temporary buffer so that users who scribble over
 780      * their read buffer while the operation is in progress do not end up
 781      * modifying the image file.  This is critical for zero-copy guest I/O
 782      * where anything might happen inside guest memory.
 783      */
 784     void *bounce_buffer;
 785
 786     BlockDriver *drv = bs->drv;
 787     struct iovec iov;
 788     QEMUIOVector bounce_qiov;
 789     int64_t cluster_sector_num;
 790     int cluster_nb_sectors;
 791     size_t skip_bytes;
 792     int ret;
 793
 794     /* Cover entire cluster so no additional backing file I/O is required when
 795      * allocating cluster in the image file.
 796      */
 797     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
 798                            &cluster_sector_num, &cluster_nb_sectors);
 799
 800     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
 801                                    cluster_sector_num, cluster_nb_sectors);
 802
 803     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
 804     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
 805     if (bounce_buffer == NULL) {
 806         ret = -ENOMEM;
 807         goto err;
 808     }
 809
 810     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
 811
 812     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
 813                              &bounce_qiov);
 814     if (ret < 0) {
 815         goto err;
 816     }
 817
 818     if (drv->bdrv_co_write_zeroes &&
 819         buffer_is_zero(bounce_buffer, iov.iov_len)) {
 820         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
 821                                       cluster_nb_sectors, 0);
 822     } else {
 823         /* This does not change the data on the disk, it is not necessary
 824          * to flush even in cache=writethrough mode.
 825          */
 826         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
 827                                   &bounce_qiov);
 828     }
 829
 830     if (ret < 0) {
 831         /* It might be okay to ignore write errors for guest requests.  If this
 832          * is a deliberate copy-on-read then we don't want to ignore the error.
 833          * Simply report it in all cases.
 834          */
 835         goto err;
 836     }
 837
 838     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
 839     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
 840                         nb_sectors * BDRV_SECTOR_SIZE);
 841
 842 err:
 843     qemu_vfree(bounce_buffer);
 844     return ret;
 845 }
 846
 847 /*
 848  * Forwards an already correctly aligned request to the BlockDriver. This
 849  * handles copy on read and zeroing after EOF; any other features must be
 850  * implemented by the caller.
 851  */
 852 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
 853     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
 854     int64_t align, QEMUIOVector *qiov, int flags)
 855 {
 856     BlockDriver *drv = bs->drv;
 857     int ret;
 858
 859     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
 860     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
 861
 862     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
 863     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 864     assert(!qiov || bytes == qiov->size);
 865
 866     /* Handle Copy on Read and associated serialisation */
 867     if (flags & BDRV_REQ_COPY_ON_READ) {
 868         /* If we touch the same cluster it counts as an overlap.  This
 869          * guarantees that allocating writes will be serialized and not race
 870          * with each other for the same cluster.  For example, in copy-on-read
 871          * it ensures that the CoR read and write operations are atomic and
 872          * guest writes cannot interleave between them. */
 873         mark_request_serialising(req, bdrv_get_cluster_size(bs));
 874     }
 875
 876     wait_serialising_requests(req);
 877
 878     if (flags & BDRV_REQ_COPY_ON_READ) {
 879         int pnum;
 880
 881         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
 882         if (ret < 0) {
 883             goto out;
 884         }
 885
 886         if (!ret || pnum != nb_sectors) {
 887             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
 888             goto out;
 889         }
 890     }
 891
 892     /* Forward the request to the BlockDriver */
 893     if (!bs->zero_beyond_eof) {
 894         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
 895     } else {
 896         /* Read zeros after EOF */
 897         int64_t total_sectors, max_nb_sectors;
 898
 899         total_sectors = bdrv_nb_sectors(bs);
 900         if (total_sectors < 0) {
 901             ret = total_sectors;
 902             goto out;
 903         }
 904
 905         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
 906                                   align >> BDRV_SECTOR_BITS);
 907         if (nb_sectors < max_nb_sectors) {
 908             ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
 909         } else if (max_nb_sectors > 0) {
 910             QEMUIOVector local_qiov;
 911
 912             qemu_iovec_init(&local_qiov, qiov->niov);
 913             qemu_iovec_concat(&local_qiov, qiov, 0,
 914                               max_nb_sectors * BDRV_SECTOR_SIZE);
 915
 916             ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
 917                                      &local_qiov);
 918
 919             qemu_iovec_destroy(&local_qiov);
 920         } else {
 921             ret = 0;
 922         }
 923
 924         /* Reading beyond end of file is supposed to produce zeroes */
 925         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
 926             uint64_t offset = MAX(0, total_sectors - sector_num);
 927             uint64_t bytes = (sector_num + nb_sectors - offset) *
 928                               BDRV_SECTOR_SIZE;
 929             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
 930         }
 931     }
 932
 933 out:
 934     return ret;
 935 }
 936
 937 /*
 938  * Handle a read request in coroutine context
 939  */
 940 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
 941     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
 942     BdrvRequestFlags flags)
 943 {
 944     BlockDriver *drv = bs->drv;
 945     BdrvTrackedRequest req;
 946
 947     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
 948     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
 949     uint8_t *head_buf = NULL;
 950     uint8_t *tail_buf = NULL;
 951     QEMUIOVector local_qiov;
 952     bool use_local_qiov = false;
 953     int ret;
 954
 955     if (!drv) {
 956         return -ENOMEDIUM;
 957     }
 958
 959     ret = bdrv_check_byte_request(bs, offset, bytes);
 960     if (ret < 0) {
 961         return ret;
 962     }
 963
 964     if (bs->copy_on_read) {
 965         flags |= BDRV_REQ_COPY_ON_READ;
 966     }
 967
 968     /* throttling disk I/O */
 969     if (bs->io_limits_enabled) {
 970         bdrv_io_limits_intercept(bs, bytes, false);
 971     }
 972
 973     /* Align read if necessary by padding qiov */
 974     if (offset & (align - 1)) {
 975         head_buf = qemu_blockalign(bs, align);
 976         qemu_iovec_init(&local_qiov, qiov->niov + 2);
 977         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
 978         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
 979         use_local_qiov = true;
 980
 981         bytes += offset & (align - 1);
 982         offset = offset & ~(align - 1);
 983     }
 984
 985     if ((offset + bytes) & (align - 1)) {
 986         if (!use_local_qiov) {
 987             qemu_iovec_init(&local_qiov, qiov->niov + 1);
 988             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
 989             use_local_qiov = true;
 990         }
 991         tail_buf = qemu_blockalign(bs, align);
 992         qemu_iovec_add(&local_qiov, tail_buf,
 993                        align - ((offset + bytes) & (align - 1)));
 994
 995         bytes = ROUND_UP(bytes, align);
 996     }
 997
 998     tracked_request_begin(&req, bs, offset, bytes, false);
 999     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1000                               use_local_qiov ? &local_qiov : qiov,
1001                               flags);
1002     tracked_request_end(&req);
1003
1004     if (use_local_qiov) {
1005         qemu_iovec_destroy(&local_qiov);
1006         qemu_vfree(head_buf);
1007         qemu_vfree(tail_buf);
1008     }
1009
1010     return ret;
1011 }
1012
1013 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1014     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1015     BdrvRequestFlags flags)
1016 {
1017     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1018         return -EINVAL;
1019     }
1020
1021     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
1022                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1023 }
1024
1025 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1026     int nb_sectors, QEMUIOVector *qiov)
1027 {
1028     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1029
1030     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1031 }
1032
1033 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1034     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1035 {
1036     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1037
1038     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1039                             BDRV_REQ_COPY_ON_READ);
1040 }
1041
1042 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
1043
1044 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1045     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
1046 {
1047     BlockDriver *drv = bs->drv;
1048     QEMUIOVector qiov;
1049     struct iovec iov = {0};
1050     int ret = 0;
1051
1052     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
1053                                         BDRV_REQUEST_MAX_SECTORS);
1054
1055     while (nb_sectors > 0 && !ret) {
1056         int num = nb_sectors;
1057
1058         /* Align request.  Block drivers can expect the "bulk" of the request
1059          * to be aligned.
1060          */
1061         if (bs->bl.write_zeroes_alignment
1062             && num > bs->bl.write_zeroes_alignment) {
1063             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
1064                 /* Make a small request up to the first aligned sector.  */
1065                 num = bs->bl.write_zeroes_alignment;
1066                 num -= sector_num % bs->bl.write_zeroes_alignment;
1067             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
1068                 /* Shorten the request to the last aligned sector.  num cannot
1069                  * underflow because num > bs->bl.write_zeroes_alignment.
1070                  */
1071                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
1072             }
1073         }
1074
1075         /* limit request size */
1076         if (num > max_write_zeroes) {
1077             num = max_write_zeroes;
1078         }
1079
1080         ret = -ENOTSUP;
1081         /* First try the efficient write zeroes operation */
1082         if (drv->bdrv_co_write_zeroes) {
1083             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
1084         }
1085
1086         if (ret == -ENOTSUP) {
1087             /* Fall back to bounce buffer if write zeroes is unsupported */
1088             int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
1089                                             MAX_WRITE_ZEROES_BOUNCE_BUFFER);
1090             num = MIN(num, max_xfer_len);
1091             iov.iov_len = num * BDRV_SECTOR_SIZE;
1092             if (iov.iov_base == NULL) {
1093                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
1094                 if (iov.iov_base == NULL) {
1095                     ret = -ENOMEM;
1096                     goto fail;
1097                 }
1098                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
1099             }
1100             qemu_iovec_init_external(&qiov, &iov, 1);
1101
1102             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
1103
1104             /* Keep bounce buffer around if it is big enough for all
1105              * all future requests.
1106              */
1107             if (num < max_xfer_len) {
1108                 qemu_vfree(iov.iov_base);
1109                 iov.iov_base = NULL;
1110             }
1111         }
1112
1113         sector_num += num;
1114         nb_sectors -= num;
1115     }
1116
1117 fail:
1118     qemu_vfree(iov.iov_base);
1119     return ret;
1120 }
1121
1122 /*
1123  * Forwards an already correctly aligned write request to the BlockDriver.
1124  */
1125 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
1126     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1127     QEMUIOVector *qiov, int flags)
1128 {
1129     BlockDriver *drv = bs->drv;
1130     bool waited;
1131     int ret;
1132
1133     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
1134     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
1135
1136     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
1137     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
1138     assert(!qiov || bytes == qiov->size);
1139
1140     waited = wait_serialising_requests(req);
1141     assert(!waited || !req->serialising);
1142     assert(req->overlap_offset <= offset);
1143     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1144
1145     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1146
1147     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1148         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
1149         qemu_iovec_is_zero(qiov)) {
1150         flags |= BDRV_REQ_ZERO_WRITE;
1151         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1152             flags |= BDRV_REQ_MAY_UNMAP;
1153         }
1154     }
1155
1156     if (ret < 0) {
1157         /* Do nothing, write notifier decided to fail this request */
1158     } else if (flags & BDRV_REQ_ZERO_WRITE) {
1159         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
1160         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
1161     } else {
1162         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
1163         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1164     }
1165     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
1166
1167     if (ret == 0 && !bs->enable_write_cache) {
1168         ret = bdrv_co_flush(bs);
1169     }
1170
1171     bdrv_set_dirty(bs, sector_num, nb_sectors);
1172
1173     block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
1174
1175     if (ret >= 0) {
1176         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
1177     }
1178
1179     return ret;
1180 }
1181
1182 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
1183                                                 int64_t offset,
1184                                                 unsigned int bytes,
1185                                                 BdrvRequestFlags flags,
1186                                                 BdrvTrackedRequest *req)
1187 {
1188     uint8_t *buf = NULL;
1189     QEMUIOVector local_qiov;
1190     struct iovec iov;
1191     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1192     unsigned int head_padding_bytes, tail_padding_bytes;
1193     int ret = 0;
1194
1195     head_padding_bytes = offset & (align - 1);
1196     tail_padding_bytes = align - ((offset + bytes) & (align - 1));
1197
1198
1199     assert(flags & BDRV_REQ_ZERO_WRITE);
1200     if (head_padding_bytes || tail_padding_bytes) {
1201         buf = qemu_blockalign(bs, align);
1202         iov = (struct iovec) {
1203             .iov_base   = buf,
1204             .iov_len    = align,
1205         };
1206         qemu_iovec_init_external(&local_qiov, &iov, 1);
1207     }
1208     if (head_padding_bytes) {
1209         uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1210
1211         /* RMW the unaligned part before head. */
1212         mark_request_serialising(req, align);
1213         wait_serialising_requests(req);
1214         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
1215         ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
1216                                   align, &local_qiov, 0);
1217         if (ret < 0) {
1218             goto fail;
1219         }
1220         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1221
1222         memset(buf + head_padding_bytes, 0, zero_bytes);
1223         ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
1224                                    &local_qiov,
1225                                    flags & ~BDRV_REQ_ZERO_WRITE);
1226         if (ret < 0) {
1227             goto fail;
1228         }
1229         offset += zero_bytes;
1230         bytes -= zero_bytes;
1231     }
1232
1233     assert(!bytes || (offset & (align - 1)) == 0);
1234     if (bytes >= align) {
1235         /* Write the aligned part in the middle. */
1236         uint64_t aligned_bytes = bytes & ~(align - 1);
1237         ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
1238                                    NULL, flags);
1239         if (ret < 0) {
1240             goto fail;
1241         }
1242         bytes -= aligned_bytes;
1243         offset += aligned_bytes;
1244     }
1245
1246     assert(!bytes || (offset & (align - 1)) == 0);
1247     if (bytes) {
1248         assert(align == tail_padding_bytes + bytes);
1249         /* RMW the unaligned part after tail. */
1250         mark_request_serialising(req, align);
1251         wait_serialising_requests(req);
1252         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
1253         ret = bdrv_aligned_preadv(bs, req, offset, align,
1254                                   align, &local_qiov, 0);
1255         if (ret < 0) {
1256             goto fail;
1257         }
1258         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1259
1260         memset(buf, 0, bytes);
1261         ret = bdrv_aligned_pwritev(bs, req, offset, align,
1262                                    &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1263     }
1264 fail:
1265     qemu_vfree(buf);
1266     return ret;
1267
1268 }
1269
1270 /*
1271  * Handle a write request in coroutine context
1272  */
1273 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
1274     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1275     BdrvRequestFlags flags)
1276 {
1277     BdrvTrackedRequest req;
1278     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
1279     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1280     uint8_t *head_buf = NULL;
1281     uint8_t *tail_buf = NULL;
1282     QEMUIOVector local_qiov;
1283     bool use_local_qiov = false;
1284     int ret;
1285
1286     if (!bs->drv) {
1287         return -ENOMEDIUM;
1288     }
1289     if (bs->read_only) {
1290         return -EPERM;
1291     }
1292
1293     ret = bdrv_check_byte_request(bs, offset, bytes);
1294     if (ret < 0) {
1295         return ret;
1296     }
1297
1298     /* throttling disk I/O */
1299     if (bs->io_limits_enabled) {
1300         bdrv_io_limits_intercept(bs, bytes, true);
1301     }
1302
1303     /*
1304      * Align write if necessary by performing a read-modify-write cycle.
1305      * Pad qiov with the read parts and be sure to have a tracked request not
1306      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1307      */
1308     tracked_request_begin(&req, bs, offset, bytes, true);
1309
1310     if (!qiov) {
1311         ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
1312         goto out;
1313     }
1314
1315     if (offset & (align - 1)) {
1316         QEMUIOVector head_qiov;
1317         struct iovec head_iov;
1318
1319         mark_request_serialising(&req, align);
1320         wait_serialising_requests(&req);
1321
1322         head_buf = qemu_blockalign(bs, align);
1323         head_iov = (struct iovec) {
1324             .iov_base   = head_buf,
1325             .iov_len    = align,
1326         };
1327         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1328
1329         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
1330         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
1331                                   align, &head_qiov, 0);
1332         if (ret < 0) {
1333             goto fail;
1334         }
1335         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1336
1337         qemu_iovec_init(&local_qiov, qiov->niov + 2);
1338         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1339         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1340         use_local_qiov = true;
1341
1342         bytes += offset & (align - 1);
1343         offset = offset & ~(align - 1);
1344     }
1345
1346     if ((offset + bytes) & (align - 1)) {
1347         QEMUIOVector tail_qiov;
1348         struct iovec tail_iov;
1349         size_t tail_bytes;
1350         bool waited;
1351
1352         mark_request_serialising(&req, align);
1353         waited = wait_serialising_requests(&req);
1354         assert(!waited || !use_local_qiov);
1355
1356         tail_buf = qemu_blockalign(bs, align);
1357         tail_iov = (struct iovec) {
1358             .iov_base   = tail_buf,
1359             .iov_len    = align,
1360         };
1361         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1362
1363         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
1364         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
1365                                   align, &tail_qiov, 0);
1366         if (ret < 0) {
1367             goto fail;
1368         }
1369         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1370
1371         if (!use_local_qiov) {
1372             qemu_iovec_init(&local_qiov, qiov->niov + 1);
1373             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1374             use_local_qiov = true;
1375         }
1376
1377         tail_bytes = (offset + bytes) & (align - 1);
1378         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1379
1380         bytes = ROUND_UP(bytes, align);
1381     }
1382
1383     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
1384                                use_local_qiov ? &local_qiov : qiov,
1385                                flags);
1386
1387 fail:
1388
1389     if (use_local_qiov) {
1390         qemu_iovec_destroy(&local_qiov);
1391     }
1392     qemu_vfree(head_buf);
1393     qemu_vfree(tail_buf);
1394 out:
1395     tracked_request_end(&req);
1396     return ret;
1397 }
1398
1399 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1400     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1401     BdrvRequestFlags flags)
1402 {
1403     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1404         return -EINVAL;
1405     }
1406
1407     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
1408                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1409 }
1410
1411 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1412     int nb_sectors, QEMUIOVector *qiov)
1413 {
1414     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1415
1416     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1417 }
1418
1419 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1420                                       int64_t sector_num, int nb_sectors,
1421                                       BdrvRequestFlags flags)
1422 {
1423     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
1424
1425     if (!(bs->open_flags & BDRV_O_UNMAP)) {
1426         flags &= ~BDRV_REQ_MAY_UNMAP;
1427     }
1428
1429     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1430                              BDRV_REQ_ZERO_WRITE | flags);
1431 }
1432
1433 int bdrv_flush_all(void)
1434 {
1435     BlockDriverState *bs = NULL;
1436     int result = 0;
1437
1438     while ((bs = bdrv_next(bs))) {
1439         AioContext *aio_context = bdrv_get_aio_context(bs);
1440         int ret;
1441
1442         aio_context_acquire(aio_context);
1443         ret = bdrv_flush(bs);
1444         if (ret < 0 && !result) {
1445             result = ret;
1446         }
1447         aio_context_release(aio_context);
1448     }
1449
1450     return result;
1451 }
1452
1453 typedef struct BdrvCoGetBlockStatusData {
1454     BlockDriverState *bs;
1455     BlockDriverState *base;
1456     int64_t sector_num;
1457     int nb_sectors;
1458     int *pnum;
1459     int64_t ret;
1460     bool done;
1461 } BdrvCoGetBlockStatusData;
1462
1463 /*
1464  * Returns the allocation status of the specified sectors.
1465  * Drivers not implementing the functionality are assumed to not support
1466  * backing files, hence all their sectors are reported as allocated.
1467  *
1468  * If 'sector_num' is beyond the end of the disk image the return value is 0
1469  * and 'pnum' is set to 0.
1470  *
1471  * 'pnum' is set to the number of sectors (including and immediately following
1472  * the specified sector) that are known to be in the same
1473  * allocated/unallocated state.
1474  *
1475  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1476  * beyond the end of the disk image it will be clamped.
1477  */
1478 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
1479                                                      int64_t sector_num,
1480                                                      int nb_sectors, int *pnum)
1481 {
1482     int64_t total_sectors;
1483     int64_t n;
1484     int64_t ret, ret2;
1485
1486     total_sectors = bdrv_nb_sectors(bs);
1487     if (total_sectors < 0) {
1488         return total_sectors;
1489     }
1490
1491     if (sector_num >= total_sectors) {
1492         *pnum = 0;
1493         return 0;
1494     }
1495
1496     n = total_sectors - sector_num;
1497     if (n < nb_sectors) {
1498         nb_sectors = n;
1499     }
1500
1501     if (!bs->drv->bdrv_co_get_block_status) {
1502         *pnum = nb_sectors;
1503         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1504         if (bs->drv->protocol_name) {
1505             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
1506         }
1507         return ret;
1508     }
1509
1510     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
1511     if (ret < 0) {
1512         *pnum = 0;
1513         return ret;
1514     }
1515
1516     if (ret & BDRV_BLOCK_RAW) {
1517         assert(ret & BDRV_BLOCK_OFFSET_VALID);
1518         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
1519                                      *pnum, pnum);
1520     }
1521
1522     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
1523         ret |= BDRV_BLOCK_ALLOCATED;
1524     } else {
1525         if (bdrv_unallocated_blocks_are_zero(bs)) {
1526             ret |= BDRV_BLOCK_ZERO;
1527         } else if (bs->backing_hd) {
1528             BlockDriverState *bs2 = bs->backing_hd;
1529             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
1530             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
1531                 ret |= BDRV_BLOCK_ZERO;
1532             }
1533         }
1534     }
1535
1536     if (bs->file &&
1537         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
1538         (ret & BDRV_BLOCK_OFFSET_VALID)) {
1539         int file_pnum;
1540
1541         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
1542                                         *pnum, &file_pnum);
1543         if (ret2 >= 0) {
1544             /* Ignore errors.  This is just providing extra information, it
1545              * is useful but not necessary.
1546              */
1547             if (!file_pnum) {
1548                 /* !file_pnum indicates an offset at or beyond the EOF; it is
1549                  * perfectly valid for the format block driver to point to such
1550                  * offsets, so catch it and mark everything as zero */
1551                 ret |= BDRV_BLOCK_ZERO;
1552             } else {
1553                 /* Limit request to the range reported by the protocol driver */
1554                 *pnum = file_pnum;
1555                 ret |= (ret2 & BDRV_BLOCK_ZERO);
1556             }
1557         }
1558     }
1559
1560     return ret;
1561 }
1562
1563 /* Coroutine wrapper for bdrv_get_block_status() */
1564 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
1565 {
1566     BdrvCoGetBlockStatusData *data = opaque;
1567     BlockDriverState *bs = data->bs;
1568
1569     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
1570                                          data->pnum);
1571     data->done = true;
1572 }
1573
1574 /*
1575  * Synchronous wrapper around bdrv_co_get_block_status().
1576  *
1577  * See bdrv_co_get_block_status() for details.
1578  */
1579 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
1580                               int nb_sectors, int *pnum)
1581 {
1582     Coroutine *co;
1583     BdrvCoGetBlockStatusData data = {
1584         .bs = bs,
1585         .sector_num = sector_num,
1586         .nb_sectors = nb_sectors,
1587         .pnum = pnum,
1588         .done = false,
1589     };
1590
1591     if (qemu_in_coroutine()) {
1592         /* Fast-path if already in coroutine context */
1593         bdrv_get_block_status_co_entry(&data);
1594     } else {
1595         AioContext *aio_context = bdrv_get_aio_context(bs);
1596
1597         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
1598         qemu_coroutine_enter(co, &data);
1599         while (!data.done) {
1600             aio_poll(aio_context, true);
1601         }
1602     }
1603     return data.ret;
1604 }
1605
1606 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
1607                                    int nb_sectors, int *pnum)
1608 {
1609     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
1610     if (ret < 0) {
1611         return ret;
1612     }
1613     return !!(ret & BDRV_BLOCK_ALLOCATED);
1614 }
1615
1616 /*
1617  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
1618  *
1619  * Return true if the given sector is allocated in any image between
1620  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
1621  * sector is allocated in any image of the chain.  Return false otherwise.
1622  *
1623  * 'pnum' is set to the number of sectors (including and immediately following
1624  *  the specified sector) that are known to be in the same
1625  *  allocated/unallocated state.
1626  *
1627  */
1628 int bdrv_is_allocated_above(BlockDriverState *top,
1629                             BlockDriverState *base,
1630                             int64_t sector_num,
1631                             int nb_sectors, int *pnum)
1632 {
1633     BlockDriverState *intermediate;
1634     int ret, n = nb_sectors;
1635
1636     intermediate = top;
1637     while (intermediate && intermediate != base) {
1638         int pnum_inter;
1639         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
1640                                 &pnum_inter);
1641         if (ret < 0) {
1642             return ret;
1643         } else if (ret) {
1644             *pnum = pnum_inter;
1645             return 1;
1646         }
1647
1648         /*
1649          * [sector_num, nb_sectors] is unallocated on top but intermediate
1650          * might have
1651          *
1652          * [sector_num+x, nr_sectors] allocated.
1653          */
1654         if (n > pnum_inter &&
1655             (intermediate == top ||
1656              sector_num + pnum_inter < intermediate->total_sectors)) {
1657             n = pnum_inter;
1658         }
1659
1660         intermediate = intermediate->backing_hd;
1661     }
1662
1663     *pnum = n;
1664     return 0;
1665 }
1666
1667 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1668                           const uint8_t *buf, int nb_sectors)
1669 {
1670     BlockDriver *drv = bs->drv;
1671     int ret;
1672
1673     if (!drv) {
1674         return -ENOMEDIUM;
1675     }
1676     if (!drv->bdrv_write_compressed) {
1677         return -ENOTSUP;
1678     }
1679     ret = bdrv_check_request(bs, sector_num, nb_sectors);
1680     if (ret < 0) {
1681         return ret;
1682     }
1683
1684     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1685
1686     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1687 }
1688
1689 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1690                       int64_t pos, int size)
1691 {
1692     QEMUIOVector qiov;
1693     struct iovec iov = {
1694         .iov_base   = (void *) buf,
1695         .iov_len    = size,
1696     };
1697
1698     qemu_iovec_init_external(&qiov, &iov, 1);
1699     return bdrv_writev_vmstate(bs, &qiov, pos);
1700 }
1701
1702 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
1703 {
1704     BlockDriver *drv = bs->drv;
1705
1706     if (!drv) {
1707         return -ENOMEDIUM;
1708     } else if (drv->bdrv_save_vmstate) {
1709         return drv->bdrv_save_vmstate(bs, qiov, pos);
1710     } else if (bs->file) {
1711         return bdrv_writev_vmstate(bs->file, qiov, pos);
1712     }
1713
1714     return -ENOTSUP;
1715 }
1716
1717 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1718                       int64_t pos, int size)
1719 {
1720     BlockDriver *drv = bs->drv;
1721     if (!drv)
1722         return -ENOMEDIUM;
1723     if (drv->bdrv_load_vmstate)
1724         return drv->bdrv_load_vmstate(bs, buf, pos, size);
1725     if (bs->file)
1726         return bdrv_load_vmstate(bs->file, buf, pos, size);
1727     return -ENOTSUP;
1728 }
1729
1730 /**************************************************************/
1731 /* async I/Os */
1732
1733 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
1734                            QEMUIOVector *qiov, int nb_sectors,
1735                            BlockCompletionFunc *cb, void *opaque)
1736 {
1737     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
1738
1739     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1740                                  cb, opaque, false);
1741 }
1742
1743 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
1744                             QEMUIOVector *qiov, int nb_sectors,
1745                             BlockCompletionFunc *cb, void *opaque)
1746 {
1747     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
1748
1749     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1750                                  cb, opaque, true);
1751 }
1752
1753 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
1754         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
1755         BlockCompletionFunc *cb, void *opaque)
1756 {
1757     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
1758
1759     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
1760                                  BDRV_REQ_ZERO_WRITE | flags,
1761                                  cb, opaque, true);
1762 }
1763
1764
1765 typedef struct MultiwriteCB {
1766     int error;
1767     int num_requests;
1768     int num_callbacks;
1769     struct {
1770         BlockCompletionFunc *cb;
1771         void *opaque;
1772         QEMUIOVector *free_qiov;
1773     } callbacks[];
1774 } MultiwriteCB;
1775
1776 static void multiwrite_user_cb(MultiwriteCB *mcb)
1777 {
1778     int i;
1779
1780     for (i = 0; i < mcb->num_callbacks; i++) {
1781         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1782         if (mcb->callbacks[i].free_qiov) {
1783             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
1784         }
1785         g_free(mcb->callbacks[i].free_qiov);
1786     }
1787 }
1788
1789 static void multiwrite_cb(void *opaque, int ret)
1790 {
1791     MultiwriteCB *mcb = opaque;
1792
1793     trace_multiwrite_cb(mcb, ret);
1794
1795     if (ret < 0 && !mcb->error) {
1796         mcb->error = ret;
1797     }
1798
1799     mcb->num_requests--;
1800     if (mcb->num_requests == 0) {
1801         multiwrite_user_cb(mcb);
1802         g_free(mcb);
1803     }
1804 }
1805
1806 static int multiwrite_req_compare(const void *a, const void *b)
1807 {
1808     const BlockRequest *req1 = a, *req2 = b;
1809
1810     /*
1811      * Note that we can't simply subtract req2->sector from req1->sector
1812      * here as that could overflow the return value.
1813      */
1814     if (req1->sector > req2->sector) {
1815         return 1;
1816     } else if (req1->sector < req2->sector) {
1817         return -1;
1818     } else {
1819         return 0;
1820     }
1821 }
1822
1823 /*
1824  * Takes a bunch of requests and tries to merge them. Returns the number of
1825  * requests that remain after merging.
1826  */
1827 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
1828     int num_reqs, MultiwriteCB *mcb)
1829 {
1830     int i, outidx;
1831
1832     // Sort requests by start sector
1833     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
1834
1835     // Check if adjacent requests touch the same clusters. If so, combine them,
1836     // filling up gaps with zero sectors.
1837     outidx = 0;
1838     for (i = 1; i < num_reqs; i++) {
1839         int merge = 0;
1840         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
1841
1842         // Handle exactly sequential writes and overlapping writes.
1843         if (reqs[i].sector <= oldreq_last) {
1844             merge = 1;
1845         }
1846
1847         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
1848             merge = 0;
1849         }
1850
1851         if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
1852             reqs[i].nb_sectors > bs->bl.max_transfer_length) {
1853             merge = 0;
1854         }
1855
1856         if (merge) {
1857             size_t size;
1858             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
1859             qemu_iovec_init(qiov,
1860                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
1861
1862             // Add the first request to the merged one. If the requests are
1863             // overlapping, drop the last sectors of the first request.
1864             size = (reqs[i].sector - reqs[outidx].sector) << 9;
1865             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
1866
1867             // We should need to add any zeros between the two requests
1868             assert (reqs[i].sector <= oldreq_last);
1869
1870             // Add the second request
1871             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
1872
1873             // Add tail of first request, if necessary
1874             if (qiov->size < reqs[outidx].qiov->size) {
1875                 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
1876                                   reqs[outidx].qiov->size - qiov->size);
1877             }
1878
1879             reqs[outidx].nb_sectors = qiov->size >> 9;
1880             reqs[outidx].qiov = qiov;
1881
1882             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
1883         } else {
1884             outidx++;
1885             reqs[outidx].sector     = reqs[i].sector;
1886             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
1887             reqs[outidx].qiov       = reqs[i].qiov;
1888         }
1889     }
1890
1891     block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
1892
1893     return outidx + 1;
1894 }
1895
1896 /*
1897  * Submit multiple AIO write requests at once.
1898  *
1899  * On success, the function returns 0 and all requests in the reqs array have
1900  * been submitted. In error case this function returns -1, and any of the
1901  * requests may or may not be submitted yet. In particular, this means that the
1902  * callback will be called for some of the requests, for others it won't. The
1903  * caller must check the error field of the BlockRequest to wait for the right
1904  * callbacks (if error != 0, no callback will be called).
1905  *
1906  * The implementation may modify the contents of the reqs array, e.g. to merge
1907  * requests. However, the fields opaque and error are left unmodified as they
1908  * are used to signal failure for a single request to the caller.
1909  */
1910 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
1911 {
1912     MultiwriteCB *mcb;
1913     int i;
1914
1915     /* don't submit writes if we don't have a medium */
1916     if (bs->drv == NULL) {
1917         for (i = 0; i < num_reqs; i++) {
1918             reqs[i].error = -ENOMEDIUM;
1919         }
1920         return -1;
1921     }
1922
1923     if (num_reqs == 0) {
1924         return 0;
1925     }
1926
1927     // Create MultiwriteCB structure
1928     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
1929     mcb->num_requests = 0;
1930     mcb->num_callbacks = num_reqs;
1931
1932     for (i = 0; i < num_reqs; i++) {
1933         mcb->callbacks[i].cb = reqs[i].cb;
1934         mcb->callbacks[i].opaque = reqs[i].opaque;
1935     }
1936
1937     // Check for mergable requests
1938     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
1939
1940     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
1941
1942     /* Run the aio requests. */
1943     mcb->num_requests = num_reqs;
1944     for (i = 0; i < num_reqs; i++) {
1945         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
1946                               reqs[i].nb_sectors, reqs[i].flags,
1947                               multiwrite_cb, mcb,
1948                               true);
1949     }
1950
1951     return 0;
1952 }
1953
1954 void bdrv_aio_cancel(BlockAIOCB *acb)
1955 {
1956     qemu_aio_ref(acb);
1957     bdrv_aio_cancel_async(acb);
1958     while (acb->refcnt > 1) {
1959         if (acb->aiocb_info->get_aio_context) {
1960             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
1961         } else if (acb->bs) {
1962             aio_poll(bdrv_get_aio_context(acb->bs), true);
1963         } else {
1964             abort();
1965         }
1966     }
1967     qemu_aio_unref(acb);
1968 }
1969
1970 /* Async version of aio cancel. The caller is not blocked if the acb implements
1971  * cancel_async, otherwise we do nothing and let the request normally complete.
1972  * In either case the completion callback must be called. */
1973 void bdrv_aio_cancel_async(BlockAIOCB *acb)
1974 {
1975     if (acb->aiocb_info->cancel_async) {
1976         acb->aiocb_info->cancel_async(acb);
1977     }
1978 }
1979
1980 /**************************************************************/
1981 /* async block device emulation */
1982
1983 typedef struct BlockAIOCBSync {
1984     BlockAIOCB common;
1985     QEMUBH *bh;
1986     int ret;
1987     /* vector translation state */
1988     QEMUIOVector *qiov;
1989     uint8_t *bounce;
1990     int is_write;
1991 } BlockAIOCBSync;
1992
1993 static const AIOCBInfo bdrv_em_aiocb_info = {
1994     .aiocb_size         = sizeof(BlockAIOCBSync),
1995 };
1996
1997 static void bdrv_aio_bh_cb(void *opaque)
1998 {
1999     BlockAIOCBSync *acb = opaque;
2000
2001     if (!acb->is_write && acb->ret >= 0) {
2002         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
2003     }
2004     qemu_vfree(acb->bounce);
2005     acb->common.cb(acb->common.opaque, acb->ret);
2006     qemu_bh_delete(acb->bh);
2007     acb->bh = NULL;
2008     qemu_aio_unref(acb);
2009 }
2010
2011 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2012                                       int64_t sector_num,
2013                                       QEMUIOVector *qiov,
2014                                       int nb_sectors,
2015                                       BlockCompletionFunc *cb,
2016                                       void *opaque,
2017                                       int is_write)
2018
2019 {
2020     BlockAIOCBSync *acb;
2021
2022     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
2023     acb->is_write = is_write;
2024     acb->qiov = qiov;
2025     acb->bounce = qemu_try_blockalign(bs, qiov->size);
2026     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
2027
2028     if (acb->bounce == NULL) {
2029         acb->ret = -ENOMEM;
2030     } else if (is_write) {
2031         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
2032         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2033     } else {
2034         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2035     }
2036
2037     qemu_bh_schedule(acb->bh);
2038
2039     return &acb->common;
2040 }
2041
2042 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2043         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2044         BlockCompletionFunc *cb, void *opaque)
2045 {
2046     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2047 }
2048
2049 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2050         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2051         BlockCompletionFunc *cb, void *opaque)
2052 {
2053     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2054 }
2055
2056
2057 typedef struct BlockAIOCBCoroutine {
2058     BlockAIOCB common;
2059     BlockRequest req;
2060     bool is_write;
2061     bool need_bh;
2062     bool *done;
2063     QEMUBH* bh;
2064 } BlockAIOCBCoroutine;
2065
2066 static const AIOCBInfo bdrv_em_co_aiocb_info = {
2067     .aiocb_size         = sizeof(BlockAIOCBCoroutine),
2068 };
2069
2070 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
2071 {
2072     if (!acb->need_bh) {
2073         acb->common.cb(acb->common.opaque, acb->req.error);
2074         qemu_aio_unref(acb);
2075     }
2076 }
2077
2078 static void bdrv_co_em_bh(void *opaque)
2079 {
2080     BlockAIOCBCoroutine *acb = opaque;
2081
2082     assert(!acb->need_bh);
2083     qemu_bh_delete(acb->bh);
2084     bdrv_co_complete(acb);
2085 }
2086
2087 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
2088 {
2089     acb->need_bh = false;
2090     if (acb->req.error != -EINPROGRESS) {
2091         BlockDriverState *bs = acb->common.bs;
2092
2093         acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
2094         qemu_bh_schedule(acb->bh);
2095     }
2096 }
2097
2098 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2099 static void coroutine_fn bdrv_co_do_rw(void *opaque)
2100 {
2101     BlockAIOCBCoroutine *acb = opaque;
2102     BlockDriverState *bs = acb->common.bs;
2103
2104     if (!acb->is_write) {
2105         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
2106             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2107     } else {
2108         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
2109             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2110     }
2111
2112     bdrv_co_complete(acb);
2113 }
2114
2115 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2116                                          int64_t sector_num,
2117                                          QEMUIOVector *qiov,
2118                                          int nb_sectors,
2119                                          BdrvRequestFlags flags,
2120                                          BlockCompletionFunc *cb,
2121                                          void *opaque,
2122                                          bool is_write)
2123 {
2124     Coroutine *co;
2125     BlockAIOCBCoroutine *acb;
2126
2127     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2128     acb->need_bh = true;
2129     acb->req.error = -EINPROGRESS;
2130     acb->req.sector = sector_num;
2131     acb->req.nb_sectors = nb_sectors;
2132     acb->req.qiov = qiov;
2133     acb->req.flags = flags;
2134     acb->is_write = is_write;
2135
2136     co = qemu_coroutine_create(bdrv_co_do_rw);
2137     qemu_coroutine_enter(co, acb);
2138
2139     bdrv_co_maybe_schedule_bh(acb);
2140     return &acb->common;
2141 }
2142
2143 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2144 {
2145     BlockAIOCBCoroutine *acb = opaque;
2146     BlockDriverState *bs = acb->common.bs;
2147
2148     acb->req.error = bdrv_co_flush(bs);
2149     bdrv_co_complete(acb);
2150 }
2151
2152 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2153         BlockCompletionFunc *cb, void *opaque)
2154 {
2155     trace_bdrv_aio_flush(bs, opaque);
2156
2157     Coroutine *co;
2158     BlockAIOCBCoroutine *acb;
2159
2160     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2161     acb->need_bh = true;
2162     acb->req.error = -EINPROGRESS;
2163
2164     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
2165     qemu_coroutine_enter(co, acb);
2166
2167     bdrv_co_maybe_schedule_bh(acb);
2168     return &acb->common;
2169 }
2170
2171 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
2172 {
2173     BlockAIOCBCoroutine *acb = opaque;
2174     BlockDriverState *bs = acb->common.bs;
2175
2176     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2177     bdrv_co_complete(acb);
2178 }
2179
2180 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
2181         int64_t sector_num, int nb_sectors,
2182         BlockCompletionFunc *cb, void *opaque)
2183 {
2184     Coroutine *co;
2185     BlockAIOCBCoroutine *acb;
2186
2187     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
2188
2189     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2190     acb->need_bh = true;
2191     acb->req.error = -EINPROGRESS;
2192     acb->req.sector = sector_num;
2193     acb->req.nb_sectors = nb_sectors;
2194     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
2195     qemu_coroutine_enter(co, acb);
2196
2197     bdrv_co_maybe_schedule_bh(acb);
2198     return &acb->common;
2199 }
2200
2201 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
2202                    BlockCompletionFunc *cb, void *opaque)
2203 {
2204     BlockAIOCB *acb;
2205
2206     acb = g_slice_alloc(aiocb_info->aiocb_size);
2207     acb->aiocb_info = aiocb_info;
2208     acb->bs = bs;
2209     acb->cb = cb;
2210     acb->opaque = opaque;
2211     acb->refcnt = 1;
2212     return acb;
2213 }
2214
2215 void qemu_aio_ref(void *p)
2216 {
2217     BlockAIOCB *acb = p;
2218     acb->refcnt++;
2219 }
2220
2221 void qemu_aio_unref(void *p)
2222 {
2223     BlockAIOCB *acb = p;
2224     assert(acb->refcnt > 0);
2225     if (--acb->refcnt == 0) {
2226         g_slice_free1(acb->aiocb_info->aiocb_size, acb);
2227     }
2228 }
2229
2230 /**************************************************************/
2231 /* Coroutine block device emulation */
2232
2233 typedef struct CoroutineIOCompletion {
2234     Coroutine *coroutine;
2235     int ret;
2236 } CoroutineIOCompletion;
2237
2238 static void bdrv_co_io_em_complete(void *opaque, int ret)
2239 {
2240     CoroutineIOCompletion *co = opaque;
2241
2242     co->ret = ret;
2243     qemu_coroutine_enter(co->coroutine, NULL);
2244 }
2245
2246 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
2247                                       int nb_sectors, QEMUIOVector *iov,
2248                                       bool is_write)
2249 {
2250     CoroutineIOCompletion co = {
2251         .coroutine = qemu_coroutine_self(),
2252     };
2253     BlockAIOCB *acb;
2254
2255     if (is_write) {
2256         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
2257                                        bdrv_co_io_em_complete, &co);
2258     } else {
2259         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
2260                                       bdrv_co_io_em_complete, &co);
2261     }
2262
2263     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
2264     if (!acb) {
2265         return -EIO;
2266     }
2267     qemu_coroutine_yield();
2268
2269     return co.ret;
2270 }
2271
2272 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
2273                                          int64_t sector_num, int nb_sectors,
2274                                          QEMUIOVector *iov)
2275 {
2276     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
2277 }
2278
2279 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
2280                                          int64_t sector_num, int nb_sectors,
2281                                          QEMUIOVector *iov)
2282 {
2283     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
2284 }
2285
2286 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2287 {
2288     RwCo *rwco = opaque;
2289
2290     rwco->ret = bdrv_co_flush(rwco->bs);
2291 }
2292
2293 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2294 {
2295     int ret;
2296
2297     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2298         return 0;
2299     }
2300
2301     /* Write back cached data to the OS even with cache=unsafe */
2302     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2303     if (bs->drv->bdrv_co_flush_to_os) {
2304         ret = bs->drv->bdrv_co_flush_to_os(bs);
2305         if (ret < 0) {
2306             return ret;
2307         }
2308     }
2309
2310     /* But don't actually force it to the disk with cache=unsafe */
2311     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2312         goto flush_parent;
2313     }
2314
2315     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2316     if (bs->drv->bdrv_co_flush_to_disk) {
2317         ret = bs->drv->bdrv_co_flush_to_disk(bs);
2318     } else if (bs->drv->bdrv_aio_flush) {
2319         BlockAIOCB *acb;
2320         CoroutineIOCompletion co = {
2321             .coroutine = qemu_coroutine_self(),
2322         };
2323
2324         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2325         if (acb == NULL) {
2326             ret = -EIO;
2327         } else {
2328             qemu_coroutine_yield();
2329             ret = co.ret;
2330         }
2331     } else {
2332         /*
2333          * Some block drivers always operate in either writethrough or unsafe
2334          * mode and don't support bdrv_flush therefore. Usually qemu doesn't
2335          * know how the server works (because the behaviour is hardcoded or
2336          * depends on server-side configuration), so we can't ensure that
2337          * everything is safe on disk. Returning an error doesn't work because
2338          * that would break guests even if the server operates in writethrough
2339          * mode.
2340          *
2341          * Let's hope the user knows what he's doing.
2342          */
2343         ret = 0;
2344     }
2345     if (ret < 0) {
2346         return ret;
2347     }
2348
2349     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2350      * in the case of cache=unsafe, so there are no useless flushes.
2351      */
2352 flush_parent:
2353     return bdrv_co_flush(bs->file);
2354 }
2355
2356 int bdrv_flush(BlockDriverState *bs)
2357 {
2358     Coroutine *co;
2359     RwCo rwco = {
2360         .bs = bs,
2361         .ret = NOT_DONE,
2362     };
2363
2364     if (qemu_in_coroutine()) {
2365         /* Fast-path if already in coroutine context */
2366         bdrv_flush_co_entry(&rwco);
2367     } else {
2368         AioContext *aio_context = bdrv_get_aio_context(bs);
2369
2370         co = qemu_coroutine_create(bdrv_flush_co_entry);
2371         qemu_coroutine_enter(co, &rwco);
2372         while (rwco.ret == NOT_DONE) {
2373             aio_poll(aio_context, true);
2374         }
2375     }
2376
2377     return rwco.ret;
2378 }
2379
2380 typedef struct DiscardCo {
2381     BlockDriverState *bs;
2382     int64_t sector_num;
2383     int nb_sectors;
2384     int ret;
2385 } DiscardCo;
2386 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
2387 {
2388     DiscardCo *rwco = opaque;
2389
2390     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
2391 }
2392
2393 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
2394                                  int nb_sectors)
2395 {
2396     int max_discard, ret;
2397
2398     if (!bs->drv) {
2399         return -ENOMEDIUM;
2400     }
2401
2402     ret = bdrv_check_request(bs, sector_num, nb_sectors);
2403     if (ret < 0) {
2404         return ret;
2405     } else if (bs->read_only) {
2406         return -EPERM;
2407     }
2408
2409     bdrv_reset_dirty(bs, sector_num, nb_sectors);
2410
2411     /* Do nothing if disabled.  */
2412     if (!(bs->open_flags & BDRV_O_UNMAP)) {
2413         return 0;
2414     }
2415
2416     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
2417         return 0;
2418     }
2419
2420     max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
2421     while (nb_sectors > 0) {
2422         int ret;
2423         int num = nb_sectors;
2424
2425         /* align request */
2426         if (bs->bl.discard_alignment &&
2427             num >= bs->bl.discard_alignment &&
2428             sector_num % bs->bl.discard_alignment) {
2429             if (num > bs->bl.discard_alignment) {
2430                 num = bs->bl.discard_alignment;
2431             }
2432             num -= sector_num % bs->bl.discard_alignment;
2433         }
2434
2435         /* limit request size */
2436         if (num > max_discard) {
2437             num = max_discard;
2438         }
2439
2440         if (bs->drv->bdrv_co_discard) {
2441             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
2442         } else {
2443             BlockAIOCB *acb;
2444             CoroutineIOCompletion co = {
2445                 .coroutine = qemu_coroutine_self(),
2446             };
2447
2448             acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
2449                                             bdrv_co_io_em_complete, &co);
2450             if (acb == NULL) {
2451                 return -EIO;
2452             } else {
2453                 qemu_coroutine_yield();
2454                 ret = co.ret;
2455             }
2456         }
2457         if (ret && ret != -ENOTSUP) {
2458             return ret;
2459         }
2460
2461         sector_num += num;
2462         nb_sectors -= num;
2463     }
2464     return 0;
2465 }
2466
2467 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
2468 {
2469     Coroutine *co;
2470     DiscardCo rwco = {
2471         .bs = bs,
2472         .sector_num = sector_num,
2473         .nb_sectors = nb_sectors,
2474         .ret = NOT_DONE,
2475     };
2476
2477     if (qemu_in_coroutine()) {
2478         /* Fast-path if already in coroutine context */
2479         bdrv_discard_co_entry(&rwco);
2480     } else {
2481         AioContext *aio_context = bdrv_get_aio_context(bs);
2482
2483         co = qemu_coroutine_create(bdrv_discard_co_entry);
2484         qemu_coroutine_enter(co, &rwco);
2485         while (rwco.ret == NOT_DONE) {
2486             aio_poll(aio_context, true);
2487         }
2488     }
2489
2490     return rwco.ret;
2491 }
2492
2493 /* needed for generic scsi interface */
2494
2495 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2496 {
2497     BlockDriver *drv = bs->drv;
2498
2499     if (drv && drv->bdrv_ioctl)
2500         return drv->bdrv_ioctl(bs, req, buf);
2501     return -ENOTSUP;
2502 }
2503
2504 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
2505         unsigned long int req, void *buf,
2506         BlockCompletionFunc *cb, void *opaque)
2507 {
2508     BlockDriver *drv = bs->drv;
2509
2510     if (drv && drv->bdrv_aio_ioctl)
2511         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
2512     return NULL;
2513 }
2514
2515 void *qemu_blockalign(BlockDriverState *bs, size_t size)
2516 {
2517     return qemu_memalign(bdrv_opt_mem_align(bs), size);
2518 }
2519
2520 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2521 {
2522     return memset(qemu_blockalign(bs, size), 0, size);
2523 }
2524
2525 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2526 {
2527     size_t align = bdrv_opt_mem_align(bs);
2528
2529     /* Ensure that NULL is never returned on success */
2530     assert(align > 0);
2531     if (size == 0) {
2532         size = align;
2533     }
2534
2535     return qemu_try_memalign(align, size);
2536 }
2537
2538 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2539 {
2540     void *mem = qemu_try_blockalign(bs, size);
2541
2542     if (mem) {
2543         memset(mem, 0, size);
2544     }
2545
2546     return mem;
2547 }
2548
2549 /*
2550  * Check if all memory in this vector is sector aligned.
2551  */
2552 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2553 {
2554     int i;
2555     size_t alignment = bdrv_min_mem_align(bs);
2556
2557     for (i = 0; i < qiov->niov; i++) {
2558         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2559             return false;
2560         }
2561         if (qiov->iov[i].iov_len % alignment) {
2562             return false;
2563         }
2564     }
2565
2566     return true;
2567 }
2568
2569 void bdrv_add_before_write_notifier(BlockDriverState *bs,
2570                                     NotifierWithReturn *notifier)
2571 {
2572     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2573 }
2574
2575 void bdrv_io_plug(BlockDriverState *bs)
2576 {
2577     BlockDriver *drv = bs->drv;
2578     if (drv && drv->bdrv_io_plug) {
2579         drv->bdrv_io_plug(bs);
2580     } else if (bs->file) {
2581         bdrv_io_plug(bs->file);
2582     }
2583 }
2584
2585 void bdrv_io_unplug(BlockDriverState *bs)
2586 {
2587     BlockDriver *drv = bs->drv;
2588     if (drv && drv->bdrv_io_unplug) {
2589         drv->bdrv_io_unplug(bs);
2590     } else if (bs->file) {
2591         bdrv_io_unplug(bs->file);
2592     }
2593 }
2594
2595 void bdrv_flush_io_queue(BlockDriverState *bs)
2596 {
2597     BlockDriver *drv = bs->drv;
2598     if (drv && drv->bdrv_flush_io_queue) {
2599         drv->bdrv_flush_io_queue(bs);
2600     } else if (bs->file) {
2601         bdrv_flush_io_queue(bs->file);
2602     }
2603 }