block/commit.c

   1 /*
   2  * Live block commit
   3  *
   4  * Copyright Red Hat, Inc. 2012
   5  *
   6  * Authors:
   7  *  Jeff Cody   <jcody@redhat.com>
   8  *  Based on stream.c by Stefan Hajnoczi
   9  *
  10  * This work is licensed under the terms of the GNU LGPL, version 2 or later.
  11  * See the COPYING.LIB file in the top-level directory.
  12  *
  13  */
  14
  15 #include "qemu/osdep.h"
  16 #include "qemu/cutils.h"
  17 #include "trace.h"
  18 #include "block/block_int.h"
  19 #include "block/blockjob_int.h"
  20 #include "qapi/error.h"
  21 #include "qemu/ratelimit.h"
  22 #include "qemu/memalign.h"
  23 #include "sysemu/block-backend.h"
  24
  25 enum {
  26     /*
  27      * Size of data buffer for populating the image file.  This should be large
  28      * enough to process multiple clusters in a single call, so that populating
  29      * contiguous regions of the image is efficient.
  30      */
  31     COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
  32 };
  33
/* Per-job state of a live block commit job. */
typedef struct CommitBlockJob {
    /* Generic block job state; callbacks recover this struct via common.job */
    BlockJob common;
    /* The commit_top filter node that commit_start() appends above top */
    BlockDriverState *commit_top_bs;
    /* BlockBackend attached to the top node; commit_run() reads from it */
    BlockBackend *top;
    /*
     * BlockBackend attached to the base node; commit_run() resizes and writes
     * through it.  Reset to NULL once commit_prepare() has released it.
     */
    BlockBackend *base;
    /* The base node itself */
    BlockDriverState *base_bs;
    /*
     * Result of bdrv_find_overlay(top, base); commit_run() copies whatever is
     * allocated above this node.
     */
    BlockDriverState *base_overlay;
    /* Error policy handed to block_job_error_action() */
    BlockdevOnError on_error;
    /* True if base was read-only when the job started (restored on clean) */
    bool base_read_only;
    /*
     * True after commit_start() has frozen the backing chain between
     * commit_top_bs and base; cleared by commit_prepare() when it unfreezes.
     */
    bool chain_frozen;
    /* Owned copy (g_strdup) passed to bdrv_drop_intermediate(); freed on clean */
    char *backing_file_str;
    /* Passed through unchanged to bdrv_drop_intermediate() */
    bool backing_mask_protocol;
} CommitBlockJob;
  47
  48 static int commit_prepare(Job *job)
  49 {
  50     CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
  51
  52     bdrv_graph_rdlock_main_loop();
  53     bdrv_unfreeze_backing_chain(s->commit_top_bs, s->base_bs);
  54     s->chain_frozen = false;
  55     bdrv_graph_rdunlock_main_loop();
  56
  57     /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
  58      * the normal backing chain can be restored. */
  59     blk_unref(s->base);
  60     s->base = NULL;
  61
  62     /* FIXME: bdrv_drop_intermediate treats total failures and partial failures
  63      * identically. Further work is needed to disambiguate these cases. */
  64     return bdrv_drop_intermediate(s->commit_top_bs, s->base_bs,
  65                                   s->backing_file_str,
  66                                   s->backing_mask_protocol);
  67 }
  68
  69 static void commit_abort(Job *job)
  70 {
  71     CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
  72     BlockDriverState *top_bs = blk_bs(s->top);
  73     BlockDriverState *commit_top_backing_bs;
  74
  75     if (s->chain_frozen) {
  76         bdrv_graph_rdlock_main_loop();
  77         bdrv_unfreeze_backing_chain(s->commit_top_bs, s->base_bs);
  78         bdrv_graph_rdunlock_main_loop();
  79     }
  80
  81     /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
  82     bdrv_ref(top_bs);
  83     bdrv_ref(s->commit_top_bs);
  84
  85     if (s->base) {
  86         blk_unref(s->base);
  87     }
  88
  89     /* free the blockers on the intermediate nodes so that bdrv_replace_nodes
  90      * can succeed */
  91     block_job_remove_all_bdrv(&s->common);
  92
  93     /* If bdrv_drop_intermediate() failed (or was not invoked), remove the
  94      * commit filter driver from the backing chain now. Do this as the final
  95      * step so that the 'consistent read' permission can be granted.
  96      *
  97      * XXX Can (or should) we somehow keep 'consistent read' blocked even
  98      * after the failed/cancelled commit job is gone? If we already wrote
  99      * something to base, the intermediate images aren't valid any more. */
 100     bdrv_graph_rdlock_main_loop();
 101     commit_top_backing_bs = s->commit_top_bs->backing->bs;
 102     bdrv_graph_rdunlock_main_loop();
 103
 104     bdrv_drained_begin(commit_top_backing_bs);
 105     bdrv_graph_wrlock();
 106     bdrv_replace_node(s->commit_top_bs, commit_top_backing_bs, &error_abort);
 107     bdrv_graph_wrunlock();
 108     bdrv_drained_end(commit_top_backing_bs);
 109
 110     bdrv_unref(s->commit_top_bs);
 111     bdrv_unref(top_bs);
 112 }
 113
 114 static void commit_clean(Job *job)
 115 {
 116     CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
 117
 118     /* restore base open flags here if appropriate (e.g., change the base back
 119      * to r/o). These reopens do not need to be atomic, since we won't abort
 120      * even on failure here */
 121     if (s->base_read_only) {
 122         bdrv_reopen_set_read_only(s->base_bs, true, NULL);
 123     }
 124
 125     g_free(s->backing_file_str);
 126     blk_unref(s->top);
 127 }
 128
 129 static int coroutine_fn commit_run(Job *job, Error **errp)
 130 {
 131     CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
 132     int64_t offset;
 133     int ret = 0;
 134     int64_t n = 0; /* bytes */
 135     QEMU_AUTO_VFREE void *buf = NULL;
 136     int64_t len, base_len;
 137
 138     len = blk_co_getlength(s->top);
 139     if (len < 0) {
 140         return len;
 141     }
 142     job_progress_set_remaining(&s->common.job, len);
 143
 144     base_len = blk_co_getlength(s->base);
 145     if (base_len < 0) {
 146         return base_len;
 147     }
 148
 149     if (base_len < len) {
 150         ret = blk_co_truncate(s->base, len, false, PREALLOC_MODE_OFF, 0, NULL);
 151         if (ret) {
 152             return ret;
 153         }
 154     }
 155
 156     buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);
 157
 158     for (offset = 0; offset < len; offset += n) {
 159         bool copy;
 160         bool error_in_source = true;
 161
 162         /* Note that even when no rate limit is applied we need to yield
 163          * with no pending I/O here so that bdrv_drain_all() returns.
 164          */
 165         block_job_ratelimit_sleep(&s->common);
 166         if (job_is_cancelled(&s->common.job)) {
 167             break;
 168         }
 169         /* Copy if allocated above the base */
 170         ret = blk_co_is_allocated_above(s->top, s->base_overlay, true,
 171                                         offset, COMMIT_BUFFER_SIZE, &n);
 172         copy = (ret > 0);
 173         trace_commit_one_iteration(s, offset, n, ret);
 174         if (copy) {
 175             assert(n < SIZE_MAX);
 176
 177             ret = blk_co_pread(s->top, offset, n, buf, 0);
 178             if (ret >= 0) {
 179                 ret = blk_co_pwrite(s->base, offset, n, buf, 0);
 180                 if (ret < 0) {
 181                     error_in_source = false;
 182                 }
 183             }
 184         }
 185         if (ret < 0) {
 186             BlockErrorAction action =
 187                 block_job_error_action(&s->common, s->on_error,
 188                                        error_in_source, -ret);
 189             if (action == BLOCK_ERROR_ACTION_REPORT) {
 190                 return ret;
 191             } else {
 192                 n = 0;
 193                 continue;
 194             }
 195         }
 196         /* Publish progress */
 197         job_progress_update(&s->common.job, n);
 198
 199         if (copy) {
 200             block_job_ratelimit_processed_bytes(&s->common, n);
 201         }
 202     }
 203
 204     return 0;
 205 }
 206
 207 static const BlockJobDriver commit_job_driver = {
 208     .job_driver = {
 209         .instance_size = sizeof(CommitBlockJob),
 210         .job_type      = JOB_TYPE_COMMIT,
 211         .free          = block_job_free,
 212         .user_resume   = block_job_user_resume,
 213         .run           = commit_run,
 214         .prepare       = commit_prepare,
 215         .abort         = commit_abort,
 216         .clean         = commit_clean
 217     },
 218 };
 219
 220 static int coroutine_fn GRAPH_RDLOCK
 221 bdrv_commit_top_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
 222                        QEMUIOVector *qiov, BdrvRequestFlags flags)
 223 {
 224     return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
 225 }
 226
 227 static GRAPH_RDLOCK void bdrv_commit_top_refresh_filename(BlockDriverState *bs)
 228 {
 229     pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
 230             bs->backing->bs->filename);
 231 }
 232
 233 static void bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c,
 234                                        BdrvChildRole role,
 235                                        BlockReopenQueue *reopen_queue,
 236                                        uint64_t perm, uint64_t shared,
 237                                        uint64_t *nperm, uint64_t *nshared)
 238 {
 239     *nperm = 0;
 240     *nshared = BLK_PERM_ALL;
 241 }
 242
 243 /* Dummy node that provides consistent read to its users without requiring it
 244  * from its backing file and that allows writes on the backing file chain. */
 245 static BlockDriver bdrv_commit_top = {
 246     .format_name                = "commit_top",
 247     .bdrv_co_preadv             = bdrv_commit_top_preadv,
 248     .bdrv_refresh_filename      = bdrv_commit_top_refresh_filename,
 249     .bdrv_child_perm            = bdrv_commit_top_child_perm,
 250
 251     .is_filter                  = true,
 252     .filtered_child_is_backing  = true,
 253 };
 254
/*
 * Create and start a live commit job that copies data from @top down into
 * @base.
 *
 * @job_id:                job identifier, passed to block_job_create()
 * @bs:                    node the block job is created on; must not be @top
 * @base:                  node that receives the data
 * @top:                   topmost node whose data is committed
 * @creation_flags:        passed to block_job_create()
 * @speed:                 passed to block_job_create()
 * @on_error:              error policy stored in the job
 * @backing_file_str:      copied into the job; commit_prepare() later hands it
 *                         to bdrv_drop_intermediate()
 * @backing_mask_protocol: stored in the job for bdrv_drop_intermediate()
 * @filter_node_name:      node name for the commit_top filter; if NULL the
 *                         filter node is marked implicit
 * @errp:                  set on failure
 *
 * On failure everything set up so far is undone at the 'fail' label and the
 * job is released with job_early_fail().
 */
void commit_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, BlockDriverState *top,
                  int creation_flags, int64_t speed,
                  BlockdevOnError on_error, const char *backing_file_str,
                  bool backing_mask_protocol,
                  const char *filter_node_name, Error **errp)
{
    CommitBlockJob *s;
    BlockDriverState *iter;
    BlockDriverState *commit_top_bs = NULL;
    BlockDriverState *filtered_base;
    int64_t base_size, top_size;
    uint64_t base_perms, iter_shared_perms;
    int ret;

    GLOBAL_STATE_CODE();

    assert(top != bs);
    /* Reject the request if top and base are the same node modulo filters */
    bdrv_graph_rdlock_main_loop();
    if (bdrv_skip_filters(top) == bdrv_skip_filters(base)) {
        error_setg(errp, "Invalid files for merge: top and base are the same");
        bdrv_graph_rdunlock_main_loop();
        return;
    }
    bdrv_graph_rdunlock_main_loop();

    base_size = bdrv_getlength(base);
    if (base_size < 0) {
        error_setg_errno(errp, -base_size, "Could not inquire base image size");
        return;
    }

    top_size = bdrv_getlength(top);
    if (top_size < 0) {
        error_setg_errno(errp, -top_size, "Could not inquire top image size");
        return;
    }

    /* RESIZE is only requested on base if it is shorter than top */
    base_perms = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
    if (base_size < top_size) {
        base_perms |= BLK_PERM_RESIZE;
    }

    /* From here on, errors must go through the 'fail' label */
    s = block_job_create(job_id, &commit_job_driver, NULL, bs, 0, BLK_PERM_ALL,
                         speed, creation_flags, NULL, NULL, errp);
    if (!s) {
        return;
    }

    /* convert base to r/w, if necessary */
    s->base_read_only = bdrv_is_read_only(base);
    if (s->base_read_only) {
        if (bdrv_reopen_set_read_only(base, false, errp) != 0) {
            goto fail;
        }
    }

    /* Insert commit_top block node above top, so we can block consistent read
     * on the backing chain below it */
    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, filter_node_name, 0,
                                         errp);
    if (commit_top_bs == NULL) {
        goto fail;
    }
    if (!filter_node_name) {
        commit_top_bs->implicit = true;
    }

    /* So that we can always drop this node */
    commit_top_bs->never_freeze = true;

    commit_top_bs->total_sectors = top->total_sectors;

    ret = bdrv_append(commit_top_bs, top, errp);
    bdrv_unref(commit_top_bs); /* referenced by new parents or failed */
    if (ret < 0) {
        /* Clear the pointer so the fail path does not try to replace it */
        commit_top_bs = NULL;
        goto fail;
    }

    s->commit_top_bs = commit_top_bs;

    /*
     * Block all nodes between top and base, because they will
     * disappear from the chain after this operation.
     * Note that this assumes that the user is fine with removing all
     * nodes (including R/W filters) between top and base.  Assuring
     * this is the responsibility of the interface (i.e. whoever calls
     * commit_start()).
     */
    bdrv_graph_wrlock();
    s->base_overlay = bdrv_find_overlay(top, base);
    assert(s->base_overlay);

    /*
     * The topmost node with
     * bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base)
     */
    filtered_base = bdrv_cow_bs(s->base_overlay);
    assert(bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base));

    /*
     * XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
     * at s->base (if writes are blocked for a node, they are also blocked
     * for its backing file). The other options would be a second filter
     * driver above s->base.
     */
    iter_shared_perms = BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE;

    for (iter = top; iter != base; iter = bdrv_filter_or_cow_bs(iter)) {
        if (iter == filtered_base) {
            /*
             * From here on, all nodes are filters on the base.  This
             * allows us to share BLK_PERM_CONSISTENT_READ.
             */
            iter_shared_perms |= BLK_PERM_CONSISTENT_READ;
        }

        ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                 iter_shared_perms, errp);
        if (ret < 0) {
            /* The write lock must be released before jumping to cleanup */
            bdrv_graph_wrunlock();
            goto fail;
        }
    }

    if (bdrv_freeze_backing_chain(commit_top_bs, base, errp) < 0) {
        bdrv_graph_wrunlock();
        goto fail;
    }
    /* Recorded so that the cleanup paths know they have to unfreeze */
    s->chain_frozen = true;

    ret = block_job_add_bdrv(&s->common, "base", base, 0, BLK_PERM_ALL, errp);
    bdrv_graph_wrunlock();

    if (ret < 0) {
        goto fail;
    }

    /* BlockBackend through which commit_run() writes to (and resizes) base */
    s->base = blk_new(s->common.job.aio_context,
                      base_perms,
                      BLK_PERM_CONSISTENT_READ
                      | BLK_PERM_WRITE_UNCHANGED);
    ret = blk_insert_bs(s->base, base, errp);
    if (ret < 0) {
        goto fail;
    }
    blk_set_disable_request_queuing(s->base, true);
    s->base_bs = base;

    /* Required permissions are already taken with block_job_add_bdrv() */
    s->top = blk_new(s->common.job.aio_context, 0, BLK_PERM_ALL);
    ret = blk_insert_bs(s->top, top, errp);
    if (ret < 0) {
        goto fail;
    }
    blk_set_disable_request_queuing(s->top, true);

    /* The job keeps its own copy; commit_clean() frees it */
    s->backing_file_str = g_strdup(backing_file_str);
    s->backing_mask_protocol = backing_mask_protocol;
    s->on_error = on_error;

    trace_commit_start(bs, base, top, s);
    job_start(&s->common.job);
    return;

fail:
    /* Undo whatever was set up before the failure */
    if (s->chain_frozen) {
        bdrv_graph_rdlock_main_loop();
        bdrv_unfreeze_backing_chain(commit_top_bs, base);
        bdrv_graph_rdunlock_main_loop();
    }
    if (s->base) {
        blk_unref(s->base);
    }
    if (s->top) {
        blk_unref(s->top);
    }
    if (s->base_read_only) {
        /* Best effort: a failure to go back to read-only is ignored */
        bdrv_reopen_set_read_only(base, true, NULL);
    }
    job_early_fail(&s->common.job);
    /* commit_top_bs has to be replaced after deleting the block job,
     * otherwise this would fail because of lack of permissions. */
    if (commit_top_bs) {
        bdrv_drained_begin(top);
        bdrv_graph_wrlock();
        bdrv_replace_node(commit_top_bs, top, &error_abort);
        bdrv_graph_wrunlock();
        bdrv_drained_end(top);
    }
}
 447
 448
 449 #define COMMIT_BUF_SIZE (2048 * BDRV_SECTOR_SIZE)
 450
 451 /* commit COW file into the raw image */
 452 int bdrv_commit(BlockDriverState *bs)
 453 {
 454     BlockBackend *src, *backing;
 455     BlockDriverState *backing_file_bs = NULL;
 456     BlockDriverState *commit_top_bs = NULL;
 457     BlockDriver *drv = bs->drv;
 458     AioContext *ctx;
 459     int64_t offset, length, backing_length;
 460     int ro;
 461     int64_t n;
 462     int ret = 0;
 463     QEMU_AUTO_VFREE uint8_t *buf = NULL;
 464     Error *local_err = NULL;
 465
 466     GLOBAL_STATE_CODE();
 467     GRAPH_RDLOCK_GUARD_MAINLOOP();
 468
 469     if (!drv)
 470         return -ENOMEDIUM;
 471
 472     backing_file_bs = bdrv_cow_bs(bs);
 473
 474     if (!backing_file_bs) {
 475         return -ENOTSUP;
 476     }
 477
 478     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
 479         bdrv_op_is_blocked(backing_file_bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL))
 480     {
 481         return -EBUSY;
 482     }
 483
 484     ro = bdrv_is_read_only(backing_file_bs);
 485
 486     if (ro) {
 487         if (bdrv_reopen_set_read_only(backing_file_bs, false, NULL)) {
 488             return -EACCES;
 489         }
 490     }
 491
 492     ctx = bdrv_get_aio_context(bs);
 493     /* WRITE_UNCHANGED is required for bdrv_make_empty() */
 494     src = blk_new(ctx, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED,
 495                   BLK_PERM_ALL);
 496     backing = blk_new(ctx, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
 497
 498     ret = blk_insert_bs(src, bs, &local_err);
 499     if (ret < 0) {
 500         error_report_err(local_err);
 501         goto ro_cleanup;
 502     }
 503
 504     /* Insert commit_top block node above backing, so we can write to it */
 505     commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, NULL, BDRV_O_RDWR,
 506                                          &local_err);
 507     if (commit_top_bs == NULL) {
 508         error_report_err(local_err);
 509         goto ro_cleanup;
 510     }
 511
 512     bdrv_set_backing_hd(commit_top_bs, backing_file_bs, &error_abort);
 513     bdrv_set_backing_hd(bs, commit_top_bs, &error_abort);
 514
 515     ret = blk_insert_bs(backing, backing_file_bs, &local_err);
 516     if (ret < 0) {
 517         error_report_err(local_err);
 518         goto ro_cleanup;
 519     }
 520
 521     length = blk_getlength(src);
 522     if (length < 0) {
 523         ret = length;
 524         goto ro_cleanup;
 525     }
 526
 527     backing_length = blk_getlength(backing);
 528     if (backing_length < 0) {
 529         ret = backing_length;
 530         goto ro_cleanup;
 531     }
 532
 533     /* If our top snapshot is larger than the backing file image,
 534      * grow the backing file image if possible.  If not possible,
 535      * we must return an error */
 536     if (length > backing_length) {
 537         ret = blk_truncate(backing, length, false, PREALLOC_MODE_OFF, 0,
 538                            &local_err);
 539         if (ret < 0) {
 540             error_report_err(local_err);
 541             goto ro_cleanup;
 542         }
 543     }
 544
 545     /* blk_try_blockalign() for src will choose an alignment that works for
 546      * backing as well, so no need to compare the alignment manually. */
 547     buf = blk_try_blockalign(src, COMMIT_BUF_SIZE);
 548     if (buf == NULL) {
 549         ret = -ENOMEM;
 550         goto ro_cleanup;
 551     }
 552
 553     for (offset = 0; offset < length; offset += n) {
 554         ret = bdrv_is_allocated(bs, offset, COMMIT_BUF_SIZE, &n);
 555         if (ret < 0) {
 556             goto ro_cleanup;
 557         }
 558         if (ret) {
 559             ret = blk_pread(src, offset, n, buf, 0);
 560             if (ret < 0) {
 561                 goto ro_cleanup;
 562             }
 563
 564             ret = blk_pwrite(backing, offset, n, buf, 0);
 565             if (ret < 0) {
 566                 goto ro_cleanup;
 567             }
 568         }
 569     }
 570
 571     ret = blk_make_empty(src, NULL);
 572     /* Ignore -ENOTSUP */
 573     if (ret < 0 && ret != -ENOTSUP) {
 574         goto ro_cleanup;
 575     }
 576
 577     blk_flush(src);
 578
 579     /*
 580      * Make sure all data we wrote to the backing device is actually
 581      * stable on disk.
 582      */
 583     blk_flush(backing);
 584
 585     ret = 0;
 586 ro_cleanup:
 587     blk_unref(backing);
 588     if (bdrv_cow_bs(bs) != backing_file_bs) {
 589         bdrv_set_backing_hd(bs, backing_file_bs, &error_abort);
 590     }
 591     bdrv_unref(commit_top_bs);
 592     blk_unref(src);
 593
 594     if (ro) {
 595         /* ignoring error return here */
 596         bdrv_reopen_set_read_only(backing_file_bs, true, NULL);
 597     }
 598
 599     return ret;
 600 }