/*
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)
typedef struct BlockCopyTask {
    int64_t offset;
    int64_t bytes;
    QLIST_ENTRY(BlockCopyTask) list;
    CoQueue wait_queue; /* coroutines blocked on this task */
} BlockCopyTask;
typedef struct BlockCopyState {
    /*
     * BdrvChild objects are not owned or managed by block-copy. They are
     * provided by the block-copy user, and the user is responsible for
     * appropriate permissions on these children.
     */
    BdrvChild *source;
    BdrvChild *target;
    BdrvDirtyBitmap *copy_bitmap;
    int64_t in_flight_bytes;
    int64_t cluster_size;
    bool use_copy_range;
    int64_t copy_size;
    int64_t len;
    QLIST_HEAD(, BlockCopyTask) tasks;

    BdrvRequestFlags write_flags;

    /*
     * skip_unallocated:
     *
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap. During this process, the bitmap
     * is thus not fully initialized: it may still have bits set for areas that
     * are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() will query the source's allocation status,
     * skip unallocated regions, clear them in the copy_bitmap, and invoke
     * block_copy_reset_unallocated() every time it does.
     */
    bool skip_unallocated;

    ProgressMeter *progress;
    /* progress_bytes_callback: called when some copying progress is done. */
    ProgressBytesCallbackFunc progress_bytes_callback;
    void *progress_opaque;

    SharedResource *mem;
} BlockCopyState;
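/*
 * Illustrative sketch (not part of the original file): how a sync=top user
 * might drive the skip_unallocated workflow described above. The function
 * name and pre-scan loop here are hypothetical; the real callers live in
 * the backup job code.
 */
#if 0
static void example_prepare_sync_top(BlockCopyState *s, int64_t len)
{
    int64_t offset, count;

    block_copy_set_skip_unallocated(s, true);

    /* Pre-scan: clear copy_bitmap bits for unallocated areas. */
    for (offset = 0; offset < len; offset += count) {
        if (block_copy_reset_unallocated(s, offset, &count) < 0) {
            break; /* on error, block_copy() will skip lazily instead */
        }
    }
}
#endif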
static BlockCopyTask *find_conflicting_task(BlockCopyState *s,
                                            int64_t offset, int64_t bytes)
{
    BlockCopyTask *t;

    QLIST_FOREACH(t, &s->tasks, list) {
        if (offset + bytes > t->offset && offset < t->offset + t->bytes) {
            return t;
        }
    }

    return NULL;
}
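/*
 * Worked example for the overlap test above (illustrative numbers): both
 * ranges are half-open, so a task covering [0, 1 MiB) conflicts with a
 * request for [512 KiB, 1.5 MiB) (512 KiB + 1 MiB > 0 and 512 KiB < 1 MiB),
 * but not with a request for [1 MiB, 2 MiB), whose start only touches the
 * task's end.
 */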
/*
 * If there are no intersecting tasks, return false. Otherwise, wait for the
 * first found intersecting task to finish and return true.
 */
static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
                                             int64_t bytes)
{
    BlockCopyTask *task = find_conflicting_task(s, offset, bytes);

    if (!task) {
        return false;
    }

    qemu_co_queue_wait(&task->wait_queue, NULL);

    return true;
}
/* Called only on a full-dirty region */
static BlockCopyTask *block_copy_task_create(BlockCopyState *s,
                                             int64_t offset, int64_t bytes)
{
    BlockCopyTask *task = g_new(BlockCopyTask, 1);

    assert(!find_conflicting_task(s, offset, bytes));

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    s->in_flight_bytes += bytes;

    task->offset = offset;
    task->bytes = bytes;
    qemu_co_queue_init(&task->wait_queue);
    QLIST_INSERT_HEAD(&s->tasks, task, list);

    return task;
}
/*
 * block_copy_task_shrink
 *
 * Drop the tail of the task to be handled later. Set dirty bits back and
 * wake up all tasks waiting for us (maybe some of them no longer intersect
 * the shrunk task).
 */
static void coroutine_fn block_copy_task_shrink(BlockCopyState *s,
                                                BlockCopyTask *task,
                                                int64_t new_bytes)
{
    if (new_bytes == task->bytes) {
        return;
    }

    assert(new_bytes > 0 && new_bytes < task->bytes);

    s->in_flight_bytes -= task->bytes - new_bytes;
    bdrv_set_dirty_bitmap(s->copy_bitmap,
                          task->offset + new_bytes, task->bytes - new_bytes);

    task->bytes = new_bytes;
    qemu_co_queue_restart_all(&task->wait_queue);
}
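/*
 * Example (illustrative): if a task was created for [0, 1 MiB) but block
 * status shows only the first 256 KiB should be copied now, shrinking with
 * new_bytes = 256 KiB re-dirties [256 KiB, 1 MiB) and wakes all waiters,
 * since a waiter blocked on the dropped tail no longer conflicts.
 */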
static void coroutine_fn block_copy_task_end(BlockCopyState *s,
                                             BlockCopyTask *task, int ret)
{
    s->in_flight_bytes -= task->bytes;
    if (ret < 0) {
        bdrv_set_dirty_bitmap(s->copy_bitmap, task->offset, task->bytes);
    }
    QLIST_REMOVE(task, list);
    qemu_co_queue_restart_all(&task->wait_queue);
}
void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}
static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}
BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size,
                                     BdrvRequestFlags write_flags, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
    };

    if (block_copy_max_transfer(source, target) < cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall
         * back to buffered copying (read and write respect max_transfer on
         * their own).
         */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else {
        /*
         * We enable copy-range, but keep small copy_size, until the first
         * successful copy_range (look at block_copy_do_copy).
         */
        s->use_copy_range = true;
        s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
    }

    QLIST_INIT(&s->tasks);

    return s;
}
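/*
 * Minimal usage sketch (not from the original file; names are hypothetical).
 * Assumes the caller already owns "source" and "target" BdrvChild objects
 * with suitable permissions, as required by the BlockCopyState comment above.
 */
#if 0
static int coroutine_fn example_copy_all(BdrvChild *source, BdrvChild *target,
                                         int64_t cluster_size, Error **errp)
{
    BlockCopyState *s;
    bool error_is_read;
    int ret;

    s = block_copy_state_new(source, target, cluster_size, 0, errp);
    if (!s) {
        return -EINVAL;
    }

    /* bytes must be cluster-aligned; the last cluster may exceed s->len */
    ret = block_copy(s, 0,
                     QEMU_ALIGN_UP(bdrv_dirty_bitmap_size(
                                       block_copy_dirty_bitmap(s)),
                                   cluster_size),
                     &error_is_read);

    block_copy_state_free(s);
    return ret;
}
#endif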
void block_copy_set_progress_callback(
        BlockCopyState *s,
        ProgressBytesCallbackFunc progress_bytes_callback,
        void *progress_opaque)
{
    s->progress_bytes_callback = progress_bytes_callback;
    s->progress_opaque = progress_opaque;
}
void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}
/*
 * block_copy_do_copy
 *
 * Do copy of cluster-aligned chunk. Requested region is allowed to exceed
 * s->len only to cover last cluster when s->len is not aligned to clusters.
 *
 * No synchronization here: neither bitmap nor intersecting request handling,
 * only the copy itself.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t offset, int64_t bytes,
                                           bool zeroes, bool *error_is_read)
{
    int ret;
    int64_t nbytes = MIN(offset + bytes, s->len) - offset;
    void *bounce_buffer = NULL;

    assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
    assert(offset < s->len);
    assert(offset + bytes <= s->len ||
           offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
    assert(nbytes < INT_MAX);

    if (zeroes) {
        ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, offset, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
        }
        return ret;
    }

    if (s->use_copy_range) {
        ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, offset, ret);
            s->use_copy_range = false;
            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
            /* Fall back to read+write with allocated buffer */
        } else {
            if (s->use_copy_range) {
                /*
                 * Successful copy-range. Now increase copy_size. copy_range
                 * does not respect max_transfer (it's a TODO), so we factor
                 * that in here.
                 *
                 * Note: we double-check s->use_copy_range for the case when a
                 * parallel block-copy request unsets it during the previous
                 * bdrv_co_copy_range call.
                 */
                s->copy_size =
                        MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                            QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
                                                                    s->target),
                                            s->cluster_size));
            }
            goto out;
        }
    }

    /*
     * In case of failed copy_range request above, we may proceed with a
     * buffered request larger than BLOCK_COPY_MAX_BUFFER. Still, further
     * requests will be properly limited, so don't care too much. Moreover,
     * the most likely case (copy_range is unsupported for the configuration,
     * so the very first copy_range request fails) is handled by setting the
     * large copy_size only after the first successful copy_range.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = true;
        }
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = false;
        }
        goto out;
    }

out:
    qemu_vfree(bounce_buffer);

    return ret;
}
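/*
 * Worked example for the copy_size growth above (illustrative numbers):
 * with cluster_size = 64 KiB and block_copy_max_transfer() = 3 MiB, the
 * first successful copy_range sets copy_size = MIN(MAX(64 KiB, 16 MiB),
 * ALIGN_DOWN(3 MiB, 64 KiB)) = 3 MiB. If neither driver reports a
 * max_transfer limit (so the helper returns INT_MAX), copy_size settles at
 * BLOCK_COPY_MAX_COPY_RANGE = 16 MiB.
 */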
static int block_copy_block_status(BlockCopyState *s, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (s->skip_unallocated && s->source->bs->backing) {
        base = s->source->bs->backing->bs;
    } else {
        base = NULL;
    }

    ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
                                  NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error, or if we failed to obtain a large enough chunk, just
         * fall back to copying one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}
/*
 * Check if the cluster starting at offset is allocated or not.
 * Return via pnum the number of contiguous clusters sharing this allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}
/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and -ret on error.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
    }

    *count = bytes;
    return ret;
}
/*
 * block_copy_dirty_clusters
 *
 * Copy dirty clusters in @offset/@bytes range.
 * Returns 1 if dirty clusters found and successfully copied, 0 if no dirty
 * clusters found and -errno on failure.
 */
static int coroutine_fn block_copy_dirty_clusters(BlockCopyState *s,
                                                  int64_t offset, int64_t bytes,
                                                  bool *error_is_read)
{
    int ret = 0;
    bool found_dirty = false;

    /*
     * block_copy() user is responsible for keeping source and target in same
     * aio context
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

    while (bytes) {
        g_autofree BlockCopyTask *task = NULL;
        int64_t next_zero, cur_bytes, status_bytes;

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, offset)) {
            trace_block_copy_skip(s, offset);
            offset += s->cluster_size;
            bytes -= s->cluster_size;
            continue; /* already copied */
        }

        found_dirty = true;

        cur_bytes = MIN(bytes, s->copy_size);

        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, offset,
                                                cur_bytes);
        if (next_zero >= 0) {
            assert(next_zero > offset); /* offset is dirty */
            assert(next_zero < offset + cur_bytes); /* no need to do MIN() */
            cur_bytes = next_zero - offset;
        }
        task = block_copy_task_create(s, offset, cur_bytes);

        ret = block_copy_block_status(s, offset, cur_bytes, &status_bytes);
        assert(ret >= 0); /* never fail */
        cur_bytes = MIN(cur_bytes, status_bytes);
        block_copy_task_shrink(s, task, cur_bytes);
        if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
            block_copy_task_end(s, task, 0);
            progress_set_remaining(s->progress,
                                   bdrv_get_dirty_count(s->copy_bitmap) +
                                   s->in_flight_bytes);
            trace_block_copy_skip_range(s, offset, status_bytes);
            offset += status_bytes;
            bytes -= status_bytes;
            continue;
        }

        trace_block_copy_process(s, offset);

        co_get_from_shres(s->mem, cur_bytes);
        ret = block_copy_do_copy(s, offset, cur_bytes, ret & BDRV_BLOCK_ZERO,
                                 error_is_read);
        co_put_to_shres(s->mem, cur_bytes);
        block_copy_task_end(s, task, ret);
        if (ret < 0) {
            return ret;
        }

        progress_work_done(s->progress, cur_bytes);
        s->progress_bytes_callback(cur_bytes, s->progress_opaque);
        offset += cur_bytes;
        bytes -= cur_bytes;
    }

    return found_dirty;
}
/*
 * block_copy
 *
 * Copy requested region, according to the dirty bitmap.
 * Collaborate with parallel block_copy requests: if they succeed, they will
 * help us. If they fail, we will retry not-copied regions. So, if we return
 * an error, it means that some I/O operation failed in the context of _this_
 * block_copy call, not of some parallel operation.
 */
int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
                            bool *error_is_read)
{
    int ret;

    do {
        ret = block_copy_dirty_clusters(s, offset, bytes, error_is_read);

        if (ret == 0) {
            ret = block_copy_wait_one(s, offset, bytes);
        }

        /*
         * We retry in two cases:
         * 1. Some progress done
         *    Something was copied, which means that there were yield points
         *    and some new dirty bits may have appeared (due to failed parallel
         *    block-copy requests).
         * 2. We have waited for some intersecting block-copy request
         *    It may have failed and produced new dirty bits.
         */
    } while (ret > 0);

    return ret;
}
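/*
 * Illustrative sketch (hypothetical names): the collaboration contract
 * above allows several coroutines to request overlapping ranges. The dirty
 * bitmap arbitrates: whoever copies a cluster first clears its bit, others
 * skip it or wait on the conflicting task, and a failed request simply
 * re-dirties its range for someone else to retry.
 */
#if 0
typedef struct ExampleWork {
    BlockCopyState *s;
    int64_t offset, bytes; /* both cluster-aligned */
} ExampleWork;

static void coroutine_fn example_worker(void *opaque)
{
    ExampleWork *w = opaque;
    bool error_is_read;

    /* Safe to run concurrently with other block_copy() calls on w->s. */
    block_copy(w->s, w->offset, w->bytes, &error_is_read);
}
#endif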
BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
{
    return s->copy_bitmap;
}
void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
{
    s->skip_unallocated = skip;
}