/*
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)

typedef struct BlockCopyInFlightReq {
    int64_t offset;
    int64_t bytes;
    QLIST_ENTRY(BlockCopyInFlightReq) list;
    CoQueue wait_queue; /* coroutines blocked on this request */
} BlockCopyInFlightReq;

typedef struct BlockCopyState {
    /*
     * BdrvChild objects are not owned or managed by block-copy. They are
     * provided by the block-copy user, and the user is responsible for
     * appropriate permissions on these children.
     */
    BdrvChild *source;
    BdrvChild *target;
    BdrvDirtyBitmap *copy_bitmap;
    int64_t in_flight_bytes;
    int64_t cluster_size;
    bool use_copy_range;
    int64_t copy_size;
    uint64_t len;
    QLIST_HEAD(, BlockCopyInFlightReq) inflight_reqs;

    BdrvRequestFlags write_flags;

    /*
     * skip_unallocated:
     *
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap. During this process, the bitmap
     * is thus not fully initialized: it may still have bits set for areas that
     * are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() will query the source's allocation status,
     * skip unallocated regions, clear them in the copy_bitmap, and invoke
     * block_copy_reset_unallocated() every time it does.
     */
    bool skip_unallocated;

    ProgressMeter *progress;
    /* progress_bytes_callback: called when some copying progress is done. */
    ProgressBytesCallbackFunc progress_bytes_callback;
    void *progress_opaque;

    SharedResource *mem;
} BlockCopyState;

static BlockCopyInFlightReq *find_conflicting_inflight_req(BlockCopyState *s,
                                                           int64_t offset,
                                                           int64_t bytes)
{
    BlockCopyInFlightReq *req;

    QLIST_FOREACH(req, &s->inflight_reqs, list) {
        if (offset + bytes > req->offset && offset < req->offset + req->bytes) {
            return req;
        }
    }

    return NULL;
}

/*
 * If there are no intersecting requests return false. Otherwise, wait for the
 * first found intersecting request to finish and return true.
 */
static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
                                             int64_t bytes)
{
    BlockCopyInFlightReq *req = find_conflicting_inflight_req(s, offset, bytes);

    if (!req) {
        return false;
    }

    qemu_co_queue_wait(&req->wait_queue, NULL);

    return true;
}

/* Called only on a full-dirty region */
static void block_copy_inflight_req_begin(BlockCopyState *s,
                                          BlockCopyInFlightReq *req,
                                          int64_t offset, int64_t bytes)
{
    assert(!find_conflicting_inflight_req(s, offset, bytes));

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    s->in_flight_bytes += bytes;

    req->offset = offset;
    req->bytes = bytes;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&s->inflight_reqs, req, list);
}

/*
 * block_copy_inflight_req_shrink
 *
 * Drop the tail of the request to be handled later. Set the dirty bits back
 * and wake up all requests waiting for us (some of them may no longer
 * intersect with the shrunk request).
 */
static void coroutine_fn block_copy_inflight_req_shrink(BlockCopyState *s,
        BlockCopyInFlightReq *req, int64_t new_bytes)
{
    if (new_bytes == req->bytes) {
        return;
    }

    assert(new_bytes > 0 && new_bytes < req->bytes);

    s->in_flight_bytes -= req->bytes - new_bytes;
    bdrv_set_dirty_bitmap(s->copy_bitmap,
                          req->offset + new_bytes, req->bytes - new_bytes);

    req->bytes = new_bytes;
    qemu_co_queue_restart_all(&req->wait_queue);
}

static void coroutine_fn block_copy_inflight_req_end(BlockCopyState *s,
                                                     BlockCopyInFlightReq *req,
                                                     int ret)
{
    s->in_flight_bytes -= req->bytes;
    if (ret < 0) {
        /* Re-dirty the region so that a failed copy is retried later. */
        bdrv_set_dirty_bitmap(s->copy_bitmap, req->offset, req->bytes);
    }
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}

BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size,
                                     BdrvRequestFlags write_flags, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
    };

    if (block_copy_max_transfer(source, target) < cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall back
         * to buffered copying (read and write respect max_transfer on their
         * own).
         */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else {
        /*
         * We enable copy-range, but keep a small copy_size until the first
         * successful copy_range (look at block_copy_do_copy).
         */
        s->use_copy_range = true;
        s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
    }

    QLIST_INIT(&s->inflight_reqs);

    return s;
}
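
/*
 * Illustrative sketch (not part of the original file): typical creation and
 * wiring of a BlockCopyState by an API user.  The helper name, the 64 KiB
 * cluster size and the zero write_flags are assumptions made for this example
 * only; real callers (e.g. the backup job) pick these from their own
 * configuration.
 */
G_GNUC_UNUSED
static BlockCopyState *example_block_copy_setup(BdrvChild *source,
                                                BdrvChild *target,
                                                ProgressMeter *pm,
                                                ProgressBytesCallbackFunc cb,
                                                void *cb_opaque, Error **errp)
{
    BlockCopyState *s;

    /* Example cluster size; a real caller derives this from the job/target. */
    s = block_copy_state_new(source, target, 64 * 1024, 0, errp);
    if (!s) {
        return NULL;
    }

    block_copy_set_progress_meter(s, pm);
    block_copy_set_progress_callback(s, cb, cb_opaque);

    return s;
}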

void block_copy_set_progress_callback(
        BlockCopyState *s,
        ProgressBytesCallbackFunc progress_bytes_callback,
        void *progress_opaque)
{
    s->progress_bytes_callback = progress_bytes_callback;
    s->progress_opaque = progress_opaque;
}

void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}

/*
 * block_copy_do_copy
 *
 * Do copy of a cluster-aligned chunk. The requested region is allowed to
 * exceed s->len only to cover the last cluster when s->len is not aligned to
 * clusters.
 *
 * No sync here: neither bitmap nor intersecting-request handling, only copy.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t offset, int64_t bytes,
                                           bool zeroes, bool *error_is_read)
{
    int ret;
    int64_t nbytes = MIN(offset + bytes, s->len) - offset;
    void *bounce_buffer = NULL;

    assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
    assert(offset < s->len);
    assert(offset + bytes <= s->len ||
           offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
    assert(nbytes < INT_MAX);

    if (zeroes) {
        ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, offset, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
        }
        return ret;
    }

    if (s->use_copy_range) {
        ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, offset, ret);
            s->use_copy_range = false;
            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
            /* Fall back to read+write with an allocated buffer */
        } else {
            if (s->use_copy_range) {
                /*
                 * Successful copy-range. Now increase copy_size. copy_range
                 * does not respect max_transfer (it's a TODO), so we factor
                 * that in here.
                 *
                 * Note: we double-check s->use_copy_range for the case when a
                 * parallel block-copy request unsets it during the previous
                 * bdrv_co_copy_range call.
                 */
                s->copy_size =
                        MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                            QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
                                                                    s->target),
                                            s->cluster_size));
            }
            goto out;
        }
    }

    /*
     * In case of a failed copy_range request above, we may proceed with a
     * buffered request larger than BLOCK_COPY_MAX_BUFFER. Still, further
     * requests will be properly limited, so don't care too much. Moreover, the
     * most likely case (copy_range is unsupported for the configuration, so
     * the very first copy_range request fails) is handled by setting the large
     * copy_size only after the first successful copy_range.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = true;
        }
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = false;
        }
        goto out;
    }

out:
    qemu_vfree(bounce_buffer);

    return ret;
}

static int block_copy_block_status(BlockCopyState *s, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (s->skip_unallocated && s->source->bs->backing) {
        base = s->source->bs->backing->bs;
    } else {
        base = NULL;
    }

    ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
                                  NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error, or if we failed to obtain a large enough chunk, just fall
         * back to copying one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}

/*
 * Check if the cluster starting at offset is allocated or not.
 * Return via pnum the number of contiguous clusters sharing this allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}

/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and -ret on error.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
    }

    *count = bytes;
    return ret;
}
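
/*
 * Illustrative sketch (not part of the original file): how a sync=top job
 * might pre-scan the source and drop unallocated areas from the copy_bitmap
 * before any copying starts, as described in the skip_unallocated comment
 * above.  example_prescan_unallocated and the plain loop are assumptions for
 * the example only; the real caller lives in block/backup.c.
 */
G_GNUC_UNUSED
static int64_t example_prescan_unallocated(BlockCopyState *s, int64_t len)
{
    int64_t offset, count;

    block_copy_set_skip_unallocated(s, true);

    for (offset = 0; offset < len; offset += count) {
        int64_t ret = block_copy_reset_unallocated(s, offset, &count);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}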

/*
 * block_copy_dirty_clusters
 *
 * Copy dirty clusters in the @offset/@bytes range.
 * Returns 1 if dirty clusters were found and successfully copied, 0 if no
 * dirty clusters were found, and -errno on failure.
 */
static int coroutine_fn block_copy_dirty_clusters(BlockCopyState *s,
                                                  int64_t offset, int64_t bytes,
                                                  bool *error_is_read)
{
    int ret = 0;
    bool found_dirty = false;

    /*
     * The block_copy() user is responsible for keeping source and target in
     * the same AioContext.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

    while (bytes) {
        BlockCopyInFlightReq req;
        int64_t next_zero, cur_bytes, status_bytes;

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, offset)) {
            trace_block_copy_skip(s, offset);
            offset += s->cluster_size;
            bytes -= s->cluster_size;
            continue; /* already copied */
        }

        found_dirty = true;

        cur_bytes = MIN(bytes, s->copy_size);

        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, offset,
                                                cur_bytes);
        if (next_zero >= 0) {
            assert(next_zero > offset); /* offset is dirty */
            assert(next_zero < offset + cur_bytes); /* no need to do MIN() */
            cur_bytes = next_zero - offset;
        }
        block_copy_inflight_req_begin(s, &req, offset, cur_bytes);

        ret = block_copy_block_status(s, offset, cur_bytes, &status_bytes);
        assert(ret >= 0); /* never fail */
        cur_bytes = MIN(cur_bytes, status_bytes);
        block_copy_inflight_req_shrink(s, &req, cur_bytes);
        if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
            block_copy_inflight_req_end(s, &req, 0);
            progress_set_remaining(s->progress,
                                   bdrv_get_dirty_count(s->copy_bitmap) +
                                   s->in_flight_bytes);
            trace_block_copy_skip_range(s, offset, status_bytes);
            offset += status_bytes;
            bytes -= status_bytes;
            continue;
        }

        trace_block_copy_process(s, offset);

        co_get_from_shres(s->mem, cur_bytes);
        ret = block_copy_do_copy(s, offset, cur_bytes, ret & BDRV_BLOCK_ZERO,
                                 error_is_read);
        co_put_to_shres(s->mem, cur_bytes);
        block_copy_inflight_req_end(s, &req, ret);
        if (ret < 0) {
            return ret;
        }

        progress_work_done(s->progress, cur_bytes);
        s->progress_bytes_callback(cur_bytes, s->progress_opaque);
        offset += cur_bytes;
        bytes -= cur_bytes;
    }

    return found_dirty;
}

/*
 * block_copy
 *
 * Copy the requested region according to the dirty bitmap.
 * Collaborate with parallel block_copy requests: if they succeed, it will help
 * us. If they fail, we will retry not-yet-copied regions. So, if we return an
 * error, it means that some I/O operation failed in the context of _this_
 * block_copy call, not of some parallel operation.
 */
int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
                            bool *error_is_read)
{
    int ret;

    do {
        ret = block_copy_dirty_clusters(s, offset, bytes, error_is_read);

        if (ret == 0) {
            ret = block_copy_wait_one(s, offset, bytes);
        }

        /*
         * We retry in two cases:
         * 1. Some progress was done
         *    Something was copied, which means that there were yield points
         *    and some new dirty bits may have appeared (due to failed parallel
         *    block-copy requests).
         * 2. We have waited for some intersecting block-copy request
         *    It may have failed and produced new dirty bits.
         */
    } while (ret > 0);

    return ret;
}
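
/*
 * Illustrative sketch (not part of the original file): how a coroutine in a
 * job might drive block_copy() over one cluster-aligned chunk and use the
 * error_is_read hint.  example_copy_chunk and its error-handling comments are
 * assumptions for the example only; the real caller is block/backup.c.
 */
G_GNUC_UNUSED
static int coroutine_fn example_copy_chunk(BlockCopyState *s,
                                           int64_t offset, int64_t bytes)
{
    bool error_is_read = false;
    int ret;

    /* offset/bytes must be aligned to the cluster size given at creation. */
    ret = block_copy(s, offset, bytes, &error_is_read);
    if (ret < 0) {
        /*
         * error_is_read tells whether the failing I/O was on the source
         * (read) or on the target (write); a job would feed this into its
         * on-source-error / on-target-error policy.
         */
        return ret;
    }

    return 0;
}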

BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
{
    return s->copy_bitmap;
}

void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
{
    s->skip_unallocated = skip;
}