block/block-copy: alloc task on each iteration
/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)
typedef struct BlockCopyTask {
    int64_t offset;
    int64_t bytes;
    QLIST_ENTRY(BlockCopyTask) list;
    CoQueue wait_queue; /* coroutines blocked on this task */
} BlockCopyTask;
typedef struct BlockCopyState {
    /*
     * BdrvChild objects are not owned or managed by block-copy. They are
     * provided by the block-copy user, who is responsible for appropriate
     * permissions on these children.
     */
    BdrvChild *source;
    BdrvChild *target;
    BdrvDirtyBitmap *copy_bitmap;
    int64_t in_flight_bytes;
    int64_t cluster_size;
    bool use_copy_range;
    int64_t copy_size;
    uint64_t len;
    QLIST_HEAD(, BlockCopyTask) tasks;

    BdrvRequestFlags write_flags;

    /*
     * skip_unallocated:
     *
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap. During this process, the bitmap
     * is thus not fully initialized: It may still have bits set for areas that
     * are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() will query the source's allocation status,
     * skip unallocated regions, clear them in the copy_bitmap, and invoke
     * block_copy_reset_unallocated() every time it does.
     */
    bool skip_unallocated;

    ProgressMeter *progress;
    /* progress_bytes_callback: called when some copying progress is done. */
    ProgressBytesCallbackFunc progress_bytes_callback;
    void *progress_opaque;

    SharedResource *mem;
} BlockCopyState;
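/*
 * A minimal sketch of the sync=top pattern described above, assuming a
 * hypothetical caller that owns a BlockCopyState covering @len bytes; the
 * function name is illustrative, not part of the block-copy API.
 */
#if 0
static int example_prescan_top(BlockCopyState *s, int64_t len)
{
    int64_t offset = 0, count;

    block_copy_set_skip_unallocated(s, true);

    while (offset < len) {
        /* Clears copy_bitmap bits for unallocated clusters at @offset */
        int64_t ret = block_copy_reset_unallocated(s, offset, &count);
        if (ret < 0) {
            return ret;
        }
        offset += count;
    }

    return 0;
}
#endif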
static BlockCopyTask *find_conflicting_task(BlockCopyState *s,
                                            int64_t offset, int64_t bytes)
{
    BlockCopyTask *t;

    QLIST_FOREACH(t, &s->tasks, list) {
        if (offset + bytes > t->offset && offset < t->offset + t->bytes) {
            return t;
        }
    }

    return NULL;
}
/*
 * If there are no intersecting tasks return false. Otherwise, wait for the
 * first found intersecting task to finish and return true.
 */
static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
                                             int64_t bytes)
{
    BlockCopyTask *task = find_conflicting_task(s, offset, bytes);

    if (!task) {
        return false;
    }

    qemu_co_queue_wait(&task->wait_queue, NULL);

    return true;
}
/* Called only on full-dirty region */
static BlockCopyTask *block_copy_task_create(BlockCopyState *s,
                                             int64_t offset, int64_t bytes)
{
    BlockCopyTask *task = g_new(BlockCopyTask, 1);

    assert(!find_conflicting_task(s, offset, bytes));

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    s->in_flight_bytes += bytes;

    task->offset = offset;
    task->bytes = bytes;
    qemu_co_queue_init(&task->wait_queue);
    QLIST_INSERT_HEAD(&s->tasks, task, list);

    return task;
}
/*
 * block_copy_task_shrink
 *
 * Drop the tail of the task to be handled later. Set dirty bits back and
 * wake up all tasks waiting for us (maybe some of them no longer intersect
 * with the shrunk task).
 */
static void coroutine_fn block_copy_task_shrink(BlockCopyState *s,
                                                BlockCopyTask *task,
                                                int64_t new_bytes)
{
    if (new_bytes == task->bytes) {
        return;
    }

    assert(new_bytes > 0 && new_bytes < task->bytes);

    s->in_flight_bytes -= task->bytes - new_bytes;
    bdrv_set_dirty_bitmap(s->copy_bitmap,
                          task->offset + new_bytes, task->bytes - new_bytes);

    task->bytes = new_bytes;
    qemu_co_queue_restart_all(&task->wait_queue);
}
static void coroutine_fn block_copy_task_end(BlockCopyState *s,
                                             BlockCopyTask *task, int ret)
{
    s->in_flight_bytes -= task->bytes;
    if (ret < 0) {
        bdrv_set_dirty_bitmap(s->copy_bitmap, task->offset, task->bytes);
    }
    QLIST_REMOVE(task, list);
    qemu_co_queue_restart_all(&task->wait_queue);
}
void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}
static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}
BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size,
                                     BdrvRequestFlags write_flags, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
    };

    if (block_copy_max_transfer(source, target) < cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than block-copy cluster size, so fallback to
         * buffered copying (read and write respect max_transfer on their
         * behalf).
         */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else {
        /*
         * We enable copy-range, but keep small copy_size, until first
         * successful copy_range (look at block_copy_do_copy).
         */
        s->use_copy_range = true;
        s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
    }

    QLIST_INIT(&s->tasks);

    return s;
}
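/*
 * A minimal setup sketch, assuming the caller already owns source/target
 * BdrvChild objects with suitable permissions; the helper name, the 64 KiB
 * cluster size and the zero write_flags are hypothetical choices.
 */
#if 0
static BlockCopyState *example_state_new(BdrvChild *source, BdrvChild *target,
                                         ProgressMeter *pm, Error **errp)
{
    BlockCopyState *s = block_copy_state_new(source, target, 64 * KiB,
                                             0 /* write_flags */, errp);

    if (!s) {
        return NULL;
    }

    block_copy_set_progress_meter(s, pm);

    return s;
}
#endif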
void block_copy_set_progress_callback(
        BlockCopyState *s,
        ProgressBytesCallbackFunc progress_bytes_callback,
        void *progress_opaque)
{
    s->progress_bytes_callback = progress_bytes_callback;
    s->progress_opaque = progress_opaque;
}
void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}
/*
 * block_copy_do_copy
 *
 * Do copy of cluster-aligned chunk. Requested region is allowed to exceed
 * s->len only to cover last cluster when s->len is not aligned to clusters.
 *
 * No sync here: neither bitmap nor intersecting-request handling, only copy.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t offset, int64_t bytes,
                                           bool zeroes, bool *error_is_read)
{
    int ret;
    int64_t nbytes = MIN(offset + bytes, s->len) - offset;
    void *bounce_buffer = NULL;

    assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
    assert(offset < s->len);
    assert(offset + bytes <= s->len ||
           offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
    assert(nbytes < INT_MAX);

    if (zeroes) {
        ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, offset, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
        }
        return ret;
    }

    if (s->use_copy_range) {
        ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, offset, ret);
            s->use_copy_range = false;
            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
            /* Fall back to read+write with allocated buffer */
        } else {
            if (s->use_copy_range) {
                /*
                 * Successful copy-range. Now increase copy_size. copy_range
                 * does not respect max_transfer (it's a TODO), so we factor
                 * that in here.
                 *
                 * Note: we double-check s->use_copy_range for the case when
                 * parallel block-copy request unsets it during previous
                 * bdrv_co_copy_range call.
                 */
                s->copy_size =
                        MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                            QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
                                                                    s->target),
                                            s->cluster_size));
            }
            goto out;
        }
    }

    /*
     * In case of failed copy_range request above, we may proceed with buffered
     * request larger than BLOCK_COPY_MAX_BUFFER. Still, further requests will
     * be properly limited, so don't care too much. Moreover the most likely
     * case (copy_range is unsupported for the configuration, so the very first
     * copy_range request fails) is handled by setting large copy_size only
     * after first successful copy_range.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = true;
        }
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = false;
        }
        goto out;
    }

out:
    qemu_vfree(bounce_buffer);

    return ret;
}
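/*
 * Worked example of the copy_size adjustment above (the numbers are
 * hypothetical): with cluster_size = 64 KiB and block_copy_max_transfer() =
 * 4 MiB, a successful copy_range yields
 *   copy_size = MIN(MAX(64 KiB, 16 MiB), ALIGN_DOWN(4 MiB, 64 KiB))
 *             = MIN(16 MiB, 4 MiB) = 4 MiB,
 * while a failed copy_range falls back to MAX(64 KiB, 1 MiB) = 1 MiB
 * buffered requests.
 */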
static int block_copy_block_status(BlockCopyState *s, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (s->skip_unallocated && s->source->bs->backing) {
        base = s->source->bs->backing->bs;
    } else {
        base = NULL;
    }

    ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
                                  NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error, or if we failed to obtain a large enough chunk, just fall
         * back to copying one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}
/*
 * Check if the cluster starting at offset is allocated or not.
 * Return via pnum the number of contiguous clusters sharing this allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}
/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and -ret on error.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
    }

    *count = bytes;
    return ret;
}
/*
 * block_copy_dirty_clusters
 *
 * Copy dirty clusters in @offset/@bytes range.
 * Returns 1 if dirty clusters were found and successfully copied, 0 if no
 * dirty clusters were found, and -errno on failure.
 */
static int coroutine_fn block_copy_dirty_clusters(BlockCopyState *s,
                                                  int64_t offset, int64_t bytes,
                                                  bool *error_is_read)
{
    int ret = 0;
    bool found_dirty = false;

    /*
     * The block_copy() user is responsible for keeping source and target in
     * the same aio context.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

    while (bytes) {
        g_autofree BlockCopyTask *task = NULL;
        int64_t next_zero, cur_bytes, status_bytes;

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, offset)) {
            trace_block_copy_skip(s, offset);
            offset += s->cluster_size;
            bytes -= s->cluster_size;
            continue; /* already copied */
        }

        found_dirty = true;

        cur_bytes = MIN(bytes, s->copy_size);

        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, offset,
                                                cur_bytes);
        if (next_zero >= 0) {
            assert(next_zero > offset); /* offset is dirty */
            assert(next_zero < offset + cur_bytes); /* no need to do MIN() */
            cur_bytes = next_zero - offset;
        }
        task = block_copy_task_create(s, offset, cur_bytes);

        ret = block_copy_block_status(s, offset, cur_bytes, &status_bytes);
        assert(ret >= 0); /* never fail */
        cur_bytes = MIN(cur_bytes, status_bytes);
        block_copy_task_shrink(s, task, cur_bytes);
        if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
            block_copy_task_end(s, task, 0);
            progress_set_remaining(s->progress,
                                   bdrv_get_dirty_count(s->copy_bitmap) +
                                   s->in_flight_bytes);
            trace_block_copy_skip_range(s, offset, status_bytes);
            offset += status_bytes;
            bytes -= status_bytes;
            continue;
        }

        trace_block_copy_process(s, offset);

        co_get_from_shres(s->mem, cur_bytes);
        ret = block_copy_do_copy(s, offset, cur_bytes, ret & BDRV_BLOCK_ZERO,
                                 error_is_read);
        co_put_to_shres(s->mem, cur_bytes);
        block_copy_task_end(s, task, ret);
        if (ret < 0) {
            return ret;
        }

        progress_work_done(s->progress, cur_bytes);
        s->progress_bytes_callback(cur_bytes, s->progress_opaque);
        offset += cur_bytes;
        bytes -= cur_bytes;
    }

    return found_dirty;
}
/*
 * block_copy
 *
 * Copy the requested region according to the dirty bitmap.
 * Collaborate with parallel block_copy requests: if they succeed, that helps
 * us. If they fail, we will retry not-copied regions. So, if we return an
 * error, it means that some I/O operation failed in the context of _this_
 * block_copy call, not some parallel operation.
 */
int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
                            bool *error_is_read)
{
    int ret;

    do {
        ret = block_copy_dirty_clusters(s, offset, bytes, error_is_read);

        if (ret == 0) {
            ret = block_copy_wait_one(s, offset, bytes);
        }

        /*
         * We retry in two cases:
         * 1. Some progress done
         *    Something was copied, which means that there were yield points
         *    and some new dirty bits may have appeared (due to failed parallel
         *    block-copy requests).
         * 2. We have waited for some intersecting block-copy request
         *    It may have failed and produced new dirty bits.
         */
    } while (ret > 0);

    return ret;
}
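/*
 * A minimal caller sketch, assuming cluster-aligned @offset/@bytes and a
 * coroutine context; the function name is illustrative only.
 */
#if 0
static int coroutine_fn example_copy_region(BlockCopyState *s,
                                            int64_t offset, int64_t bytes)
{
    bool error_is_read = false;
    int ret = block_copy(s, offset, bytes, &error_is_read);

    if (ret < 0) {
        /* error_is_read tells whether the failing I/O was on the read side */
        return ret;
    }

    return 0;
}
#endif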
BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
{
    return s->copy_bitmap;
}
void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
{
    s->skip_unallocated = skip;
}