block/block-copy.c

/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)

typedef struct BlockCopyInFlightReq {
    int64_t offset;
    int64_t bytes;
    QLIST_ENTRY(BlockCopyInFlightReq) list;
    CoQueue wait_queue; /* coroutines blocked on this request */
} BlockCopyInFlightReq;

typedef struct BlockCopyState {
    /*
     * BdrvChild objects are not owned or managed by block-copy. They are
     * provided by the block-copy user, and the user is responsible for
     * appropriate permissions on these children.
     */
    BdrvChild *source;
    BdrvChild *target;
    BdrvDirtyBitmap *copy_bitmap;
    int64_t in_flight_bytes;
    int64_t cluster_size;
    bool use_copy_range;
    int64_t copy_size;
    uint64_t len;
    QLIST_HEAD(, BlockCopyInFlightReq) inflight_reqs;

    BdrvRequestFlags write_flags;

    /*
     * skip_unallocated:
     *
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap. During this process, the bitmap
     * is thus not fully initialized: it may still have bits set for areas that
     * are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() will query the source's allocation status,
     * skip unallocated regions, clear them in the copy_bitmap, and invoke
     * block_copy_reset_unallocated() every time it does.
     */
    bool skip_unallocated;

    ProgressMeter *progress;
    /* progress_bytes_callback: called when some copying progress is done. */
    ProgressBytesCallbackFunc progress_bytes_callback;
    void *progress_opaque;

    SharedResource *mem;
} BlockCopyState;
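
/*
 * Note on progress accounting: bits are cleared from copy_bitmap as soon as a
 * request starts (see block_copy_inflight_req_begin() below), so the remaining
 * work reported to the progress meter is bdrv_get_dirty_count(copy_bitmap)
 * plus in_flight_bytes.
 */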

static BlockCopyInFlightReq *find_conflicting_inflight_req(BlockCopyState *s,
                                                           int64_t offset,
                                                           int64_t bytes)
{
    BlockCopyInFlightReq *req;

    QLIST_FOREACH(req, &s->inflight_reqs, list) {
        if (offset + bytes > req->offset && offset < req->offset + req->bytes) {
            return req;
        }
    }

    return NULL;
}
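
/*
 * Note: find_conflicting_inflight_req() treats requests as half-open intervals
 * [offset, offset + bytes): for example, [0, 64k) conflicts with [32k, 96k),
 * but not with a request starting exactly at 64k.
 */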

/*
 * If there are no intersecting requests return false. Otherwise, wait for the
 * first found intersecting request to finish and return true.
 */
static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
                                             int64_t bytes)
{
    BlockCopyInFlightReq *req = find_conflicting_inflight_req(s, offset, bytes);

    if (!req) {
        return false;
    }

    qemu_co_queue_wait(&req->wait_queue, NULL);

    return true;
}

/* Called only on full-dirty region */
static void block_copy_inflight_req_begin(BlockCopyState *s,
                                          BlockCopyInFlightReq *req,
                                          int64_t offset, int64_t bytes)
{
    assert(!find_conflicting_inflight_req(s, offset, bytes));

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    s->in_flight_bytes += bytes;

    req->offset = offset;
    req->bytes = bytes;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&s->inflight_reqs, req, list);
}

/*
 * block_copy_inflight_req_shrink
 *
 * Drop the tail of the request to be handled later. Set the dirty bits back
 * and wake up all requests waiting for us (maybe some of them do not intersect
 * with the shrunk request).
 */
static void coroutine_fn block_copy_inflight_req_shrink(BlockCopyState *s,
        BlockCopyInFlightReq *req, int64_t new_bytes)
{
    if (new_bytes == req->bytes) {
        return;
    }

    assert(new_bytes > 0 && new_bytes < req->bytes);

    s->in_flight_bytes -= req->bytes - new_bytes;
    bdrv_set_dirty_bitmap(s->copy_bitmap,
                          req->offset + new_bytes, req->bytes - new_bytes);

    req->bytes = new_bytes;
    qemu_co_queue_restart_all(&req->wait_queue);
}

static void coroutine_fn block_copy_inflight_req_end(BlockCopyState *s,
                                                     BlockCopyInFlightReq *req,
                                                     int ret)
{
    s->in_flight_bytes -= req->bytes;
    if (ret < 0) {
        bdrv_set_dirty_bitmap(s->copy_bitmap, req->offset, req->bytes);
    }
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}
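
/*
 * Request lifecycle, as implemented above: _begin() clears the corresponding
 * bits in copy_bitmap and registers the request in inflight_reqs; _shrink()
 * gives the tail back (re-dirtying it) and wakes waiters; _end() removes the
 * request, re-dirties the whole range on failure, and wakes waiters.
 */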

void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}

BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size,
                                     BdrvRequestFlags write_flags, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
    };

    if (block_copy_max_transfer(source, target) < cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall back
         * to buffered copying (read and write respect max_transfer on their
         * behalf).
         */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else {
        /*
         * We enable copy-range, but keep a small copy_size until the first
         * successful copy_range (look at block_copy_do_copy).
         */
        s->use_copy_range = true;
        s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
    }

    QLIST_INIT(&s->inflight_reqs);

    return s;
}
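
/*
 * Typical setup by a caller, as a rough sketch (illustrative only; @job and
 * my_progress_cb are placeholders):
 *
 *     s = block_copy_state_new(source, target, cluster_size, write_flags,
 *                              errp);
 *     if (!s) {
 *         return NULL;
 *     }
 *     block_copy_set_progress_meter(s, &job->progress);
 *     block_copy_set_progress_callback(s, my_progress_cb, job);
 */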

void block_copy_set_progress_callback(
        BlockCopyState *s,
        ProgressBytesCallbackFunc progress_bytes_callback,
        void *progress_opaque)
{
    s->progress_bytes_callback = progress_bytes_callback;
    s->progress_opaque = progress_opaque;
}

void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}

/*
 * block_copy_do_copy
 *
 * Copy a cluster-aligned chunk. The requested region is allowed to exceed
 * s->len only to cover the last cluster when s->len is not aligned to
 * clusters.
 *
 * No synchronization here: neither bitmap nor intersecting-request handling,
 * just the copy itself.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t offset, int64_t bytes,
                                           bool zeroes, bool *error_is_read)
{
    int ret;
    int64_t nbytes = MIN(offset + bytes, s->len) - offset;
    void *bounce_buffer = NULL;

    assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
    assert(offset < s->len);
    assert(offset + bytes <= s->len ||
           offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
    assert(nbytes < INT_MAX);

    if (zeroes) {
        ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, offset, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
        }
        return ret;
    }

    if (s->use_copy_range) {
        ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, offset, ret);
            s->use_copy_range = false;
            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
            /* Fall back to read+write with an allocated buffer */
        } else {
            if (s->use_copy_range) {
                /*
                 * Successful copy-range. Now increase copy_size. copy_range
                 * does not respect max_transfer (it's a TODO), so we factor
                 * that in here.
                 *
                 * Note: we double-check s->use_copy_range for the case when a
                 * parallel block-copy request unsets it during the previous
                 * bdrv_co_copy_range call.
                 */
                s->copy_size =
                        MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                            QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
                                                                    s->target),
                                            s->cluster_size));
            }
            goto out;
        }
    }

    /*
     * In case of a failed copy_range request above, we may proceed with a
     * buffered request larger than BLOCK_COPY_MAX_BUFFER. Still, further
     * requests will be properly limited, so don't care too much. Moreover, the
     * most likely case (copy_range is unsupported for the configuration, so
     * the very first copy_range request fails) is handled by setting the large
     * copy_size only after the first successful copy_range.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = true;
        }
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = false;
        }
        goto out;
    }

out:
    qemu_vfree(bounce_buffer);

    return ret;
}
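
/*
 * Worked example of the copy_size growth above (numbers are illustrative):
 * with a 64 KiB cluster size and both drivers reporting a 2 MiB max_transfer,
 * copy_size starts at MAX(64 KiB, BLOCK_COPY_MAX_BUFFER) = 1 MiB and, after
 * the first successful copy_range, becomes
 * MIN(MAX(64 KiB, BLOCK_COPY_MAX_COPY_RANGE), ALIGN_DOWN(2 MiB, 64 KiB)) =
 * MIN(16 MiB, 2 MiB) = 2 MiB.
 */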

static int block_copy_block_status(BlockCopyState *s, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (s->skip_unallocated && s->source->bs->backing) {
        base = s->source->bs->backing->bs;
    } else {
        base = NULL;
    }

    ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
                                  NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error, or if we failed to obtain a large enough chunk, just fall
         * back to copying one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}

/*
 * Check if the cluster starting at offset is allocated or not.
 * Return via pnum the number of contiguous clusters sharing this allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}
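
/*
 * Example of the "partial segments are considered allocated" rule above
 * (illustrative): with a 64 KiB cluster size, if bdrv_is_allocated() reports
 * 32 KiB unallocated followed by a 32 KiB allocated run, the loop accumulates
 * 64 KiB, sees ret == 1 and reports the whole cluster as allocated
 * (*pnum == 1, return value 1).
 */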

/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and -ret on error.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
    }

    *count = bytes;
    return ret;
}
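
/*
 * Sketch of how a sync=top style caller might drive this pre-scan before
 * copying (illustrative only; @len is the caller's own disk length):
 *
 *     int64_t offset = 0, count;
 *
 *     block_copy_set_skip_unallocated(s, true);
 *     while (offset < len) {
 *         if (block_copy_reset_unallocated(s, offset, &count) < 0) {
 *             break;
 *         }
 *         offset += count;
 *     }
 */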

/*
 * block_copy_dirty_clusters
 *
 * Copy dirty clusters in @offset/@bytes range.
 * Returns 1 if dirty clusters were found and successfully copied, 0 if no
 * dirty clusters were found, and -errno on failure.
 */
static int coroutine_fn block_copy_dirty_clusters(BlockCopyState *s,
                                                  int64_t offset, int64_t bytes,
                                                  bool *error_is_read)
{
    int ret = 0;
    bool found_dirty = false;

    /*
     * The block_copy() user is responsible for keeping source and target in
     * the same aio context.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

    while (bytes) {
        BlockCopyInFlightReq req;
        int64_t next_zero, cur_bytes, status_bytes;

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, offset)) {
            trace_block_copy_skip(s, offset);
            offset += s->cluster_size;
            bytes -= s->cluster_size;
            continue; /* already copied */
        }

        found_dirty = true;

        cur_bytes = MIN(bytes, s->copy_size);

        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, offset,
                                                cur_bytes);
        if (next_zero >= 0) {
            assert(next_zero > offset); /* offset is dirty */
            assert(next_zero < offset + cur_bytes); /* no need to do MIN() */
            cur_bytes = next_zero - offset;
        }
        block_copy_inflight_req_begin(s, &req, offset, cur_bytes);

        ret = block_copy_block_status(s, offset, cur_bytes, &status_bytes);
        assert(ret >= 0); /* never fail */
        cur_bytes = MIN(cur_bytes, status_bytes);
        block_copy_inflight_req_shrink(s, &req, cur_bytes);
        if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
            block_copy_inflight_req_end(s, &req, 0);
            progress_set_remaining(s->progress,
                                   bdrv_get_dirty_count(s->copy_bitmap) +
                                   s->in_flight_bytes);
            trace_block_copy_skip_range(s, offset, status_bytes);
            offset += status_bytes;
            bytes -= status_bytes;
            continue;
        }

        trace_block_copy_process(s, offset);

        co_get_from_shres(s->mem, cur_bytes);
        ret = block_copy_do_copy(s, offset, cur_bytes, ret & BDRV_BLOCK_ZERO,
                                 error_is_read);
        co_put_to_shres(s->mem, cur_bytes);
        block_copy_inflight_req_end(s, &req, ret);
        if (ret < 0) {
            return ret;
        }

        progress_work_done(s->progress, cur_bytes);
        s->progress_bytes_callback(cur_bytes, s->progress_opaque);
        offset += cur_bytes;
        bytes -= cur_bytes;
    }

    return found_dirty;
}

/*
 * block_copy
 *
 * Copy the requested region according to the dirty bitmap.
 * Collaborate with parallel block_copy requests: if they succeed, it will help
 * us. If they fail, we will retry the not-yet-copied regions. So, if we return
 * an error, it means that some I/O operation failed in the context of _this_
 * block_copy call, not of some parallel operation.
 */
int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
                            bool *error_is_read)
{
    int ret;

    do {
        ret = block_copy_dirty_clusters(s, offset, bytes, error_is_read);

        if (ret == 0) {
            ret = block_copy_wait_one(s, offset, bytes);
        }

        /*
         * We retry in two cases:
         * 1. Some progress was done
         *    Something was copied, which means that there were yield points
         *    and some new dirty bits may have appeared (due to failed parallel
         *    block-copy requests).
         * 2. We have waited for some intersecting block-copy request
         *    It may have failed and produced new dirty bits.
         */
    } while (ret > 0);

    return ret;
}
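
/*
 * A minimal caller sketch (illustrative only): it must run in coroutine
 * context, and @job_len / @cluster_size are the caller's own values:
 *
 *     bool error_is_read;
 *     int ret;
 *
 *     ret = block_copy(s, 0, QEMU_ALIGN_UP(job_len, cluster_size),
 *                      &error_is_read);
 *     if (ret < 0 && !error_is_read) {
 *         ... the failure happened on the target (write) side ...
 *     }
 */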

BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
{
    return s->copy_bitmap;
}

void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
{
    s->skip_unallocated = skip;
}