block/backup: add backup_is_cluster_allocated
[qemu/ar7.git] / block / backup.c
blobf6bf32c94385a79f268cabeddbff8351a8781646
/*
 * QEMU backup
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */
14 #include "qemu/osdep.h"
16 #include "trace.h"
17 #include "block/block.h"
18 #include "block/block_int.h"
19 #include "block/blockjob_int.h"
20 #include "block/block_backup.h"
21 #include "qapi/error.h"
22 #include "qapi/qmp/qerror.h"
23 #include "qemu/ratelimit.h"
24 #include "qemu/cutils.h"
25 #include "sysemu/block-backend.h"
26 #include "qemu/bitmap.h"
27 #include "qemu/error-report.h"
29 #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
31 typedef struct CowRequest {
32 int64_t start_byte;
33 int64_t end_byte;
34 QLIST_ENTRY(CowRequest) list;
35 CoQueue wait_queue; /* coroutines blocked on this request */
36 } CowRequest;
38 typedef struct BackupBlockJob {
39 BlockJob common;
40 BlockBackend *target;
42 BdrvDirtyBitmap *sync_bitmap;
43 BdrvDirtyBitmap *copy_bitmap;
45 MirrorSyncMode sync_mode;
46 BitmapSyncMode bitmap_mode;
47 BlockdevOnError on_source_error;
48 BlockdevOnError on_target_error;
49 CoRwlock flush_rwlock;
50 uint64_t len;
51 uint64_t bytes_read;
52 int64_t cluster_size;
53 bool compress;
54 NotifierWithReturn before_write;
55 QLIST_HEAD(, CowRequest) inflight_reqs;
57 bool use_copy_range;
58 int64_t copy_range_size;
60 bool serialize_target_writes;
61 } BackupBlockJob;
63 static const BlockJobDriver backup_job_driver;
65 /* See if in-flight requests overlap and wait for them to complete */
66 static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
67 int64_t start,
68 int64_t end)
70 CowRequest *req;
71 bool retry;
73 do {
74 retry = false;
75 QLIST_FOREACH(req, &job->inflight_reqs, list) {
76 if (end > req->start_byte && start < req->end_byte) {
77 qemu_co_queue_wait(&req->wait_queue, NULL);
78 retry = true;
79 break;
82 } while (retry);
85 /* Keep track of an in-flight request */
86 static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
87 int64_t start, int64_t end)
89 req->start_byte = start;
90 req->end_byte = end;
91 qemu_co_queue_init(&req->wait_queue);
92 QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
95 /* Forget about a completed request */
96 static void cow_request_end(CowRequest *req)
98 QLIST_REMOVE(req, list);
99 qemu_co_queue_restart_all(&req->wait_queue);
102 /* Copy range to target with a bounce buffer and return the bytes copied. If
103 * error occurred, return a negative error number */
104 static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
105 int64_t start,
106 int64_t end,
107 bool is_write_notifier,
108 bool *error_is_read,
109 void **bounce_buffer)
111 int ret;
112 BlockBackend *blk = job->common.blk;
113 int nbytes;
114 int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
115 int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0;
117 assert(QEMU_IS_ALIGNED(start, job->cluster_size));
118 bdrv_reset_dirty_bitmap(job->copy_bitmap, start, job->cluster_size);
119 nbytes = MIN(job->cluster_size, job->len - start);
120 if (!*bounce_buffer) {
121 *bounce_buffer = blk_blockalign(blk, job->cluster_size);
124 ret = blk_co_pread(blk, start, nbytes, *bounce_buffer, read_flags);
125 if (ret < 0) {
126 trace_backup_do_cow_read_fail(job, start, ret);
127 if (error_is_read) {
128 *error_is_read = true;
130 goto fail;
133 if (buffer_is_zero(*bounce_buffer, nbytes)) {
134 ret = blk_co_pwrite_zeroes(job->target, start,
135 nbytes, write_flags | BDRV_REQ_MAY_UNMAP);
136 } else {
137 ret = blk_co_pwrite(job->target, start,
138 nbytes, *bounce_buffer, write_flags |
139 (job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0));
141 if (ret < 0) {
142 trace_backup_do_cow_write_fail(job, start, ret);
143 if (error_is_read) {
144 *error_is_read = false;
146 goto fail;
149 return nbytes;
150 fail:
151 bdrv_set_dirty_bitmap(job->copy_bitmap, start, job->cluster_size);
152 return ret;
156 /* Copy range to target and return the bytes copied. If error occurred, return a
157 * negative error number. */
158 static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job,
159 int64_t start,
160 int64_t end,
161 bool is_write_notifier)
163 int ret;
164 int nr_clusters;
165 BlockBackend *blk = job->common.blk;
166 int nbytes;
167 int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
168 int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0;
170 assert(QEMU_IS_ALIGNED(job->copy_range_size, job->cluster_size));
171 assert(QEMU_IS_ALIGNED(start, job->cluster_size));
172 nbytes = MIN(job->copy_range_size, end - start);
173 nr_clusters = DIV_ROUND_UP(nbytes, job->cluster_size);
174 bdrv_reset_dirty_bitmap(job->copy_bitmap, start,
175 job->cluster_size * nr_clusters);
176 ret = blk_co_copy_range(blk, start, job->target, start, nbytes,
177 read_flags, write_flags);
178 if (ret < 0) {
179 trace_backup_do_cow_copy_range_fail(job, start, ret);
180 bdrv_set_dirty_bitmap(job->copy_bitmap, start,
181 job->cluster_size * nr_clusters);
182 return ret;
185 return nbytes;
189 * Check if the cluster starting at offset is allocated or not.
190 * return via pnum the number of contiguous clusters sharing this allocation.
192 static int backup_is_cluster_allocated(BackupBlockJob *s, int64_t offset,
193 int64_t *pnum)
195 BlockDriverState *bs = blk_bs(s->common.blk);
196 int64_t count, total_count = 0;
197 int64_t bytes = s->len - offset;
198 int ret;
200 assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
202 while (true) {
203 ret = bdrv_is_allocated(bs, offset, bytes, &count);
204 if (ret < 0) {
205 return ret;
208 total_count += count;
210 if (ret || count == 0) {
212 * ret: partial segment(s) are considered allocated.
213 * otherwise: unallocated tail is treated as an entire segment.
215 *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
216 return ret;
219 /* Unallocated segment(s) with uncertain following segment(s) */
220 if (total_count >= s->cluster_size) {
221 *pnum = total_count / s->cluster_size;
222 return 0;
225 offset += count;
226 bytes -= count;
230 static int coroutine_fn backup_do_cow(BackupBlockJob *job,
231 int64_t offset, uint64_t bytes,
232 bool *error_is_read,
233 bool is_write_notifier)
235 CowRequest cow_request;
236 int ret = 0;
237 int64_t start, end; /* bytes */
238 void *bounce_buffer = NULL;
240 qemu_co_rwlock_rdlock(&job->flush_rwlock);
242 start = QEMU_ALIGN_DOWN(offset, job->cluster_size);
243 end = QEMU_ALIGN_UP(bytes + offset, job->cluster_size);
245 trace_backup_do_cow_enter(job, start, offset, bytes);
247 wait_for_overlapping_requests(job, start, end);
248 cow_request_begin(&cow_request, job, start, end);
250 while (start < end) {
251 int64_t dirty_end;
253 if (!bdrv_dirty_bitmap_get(job->copy_bitmap, start)) {
254 trace_backup_do_cow_skip(job, start);
255 start += job->cluster_size;
256 continue; /* already copied */
259 dirty_end = bdrv_dirty_bitmap_next_zero(job->copy_bitmap, start,
260 (end - start));
261 if (dirty_end < 0) {
262 dirty_end = end;
265 trace_backup_do_cow_process(job, start);
267 if (job->use_copy_range) {
268 ret = backup_cow_with_offload(job, start, dirty_end,
269 is_write_notifier);
270 if (ret < 0) {
271 job->use_copy_range = false;
274 if (!job->use_copy_range) {
275 ret = backup_cow_with_bounce_buffer(job, start, dirty_end,
276 is_write_notifier,
277 error_is_read, &bounce_buffer);
279 if (ret < 0) {
280 break;
283 /* Publish progress, guest I/O counts as progress too. Note that the
284 * offset field is an opaque progress value, it is not a disk offset.
286 start += ret;
287 job->bytes_read += ret;
288 job_progress_update(&job->common.job, ret);
289 ret = 0;
292 if (bounce_buffer) {
293 qemu_vfree(bounce_buffer);
296 cow_request_end(&cow_request);
298 trace_backup_do_cow_return(job, offset, bytes, ret);
300 qemu_co_rwlock_unlock(&job->flush_rwlock);
302 return ret;
305 static int coroutine_fn backup_before_write_notify(
306 NotifierWithReturn *notifier,
307 void *opaque)
309 BackupBlockJob *job = container_of(notifier, BackupBlockJob, before_write);
310 BdrvTrackedRequest *req = opaque;
312 assert(req->bs == blk_bs(job->common.blk));
313 assert(QEMU_IS_ALIGNED(req->offset, BDRV_SECTOR_SIZE));
314 assert(QEMU_IS_ALIGNED(req->bytes, BDRV_SECTOR_SIZE));
316 return backup_do_cow(job, req->offset, req->bytes, NULL, true);
319 static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
321 BdrvDirtyBitmap *bm;
322 BlockDriverState *bs = blk_bs(job->common.blk);
323 bool sync = (((ret == 0) || (job->bitmap_mode == BITMAP_SYNC_MODE_ALWAYS)) \
324 && (job->bitmap_mode != BITMAP_SYNC_MODE_NEVER));
326 if (sync) {
328 * We succeeded, or we always intended to sync the bitmap.
329 * Delete this bitmap and install the child.
331 bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
332 } else {
334 * We failed, or we never intended to sync the bitmap anyway.
335 * Merge the successor back into the parent, keeping all data.
337 bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
340 assert(bm);
342 if (ret < 0 && job->bitmap_mode == BITMAP_SYNC_MODE_ALWAYS) {
343 /* If we failed and synced, merge in the bits we didn't copy: */
344 bdrv_dirty_bitmap_merge_internal(bm, job->copy_bitmap,
345 NULL, true);
349 static void backup_commit(Job *job)
351 BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
352 if (s->sync_bitmap) {
353 backup_cleanup_sync_bitmap(s, 0);
357 static void backup_abort(Job *job)
359 BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
360 if (s->sync_bitmap) {
361 backup_cleanup_sync_bitmap(s, -1);
365 static void backup_clean(Job *job)
367 BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
368 BlockDriverState *bs = blk_bs(s->common.blk);
370 if (s->copy_bitmap) {
371 bdrv_release_dirty_bitmap(bs, s->copy_bitmap);
372 s->copy_bitmap = NULL;
375 assert(s->target);
376 blk_unref(s->target);
377 s->target = NULL;
380 void backup_do_checkpoint(BlockJob *job, Error **errp)
382 BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
384 assert(block_job_driver(job) == &backup_job_driver);
386 if (backup_job->sync_mode != MIRROR_SYNC_MODE_NONE) {
387 error_setg(errp, "The backup job only supports block checkpoint in"
388 " sync=none mode");
389 return;
392 bdrv_set_dirty_bitmap(backup_job->copy_bitmap, 0, backup_job->len);
395 static void backup_drain(BlockJob *job)
397 BackupBlockJob *s = container_of(job, BackupBlockJob, common);
399 /* Need to keep a reference in case blk_drain triggers execution
400 * of backup_complete...
402 if (s->target) {
403 BlockBackend *target = s->target;
404 blk_ref(target);
405 blk_drain(target);
406 blk_unref(target);
410 static BlockErrorAction backup_error_action(BackupBlockJob *job,
411 bool read, int error)
413 if (read) {
414 return block_job_error_action(&job->common, job->on_source_error,
415 true, error);
416 } else {
417 return block_job_error_action(&job->common, job->on_target_error,
418 false, error);
422 static bool coroutine_fn yield_and_check(BackupBlockJob *job)
424 uint64_t delay_ns;
426 if (job_is_cancelled(&job->common.job)) {
427 return true;
430 /* We need to yield even for delay_ns = 0 so that bdrv_drain_all() can
431 * return. Without a yield, the VM would not reboot. */
432 delay_ns = block_job_ratelimit_get_delay(&job->common, job->bytes_read);
433 job->bytes_read = 0;
434 job_sleep_ns(&job->common.job, delay_ns);
436 if (job_is_cancelled(&job->common.job)) {
437 return true;
440 return false;
443 static int coroutine_fn backup_loop(BackupBlockJob *job)
445 bool error_is_read;
446 int64_t offset;
447 BdrvDirtyBitmapIter *bdbi;
448 int ret = 0;
449 int64_t dummy;
451 bdbi = bdrv_dirty_iter_new(job->copy_bitmap);
452 while ((offset = bdrv_dirty_iter_next(bdbi)) != -1) {
453 if (job->sync_mode == MIRROR_SYNC_MODE_TOP &&
454 !backup_is_cluster_allocated(job, offset, &dummy))
456 bdrv_reset_dirty_bitmap(job->copy_bitmap, offset,
457 job->cluster_size);
458 continue;
461 do {
462 if (yield_and_check(job)) {
463 goto out;
465 ret = backup_do_cow(job, offset,
466 job->cluster_size, &error_is_read, false);
467 if (ret < 0 && backup_error_action(job, error_is_read, -ret) ==
468 BLOCK_ERROR_ACTION_REPORT)
470 goto out;
472 } while (ret < 0);
475 out:
476 bdrv_dirty_iter_free(bdbi);
477 return ret;
480 static void backup_init_copy_bitmap(BackupBlockJob *job)
482 bool ret;
483 uint64_t estimate;
485 if (job->sync_mode == MIRROR_SYNC_MODE_BITMAP) {
486 ret = bdrv_dirty_bitmap_merge_internal(job->copy_bitmap,
487 job->sync_bitmap,
488 NULL, true);
489 assert(ret);
490 } else {
491 bdrv_set_dirty_bitmap(job->copy_bitmap, 0, job->len);
494 estimate = bdrv_get_dirty_count(job->copy_bitmap);
495 job_progress_set_remaining(&job->common.job, estimate);
498 static int coroutine_fn backup_run(Job *job, Error **errp)
500 BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
501 BlockDriverState *bs = blk_bs(s->common.blk);
502 int ret = 0;
504 QLIST_INIT(&s->inflight_reqs);
505 qemu_co_rwlock_init(&s->flush_rwlock);
507 backup_init_copy_bitmap(s);
509 s->before_write.notify = backup_before_write_notify;
510 bdrv_add_before_write_notifier(bs, &s->before_write);
512 if (s->sync_mode == MIRROR_SYNC_MODE_NONE) {
513 /* All bits are set in copy_bitmap to allow any cluster to be copied.
514 * This does not actually require them to be copied. */
515 while (!job_is_cancelled(job)) {
516 /* Yield until the job is cancelled. We just let our before_write
517 * notify callback service CoW requests. */
518 job_yield(job);
520 } else {
521 ret = backup_loop(s);
524 notifier_with_return_remove(&s->before_write);
526 /* wait until pending backup_do_cow() calls have completed */
527 qemu_co_rwlock_wrlock(&s->flush_rwlock);
528 qemu_co_rwlock_unlock(&s->flush_rwlock);
530 return ret;
533 static const BlockJobDriver backup_job_driver = {
534 .job_driver = {
535 .instance_size = sizeof(BackupBlockJob),
536 .job_type = JOB_TYPE_BACKUP,
537 .free = block_job_free,
538 .user_resume = block_job_user_resume,
539 .drain = block_job_drain,
540 .run = backup_run,
541 .commit = backup_commit,
542 .abort = backup_abort,
543 .clean = backup_clean,
545 .drain = backup_drain,
548 static int64_t backup_calculate_cluster_size(BlockDriverState *target,
549 Error **errp)
551 int ret;
552 BlockDriverInfo bdi;
555 * If there is no backing file on the target, we cannot rely on COW if our
556 * backup cluster size is smaller than the target cluster size. Even for
557 * targets with a backing file, try to avoid COW if possible.
559 ret = bdrv_get_info(target, &bdi);
560 if (ret == -ENOTSUP && !target->backing) {
561 /* Cluster size is not defined */
562 warn_report("The target block device doesn't provide "
563 "information about the block size and it doesn't have a "
564 "backing file. The default block size of %u bytes is "
565 "used. If the actual block size of the target exceeds "
566 "this default, the backup may be unusable",
567 BACKUP_CLUSTER_SIZE_DEFAULT);
568 return BACKUP_CLUSTER_SIZE_DEFAULT;
569 } else if (ret < 0 && !target->backing) {
570 error_setg_errno(errp, -ret,
571 "Couldn't determine the cluster size of the target image, "
572 "which has no backing file");
573 error_append_hint(errp,
574 "Aborting, since this may create an unusable destination image\n");
575 return ret;
576 } else if (ret < 0 && target->backing) {
577 /* Not fatal; just trudge on ahead. */
578 return BACKUP_CLUSTER_SIZE_DEFAULT;
581 return MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
584 BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
585 BlockDriverState *target, int64_t speed,
586 MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
587 BitmapSyncMode bitmap_mode,
588 bool compress,
589 BlockdevOnError on_source_error,
590 BlockdevOnError on_target_error,
591 int creation_flags,
592 BlockCompletionFunc *cb, void *opaque,
593 JobTxn *txn, Error **errp)
595 int64_t len;
596 BackupBlockJob *job = NULL;
597 int ret;
598 int64_t cluster_size;
599 BdrvDirtyBitmap *copy_bitmap = NULL;
601 assert(bs);
602 assert(target);
604 /* QMP interface protects us from these cases */
605 assert(sync_mode != MIRROR_SYNC_MODE_INCREMENTAL);
606 assert(sync_bitmap || sync_mode != MIRROR_SYNC_MODE_BITMAP);
608 if (bs == target) {
609 error_setg(errp, "Source and target cannot be the same");
610 return NULL;
613 if (!bdrv_is_inserted(bs)) {
614 error_setg(errp, "Device is not inserted: %s",
615 bdrv_get_device_name(bs));
616 return NULL;
619 if (!bdrv_is_inserted(target)) {
620 error_setg(errp, "Device is not inserted: %s",
621 bdrv_get_device_name(target));
622 return NULL;
625 if (compress && target->drv->bdrv_co_pwritev_compressed == NULL) {
626 error_setg(errp, "Compression is not supported for this drive %s",
627 bdrv_get_device_name(target));
628 return NULL;
631 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
632 return NULL;
635 if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) {
636 return NULL;
639 if (sync_mode == MIRROR_SYNC_MODE_BITMAP) {
640 /* If we need to write to this bitmap, check that we can: */
641 if (bitmap_mode != BITMAP_SYNC_MODE_NEVER &&
642 bdrv_dirty_bitmap_check(sync_bitmap, BDRV_BITMAP_DEFAULT, errp)) {
643 return NULL;
646 /* Create a new bitmap, and freeze/disable this one. */
647 if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) {
648 return NULL;
650 } else if (sync_bitmap) {
651 error_setg(errp,
652 "a bitmap was given to backup_job_create, "
653 "but it received an incompatible sync_mode (%s)",
654 MirrorSyncMode_str(sync_mode));
655 return NULL;
658 len = bdrv_getlength(bs);
659 if (len < 0) {
660 error_setg_errno(errp, -len, "unable to get length for '%s'",
661 bdrv_get_device_name(bs));
662 goto error;
665 cluster_size = backup_calculate_cluster_size(target, errp);
666 if (cluster_size < 0) {
667 goto error;
670 copy_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp);
671 if (!copy_bitmap) {
672 goto error;
674 bdrv_disable_dirty_bitmap(copy_bitmap);
676 /* job->len is fixed, so we can't allow resize */
677 job = block_job_create(job_id, &backup_job_driver, txn, bs,
678 BLK_PERM_CONSISTENT_READ,
679 BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
680 BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD,
681 speed, creation_flags, cb, opaque, errp);
682 if (!job) {
683 goto error;
686 /* The target must match the source in size, so no resize here either */
687 job->target = blk_new(job->common.job.aio_context,
688 BLK_PERM_WRITE,
689 BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
690 BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD);
691 ret = blk_insert_bs(job->target, target, errp);
692 if (ret < 0) {
693 goto error;
695 blk_set_disable_request_queuing(job->target, true);
697 job->on_source_error = on_source_error;
698 job->on_target_error = on_target_error;
699 job->sync_mode = sync_mode;
700 job->sync_bitmap = sync_bitmap;
701 job->bitmap_mode = bitmap_mode;
702 job->compress = compress;
704 /* Detect image-fleecing (and similar) schemes */
705 job->serialize_target_writes = bdrv_chain_contains(target, bs);
706 job->cluster_size = cluster_size;
707 job->copy_bitmap = copy_bitmap;
708 copy_bitmap = NULL;
709 job->use_copy_range = !compress; /* compression isn't supported for it */
710 job->copy_range_size = MIN_NON_ZERO(blk_get_max_transfer(job->common.blk),
711 blk_get_max_transfer(job->target));
712 job->copy_range_size = MAX(job->cluster_size,
713 QEMU_ALIGN_UP(job->copy_range_size,
714 job->cluster_size));
716 /* Required permissions are already taken with target's blk_new() */
717 block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
718 &error_abort);
719 job->len = len;
721 return &job->common;
723 error:
724 if (copy_bitmap) {
725 assert(!job || !job->copy_bitmap);
726 bdrv_release_dirty_bitmap(bs, copy_bitmap);
728 if (sync_bitmap) {
729 bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
731 if (job) {
732 backup_clean(&job->common.job);
733 job_early_fail(&job->common.job);
736 return NULL;