4 * Copyright Red Hat, Inc. 2012
7 * Paolo Bonzini <pbonzini@redhat.com>
9 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
10 * See the COPYING.LIB file in the top-level directory.
15 #include "block/blockjob.h"
16 #include "block/block_int.h"
17 #include "qemu/ratelimit.h"
18 #include "qemu/bitmap.h"
/* Length of one time slice in nanoseconds.  In this file it is handed to
 * ratelimit_set_speed() as the slice length, compared against the time
 * elapsed since the last pause in mirror_run(), and used there as a sleep
 * delay when the job is idle. */
20 #define SLICE_TIME 100000000ULL /* ns */
/* Value of s->in_flight at which mirror_run() yields instead of starting
 * another iteration. */
21 #define MAX_IN_FLIGHT 16
23 /* The mirroring buffer is a list of granularity-sized chunks.
24 * Free chunks are organized in a list.
25 */
/* Free-list node laid over a chunk of the mirroring buffer.
 * mirror_free_init() and mirror_iteration_done() cast chunk addresses to
 * MirrorBuffer and link them onto MirrorBlockJob.buf_free through `next`. */
26 typedef struct MirrorBuffer
{
27 QSIMPLEQ_ENTRY(MirrorBuffer
) next
;
/* State of one mirror job.  Only part of the structure is visible here;
 * other members (common, granularity, buf, buf_size, buf_free_count,
 * in_flight, hbi, limit, ret, synced, should_complete, is_none_mode,
 * sector_num) are used by the functions in this file.
 *
 * target            - device that bdrv_aio_writev() writes the copy to
 * base              - passed to bdrv_is_allocated_above() in mirror_run()
 * on_source_error,
 * on_target_error   - policies handed to block_job_error_action()
 * cow_bitmap        - per-chunk bitmap; may be NULL.  A bit is set once a
 *                     chunk has been copied with ret >= 0
 * dirty_bitmap      - created with bdrv_create_dirty_bitmap() on the source
 * buf_free          - queue of free MirrorBuffer chunks
 * in_flight_bitmap  - per-chunk bitmap; set in mirror_iteration(), cleared
 *                     in mirror_iteration_done() */
30 typedef struct MirrorBlockJob
{
33 BlockDriverState
*target
;
34 BlockDriverState
*base
;
36 BlockdevOnError on_source_error
, on_target_error
;
42 unsigned long *cow_bitmap
;
43 BdrvDirtyBitmap
*dirty_bitmap
;
46 QSIMPLEQ_HEAD(, MirrorBuffer
) buf_free
;
49 unsigned long *in_flight_bitmap
;
/* Per-request state.  An instance is allocated in mirror_iteration() and
 * handed as the opaque pointer to the AIO completion callbacks.  The members
 * are not visible here; code in this file accesses s, qiov, sector_num and
 * nb_sectors. */
54 typedef struct MirrorOp
{
/* Translate an I/O error into a BlockErrorAction by delegating to
 * block_job_error_action().
 *
 * Two calls are visible: one for the source device (s->common.bs) using
 * s->on_source_error and a `true` flag, and one for s->target using
 * s->on_target_error and a `false` flag.  The condition that selects between
 * them is not visible here -- presumably the `read` argument; TODO confirm. */
61 static BlockErrorAction
mirror_error_action(MirrorBlockJob
*s
, bool read
,
66 return block_job_error_action(&s
->common
, s
->common
.bs
,
67 s
->on_source_error
, true, error
);
69 return block_job_error_action(&s
->common
, s
->target
,
70 s
->on_target_error
, false, error
);
/* Finish one copy operation and release everything it held.
 *
 * op  - the operation being retired; it is freed here
 * ret - result of the I/O
 *
 * Each buffer referenced by the operation's I/O vector goes back on the tail
 * of s->buf_free.  The chunks covered by the request are cleared in
 * s->in_flight_bitmap, and set in s->cow_bitmap when that bitmap exists and
 * ret >= 0.  The chunk count is op->nb_sectors / sectors_per_chunk (integer
 * division).  Finally the I/O vector is destroyed, op is released with
 * g_slice_free() and the job coroutine is re-entered. */
74 static void mirror_iteration_done(MirrorOp
*op
, int ret
)
76 MirrorBlockJob
*s
= op
->s
;
79 int i
, nb_chunks
, sectors_per_chunk
;
81 trace_mirror_iteration_done(s
, op
->sector_num
, op
->nb_sectors
, ret
);
85 for (i
= 0; i
< op
->qiov
.niov
; i
++) {
86 MirrorBuffer
*buf
= (MirrorBuffer
*) iov
[i
].iov_base
;
87 QSIMPLEQ_INSERT_TAIL(&s
->buf_free
, buf
, next
);
91 sectors_per_chunk
= s
->granularity
>> BDRV_SECTOR_BITS
;
92 chunk_num
= op
->sector_num
/ sectors_per_chunk
;
93 nb_chunks
= op
->nb_sectors
/ sectors_per_chunk
;
94 bitmap_clear(s
->in_flight_bitmap
, chunk_num
, nb_chunks
);
95 if (s
->cow_bitmap
&& ret
>= 0) {
96 bitmap_set(s
->cow_bitmap
, chunk_num
, nb_chunks
);
99 qemu_iovec_destroy(&op
->qiov
);
100 g_slice_free(MirrorOp
, op
);
101 qemu_coroutine_enter(s
->common
.co
, NULL
);
/* AIO completion callback for the write to the target; it is the callback
 * passed to bdrv_aio_writev() in mirror_read_complete().
 *
 * opaque - the MirrorOp of the request
 * ret    - result of the write
 *
 * Visible steps: the request's sector range is marked dirty again on the
 * source, mirror_error_action() is consulted with read == false, and
 * mirror_iteration_done() retires the operation.  The guard around the
 * dirty-marking and error handling is not visible here -- presumably it is
 * taken only when ret < 0; TODO confirm. */
104 static void mirror_write_complete(void *opaque
, int ret
)
106 MirrorOp
*op
= opaque
;
107 MirrorBlockJob
*s
= op
->s
;
109 BlockDriverState
*source
= s
->common
.bs
;
110 BlockErrorAction action
;
112 bdrv_set_dirty(source
, op
->sector_num
, op
->nb_sectors
);
113 action
= mirror_error_action(s
, false, -ret
);
114 if (action
== BDRV_ACTION_REPORT
&& s
->ret
>= 0) {
118 mirror_iteration_done(op
, ret
);
/* AIO completion callback for the read from the source; it is the callback
 * passed to bdrv_aio_readv() in mirror_iteration().
 *
 * opaque - the MirrorOp of the request
 * ret    - result of the read
 *
 * Visible steps: an error path that marks the range dirty again on the
 * source, consults mirror_error_action() with read == true and retires the
 * operation through mirror_iteration_done(); and a write of the same I/O
 * vector to s->target with mirror_write_complete() as callback.  The guard
 * separating the two paths is not visible here -- presumably ret < 0 selects
 * the error path; TODO confirm. */
121 static void mirror_read_complete(void *opaque
, int ret
)
123 MirrorOp
*op
= opaque
;
124 MirrorBlockJob
*s
= op
->s
;
126 BlockDriverState
*source
= s
->common
.bs
;
127 BlockErrorAction action
;
129 bdrv_set_dirty(source
, op
->sector_num
, op
->nb_sectors
);
130 action
= mirror_error_action(s
, true, -ret
);
131 if (action
== BDRV_ACTION_REPORT
&& s
->ret
>= 0) {
135 mirror_iteration_done(op
, ret
);
138 bdrv_aio_writev(s
->target
, op
->sector_num
, &op
->qiov
, op
->nb_sectors
,
139 mirror_write_complete
, op
);
/* Start one asynchronous copy of a run of dirty chunks.
 *
 * Steps visible in this function:
 *  1. Take the next dirty sector from the iterator s->hbi.  If the iterator
 *     is exhausted, re-initialise it from the dirty bitmap and take the
 *     first sector; that sector is asserted to be non-negative.
 *  2. Yield while the first chunk is still marked in s->in_flight_bitmap.
 *  3. Grow the request one step at a time.  A step is one chunk, or the
 *     range produced by bdrv_round_to_clusters() on the target when
 *     s->cow_bitmap exists and the chunk's bit is clear.  The step is
 *     clamped to the end of the device.  When nothing has been gathered yet
 *     the coroutine yields until enough free buffers exist.  Each accepted
 *     step is marked in s->in_flight_bitmap.
 *  4. Allocate a MirrorOp, build its I/O vector from buffers taken off the
 *     head of s->buf_free, and advance s->hbi alongside so that the same
 *     sector is not examined twice.
 *  5. Clear the dirty bits for the range and submit bdrv_aio_readv() with
 *     mirror_read_complete() as callback.
 *
 * Several statements (loop heads, break statements, counter updates) are not
 * visible here, so the exact exit conditions of step 3 cannot be confirmed
 * from this text alone. */
142 static void coroutine_fn
mirror_iteration(MirrorBlockJob
*s
)
144 BlockDriverState
*source
= s
->common
.bs
;
145 int nb_sectors
, sectors_per_chunk
, nb_chunks
;
146 int64_t end
, sector_num
, next_chunk
, next_sector
, hbitmap_next_sector
;
149 s
->sector_num
= hbitmap_iter_next(&s
->hbi
);
150 if (s
->sector_num
< 0) {
151 bdrv_dirty_iter_init(source
, s
->dirty_bitmap
, &s
->hbi
);
152 s
->sector_num
= hbitmap_iter_next(&s
->hbi
);
153 trace_mirror_restart_iter(s
,
154 bdrv_get_dirty_count(source
, s
->dirty_bitmap
));
155 assert(s
->sector_num
>= 0);
158 hbitmap_next_sector
= s
->sector_num
;
159 sector_num
= s
->sector_num
;
160 sectors_per_chunk
= s
->granularity
>> BDRV_SECTOR_BITS
;
161 end
= s
->common
.len
>> BDRV_SECTOR_BITS
;
163 /* Extend the QEMUIOVector to include all adjacent blocks that will
164 * be copied in this operation.
166 * We have to do this if we have no backing file yet in the destination,
167 * and the cluster size is very large. Then we need to do COW ourselves.
168 * The first time a cluster is copied, copy it entirely. Note that,
169 * because both the granularity and the cluster size are powers of two,
170 * the number of sectors to copy cannot exceed one cluster.
172 * We also want to extend the QEMUIOVector to include more adjacent
173 * dirty blocks if possible, to limit the number of I/O operations and
174 * run efficiently even with a small granularity.
178 next_sector
= sector_num
;
179 next_chunk
= sector_num
/ sectors_per_chunk
;
181 /* Wait for I/O to this cluster (from a previous iteration) to be done. */
182 while (test_bit(next_chunk
, s
->in_flight_bitmap
)) {
183 trace_mirror_yield_in_flight(s
, sector_num
, s
->in_flight
);
184 qemu_coroutine_yield();
188 int added_sectors
, added_chunks
;
190 if (!bdrv_get_dirty(source
, s
->dirty_bitmap
, next_sector
) ||
191 test_bit(next_chunk
, s
->in_flight_bitmap
)) {
192 assert(nb_sectors
> 0);
196 added_sectors
= sectors_per_chunk
;
197 if (s
->cow_bitmap
&& !test_bit(next_chunk
, s
->cow_bitmap
)) {
198 bdrv_round_to_clusters(s
->target
,
199 next_sector
, added_sectors
,
200 &next_sector
, &added_sectors
);
202 /* On the first iteration, the rounding may make us copy
203 * sectors before the first dirty one.
205 if (next_sector
< sector_num
) {
206 assert(nb_sectors
== 0);
207 sector_num
= next_sector
;
208 next_chunk
= next_sector
/ sectors_per_chunk
;
212 added_sectors
= MIN(added_sectors
, end
- (sector_num
+ nb_sectors
));
213 added_chunks
= (added_sectors
+ sectors_per_chunk
- 1) / sectors_per_chunk
;
215 /* When doing COW, it may happen that there is not enough space for
216 * a full cluster. Wait if that is the case.
218 while (nb_chunks
== 0 && s
->buf_free_count
< added_chunks
) {
219 trace_mirror_yield_buf_busy(s
, nb_chunks
, s
->in_flight
);
220 qemu_coroutine_yield();
222 if (s
->buf_free_count
< nb_chunks
+ added_chunks
) {
223 trace_mirror_break_buf_busy(s
, nb_chunks
, s
->in_flight
);
227 /* We have enough free space to copy these sectors. */
228 bitmap_set(s
->in_flight_bitmap
, next_chunk
, added_chunks
);
230 nb_sectors
+= added_sectors
;
231 nb_chunks
+= added_chunks
;
232 next_sector
+= added_sectors
;
233 next_chunk
+= added_chunks
;
234 } while (next_sector
< end
);
236 /* Allocate a MirrorOp that is used as an AIO callback. */
237 op
= g_slice_new(MirrorOp
);
239 op
->sector_num
= sector_num
;
240 op
->nb_sectors
= nb_sectors
;
242 /* Now make a QEMUIOVector taking enough granularity-sized chunks
245 qemu_iovec_init(&op
->qiov
, nb_chunks
);
246 next_sector
= sector_num
;
247 while (nb_chunks
-- > 0) {
248 MirrorBuffer
*buf
= QSIMPLEQ_FIRST(&s
->buf_free
);
249 QSIMPLEQ_REMOVE_HEAD(&s
->buf_free
, next
);
251 qemu_iovec_add(&op
->qiov
, buf
, s
->granularity
);
253 /* Advance the HBitmapIter in parallel, so that we do not examine
254 * the same sector twice.
256 if (next_sector
> hbitmap_next_sector
257 && bdrv_get_dirty(source
, s
->dirty_bitmap
, next_sector
)) {
258 hbitmap_next_sector
= hbitmap_iter_next(&s
->hbi
);
261 next_sector
+= sectors_per_chunk
;
264 bdrv_reset_dirty(source
, sector_num
, nb_sectors
);
266 /* Copy the dirty cluster. */
268 trace_mirror_one_iteration(s
, sector_num
, nb_sectors
);
269 bdrv_aio_readv(source
, sector_num
, &op
->qiov
, nb_sectors
,
270 mirror_read_complete
, op
);
/* Build the free list of buffer chunks.
 *
 * Asserts that s->buf_free_count is zero, initialises s->buf_free and then
 * walks s->buf, appending a MirrorBuffer node per pass and subtracting
 * s->granularity from the remaining size each time.  The statements that
 * advance the buffer pointer and the free count are not visible here.
 *
 * NOTE(review): the loop runs while the remaining size is != 0, so it only
 * terminates if s->buf_size is a multiple of s->granularity.  Nothing visible
 * in this file establishes that -- confirm against the callers. */
273 static void mirror_free_init(MirrorBlockJob
*s
)
275 int granularity
= s
->granularity
;
276 size_t buf_size
= s
->buf_size
;
277 uint8_t *buf
= s
->buf
;
279 assert(s
->buf_free_count
== 0);
280 QSIMPLEQ_INIT(&s
->buf_free
);
281 while (buf_size
!= 0) {
282 MirrorBuffer
*cur
= (MirrorBuffer
*)buf
;
283 QSIMPLEQ_INSERT_TAIL(&s
->buf_free
, cur
, next
);
285 buf_size
-= granularity
;
/* Yield the current coroutine repeatedly until s->in_flight drops to zero. */
290 static void mirror_drain(MirrorBlockJob
*s
)
292 while (s
->in_flight
> 0) {
293 qemu_coroutine_yield();
/* Coroutine body of the mirror job; it is the function handed to
 * qemu_coroutine_create() in mirror_start_job().
 *
 * opaque - the MirrorBlockJob
 *
 * Outline of what is visible:
 *  - Setup: check for cancellation, read the source length (the job is
 *    completed immediately with that value when it is <= 0), and allocate
 *    s->in_flight_bitmap with one bit per granularity-sized chunk.  When the
 *    target names a backing file but has none open and the granularity is
 *    smaller than the target's cluster size, s->buf_size is raised to at
 *    least one cluster and s->cow_bitmap is allocated.  The copy buffer is
 *    allocated with qemu_blockalign().
 *  - Unless s->is_none_mode is set, the source is scanned with
 *    bdrv_is_allocated_above() and ranges are marked dirty.
 *  - Main loop: read the dirty count, then yield, flush the target and
 *    announce readiness with block_job_ready(), or sleep, depending on
 *    in-flight requests, free buffers, elapsed time and I/O status.  The
 *    loop ends once the dirty count is zero and completion was requested or
 *    the job was cancelled.
 *  - Teardown: free both bitmaps, release the dirty bitmap and disable
 *    iostatus on the target.  When completion was requested and ret == 0,
 *    the target is reopened with the source's flags if they differ and
 *    bdrv_swap() exchanges the two devices.  For BLOCK_JOB_TYPE_COMMIT the
 *    backing link of s->base is cleared.  The target reference is dropped
 *    and block_job_completed() reports ret.
 *
 * Many lines (declarations, gotos, closing braces, calls such as the drain)
 * are not visible here.
 *
 * NOTE(review): bdrv_getlength() is called a second time when computing the
 * bitmap length and that result is not checked in the visible code; the
 * return value of bdrv_reopen() before the swap is also ignored. */
297 static void coroutine_fn
mirror_run(void *opaque
)
299 MirrorBlockJob
*s
= opaque
;
300 BlockDriverState
*bs
= s
->common
.bs
;
301 int64_t sector_num
, end
, sectors_per_chunk
, length
;
302 uint64_t last_pause_ns
;
304 char backing_filename
[1024];
308 if (block_job_is_cancelled(&s
->common
)) {
312 s
->common
.len
= bdrv_getlength(bs
);
313 if (s
->common
.len
<= 0) {
314 block_job_completed(&s
->common
, s
->common
.len
);
318 length
= (bdrv_getlength(bs
) + s
->granularity
- 1) / s
->granularity
;
319 s
->in_flight_bitmap
= bitmap_new(length
);
321 /* If we have no backing file yet in the destination, we cannot let
322 * the destination do COW. Instead, we copy sectors around the
323 * dirty data if needed. We need a bitmap to do that.
325 bdrv_get_backing_filename(s
->target
, backing_filename
,
326 sizeof(backing_filename
));
327 if (backing_filename
[0] && !s
->target
->backing_hd
) {
328 bdrv_get_info(s
->target
, &bdi
);
329 if (s
->granularity
< bdi
.cluster_size
) {
330 s
->buf_size
= MAX(s
->buf_size
, bdi
.cluster_size
);
331 s
->cow_bitmap
= bitmap_new(length
);
335 end
= s
->common
.len
>> BDRV_SECTOR_BITS
;
336 s
->buf
= qemu_blockalign(bs
, s
->buf_size
);
337 sectors_per_chunk
= s
->granularity
>> BDRV_SECTOR_BITS
;
340 if (!s
->is_none_mode
) {
341 /* First part, loop on the sectors and initialize the dirty bitmap. */
342 BlockDriverState
*base
= s
->base
;
343 for (sector_num
= 0; sector_num
< end
; ) {
344 int64_t next
= (sector_num
| (sectors_per_chunk
- 1)) + 1;
345 ret
= bdrv_is_allocated_above(bs
, base
,
346 sector_num
, next
- sector_num
, &n
);
354 bdrv_set_dirty(bs
, sector_num
, n
);
362 bdrv_dirty_iter_init(bs
, s
->dirty_bitmap
, &s
->hbi
);
363 last_pause_ns
= qemu_clock_get_ns(QEMU_CLOCK_REALTIME
);
367 bool should_complete
;
374 cnt
= bdrv_get_dirty_count(bs
, s
->dirty_bitmap
);
376 /* Note that even when no rate limit is applied we need to yield
377 * periodically with no pending I/O so that qemu_aio_flush() returns.
378 * We do so every SLICE_TIME nanoseconds, or when there is an error,
379 * or when the source is clean, whichever comes first.
381 if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME
) - last_pause_ns
< SLICE_TIME
&&
382 s
->common
.iostatus
== BLOCK_DEVICE_IO_STATUS_OK
) {
383 if (s
->in_flight
== MAX_IN_FLIGHT
|| s
->buf_free_count
== 0 ||
384 (cnt
== 0 && s
->in_flight
> 0)) {
385 trace_mirror_yield(s
, s
->in_flight
, s
->buf_free_count
, cnt
);
386 qemu_coroutine_yield();
388 } else if (cnt
!= 0) {
394 should_complete
= false;
395 if (s
->in_flight
== 0 && cnt
== 0) {
396 trace_mirror_before_flush(s
);
397 ret
= bdrv_flush(s
->target
);
399 if (mirror_error_action(s
, false, -ret
) == BDRV_ACTION_REPORT
) {
403 /* We're out of the streaming phase. From now on, if the job
404 * is cancelled we will actually complete all pending I/O and
405 * report completion. This way, block-job-cancel will leave
406 * the target in a consistent state.
408 s
->common
.offset
= end
* BDRV_SECTOR_SIZE
;
410 block_job_ready(&s
->common
);
414 should_complete
= s
->should_complete
||
415 block_job_is_cancelled(&s
->common
);
416 cnt
= bdrv_get_dirty_count(bs
, s
->dirty_bitmap
);
420 if (cnt
== 0 && should_complete
) {
421 /* The dirty bitmap is not updated while operations are pending.
422 * If we're about to exit, wait for pending operations before
423 * calling bdrv_get_dirty_count(bs), or we may exit while the
424 * source has dirty data to copy!
426 * Note that I/O can be submitted by the guest while
427 * mirror_populate runs.
429 trace_mirror_before_drain(s
, cnt
);
431 cnt
= bdrv_get_dirty_count(bs
, s
->dirty_bitmap
);
435 trace_mirror_before_sleep(s
, cnt
, s
->synced
);
437 /* Publish progress */
438 s
->common
.offset
= (end
- cnt
) * BDRV_SECTOR_SIZE
;
440 if (s
->common
.speed
) {
441 delay_ns
= ratelimit_calculate_delay(&s
->limit
, sectors_per_chunk
);
446 block_job_sleep_ns(&s
->common
, QEMU_CLOCK_REALTIME
, delay_ns
);
447 if (block_job_is_cancelled(&s
->common
)) {
450 } else if (!should_complete
) {
451 delay_ns
= (s
->in_flight
== 0 && cnt
== 0 ? SLICE_TIME
: 0);
452 block_job_sleep_ns(&s
->common
, QEMU_CLOCK_REALTIME
, delay_ns
);
453 } else if (cnt
== 0) {
454 /* The two disks are in sync. Exit and report successful
457 assert(QLIST_EMPTY(&bs
->tracked_requests
));
458 s
->common
.cancelled
= false;
461 last_pause_ns
= qemu_clock_get_ns(QEMU_CLOCK_REALTIME
);
465 if (s
->in_flight
> 0) {
466 /* We get here only if something went wrong. Either the job failed,
467 * or it was cancelled prematurely so that we do not guarantee that
468 * the target is a copy of the source.
470 assert(ret
< 0 || (!s
->synced
&& block_job_is_cancelled(&s
->common
)));
474 assert(s
->in_flight
== 0);
476 g_free(s
->cow_bitmap
);
477 g_free(s
->in_flight_bitmap
);
478 bdrv_release_dirty_bitmap(bs
, s
->dirty_bitmap
);
479 bdrv_iostatus_disable(s
->target
);
480 if (s
->should_complete
&& ret
== 0) {
481 if (bdrv_get_flags(s
->target
) != bdrv_get_flags(s
->common
.bs
)) {
482 bdrv_reopen(s
->target
, bdrv_get_flags(s
->common
.bs
), NULL
);
484 bdrv_swap(s
->target
, s
->common
.bs
);
485 if (s
->common
.driver
->job_type
== BLOCK_JOB_TYPE_COMMIT
) {
486 /* drop the bs loop chain formed by the swap: break the loop then
487 * trigger the unref from the top one */
488 BlockDriverState
*p
= s
->base
->backing_hd
;
489 s
->base
->backing_hd
= NULL
;
493 bdrv_unref(s
->target
);
494 block_job_completed(&s
->common
, ret
);
/* set_speed callback of both job drivers defined in this file.
 *
 * job   - the block job; its enclosing MirrorBlockJob is recovered with
 *         container_of()
 * speed - new limit; it is divided by BDRV_SECTOR_SIZE before being handed
 *         to ratelimit_set_speed() together with SLICE_TIME
 * errp  - receives QERR_INVALID_PARAMETER for "speed" on rejection
 *
 * The condition under which the speed is rejected is not visible here --
 * presumably a negative value; TODO confirm. */
497 static void mirror_set_speed(BlockJob
*job
, int64_t speed
, Error
**errp
)
499 MirrorBlockJob
*s
= container_of(job
, MirrorBlockJob
, common
);
502 error_set(errp
, QERR_INVALID_PARAMETER
, "speed");
505 ratelimit_set_speed(&s
->limit
, speed
/ BDRV_SECTOR_SIZE
, SLICE_TIME
);
/* iostatus_reset callback: forwards the reset to the job's target device
 * through bdrv_iostatus_reset(). */
508 static void mirror_iostatus_reset(BlockJob
*job
)
510 MirrorBlockJob
*s
= container_of(job
, MirrorBlockJob
, common
);
512 bdrv_iostatus_reset(s
->target
);
/* complete callback of both job drivers defined in this file.
 *
 * job  - the block job; its enclosing MirrorBlockJob is recovered with
 *        container_of()
 * errp - receives the error when completion cannot be requested
 *
 * Visible steps: open the target's backing file with
 * bdrv_open_backing_file(); on the error path the full backing file name is
 * fetched and the local error is propagated to errp.  A
 * QERR_BLOCK_JOB_NOT_READY error carrying the device name can also be
 * reported.  Otherwise s->should_complete is set and the job is resumed with
 * block_job_resume().  The conditions guarding the two error paths are not
 * visible here -- presumably a negative return value and a job that is not
 * yet synced; TODO confirm. */
515 static void mirror_complete(BlockJob
*job
, Error
**errp
)
517 MirrorBlockJob
*s
= container_of(job
, MirrorBlockJob
, common
);
518 Error
*local_err
= NULL
;
521 ret
= bdrv_open_backing_file(s
->target
, NULL
, &local_err
);
523 char backing_filename
[PATH_MAX
];
524 bdrv_get_full_backing_filename(s
->target
, backing_filename
,
525 sizeof(backing_filename
));
526 error_propagate(errp
, local_err
);
530 error_set(errp
, QERR_BLOCK_JOB_NOT_READY
, job
->bs
->device_name
);
534 s
->should_complete
= true;
535 block_job_resume(job
);
/* Driver table passed to mirror_start_job() by mirror_start().  The job type
 * is BLOCK_JOB_TYPE_MIRROR; the callbacks are the mirror_* functions defined
 * in this file. */
538 static const BlockJobDriver mirror_job_driver
= {
539 .instance_size
= sizeof(MirrorBlockJob
),
540 .job_type
= BLOCK_JOB_TYPE_MIRROR
,
541 .set_speed
= mirror_set_speed
,
542 .iostatus_reset
= mirror_iostatus_reset
,
543 .complete
= mirror_complete
,
/* Driver table passed to mirror_start_job() by commit_active_start().  It
 * uses the same callbacks as mirror_job_driver but reports
 * BLOCK_JOB_TYPE_COMMIT, which mirror_run() tests during teardown. */
546 static const BlockJobDriver commit_active_job_driver
= {
547 .instance_size
= sizeof(MirrorBlockJob
),
548 .job_type
= BLOCK_JOB_TYPE_COMMIT
,
549 .set_speed
= mirror_set_speed
,
551 = mirror_iostatus_reset
,
552 .complete
= mirror_complete
,
/* Common start routine shared by mirror_start() and commit_active_start().
 *
 * bs              - source device
 * target          - device that receives the copy
 * speed           - forwarded to block_job_create()
 * granularity     - chunk size; 0 selects a default (see below).  It is
 *                   asserted to be a power of two
 * on_source_error,
 * on_target_error - error policies stored in the job
 * cb, opaque      - completion callback and its argument, forwarded to
 *                   block_job_create()
 * errp            - receives errors
 * driver          - job driver table
 * is_none_mode    - stored in the job; when set, mirror_run() skips the
 *                   initial allocation scan
 * base            - base device (the store into the job is not visible here)
 *
 * A buf_size value is also used below, but its parameter declaration is not
 * visible here.
 *
 * With granularity == 0 and a target reporting a non-zero cluster size, the
 * cluster size clamped to the range 4096..65536 is used.  A source error
 * policy of STOP or ENOSPC is rejected with QERR_INVALID_PARAMETER when
 * iostatus is not enabled on bs.  After the job is created its fields are
 * filled in, a dirty bitmap is created on bs, and on the target the write
 * cache is enabled, the error policy is set and iostatus is enabled.
 * Finally the mirror_run() coroutine is created and entered. */
555 static void mirror_start_job(BlockDriverState
*bs
, BlockDriverState
*target
,
556 int64_t speed
, int64_t granularity
,
558 BlockdevOnError on_source_error
,
559 BlockdevOnError on_target_error
,
560 BlockDriverCompletionFunc
*cb
,
561 void *opaque
, Error
**errp
,
562 const BlockJobDriver
*driver
,
563 bool is_none_mode
, BlockDriverState
*base
)
567 if (granularity
== 0) {
568 /* Choose the default granularity based on the target file's cluster
569 * size, clamped between 4k and 64k. */
571 if (bdrv_get_info(target
, &bdi
) >= 0 && bdi
.cluster_size
!= 0) {
572 granularity
= MAX(4096, bdi
.cluster_size
);
573 granularity
= MIN(65536, granularity
);
579 assert ((granularity
& (granularity
- 1)) == 0);
581 if ((on_source_error
== BLOCKDEV_ON_ERROR_STOP
||
582 on_source_error
== BLOCKDEV_ON_ERROR_ENOSPC
) &&
583 !bdrv_iostatus_is_enabled(bs
)) {
584 error_set(errp
, QERR_INVALID_PARAMETER
, "on-source-error");
589 s
= block_job_create(driver
, bs
, speed
, cb
, opaque
, errp
);
594 s
->on_source_error
= on_source_error
;
595 s
->on_target_error
= on_target_error
;
597 s
->is_none_mode
= is_none_mode
;
599 s
->granularity
= granularity
;
600 s
->buf_size
= MAX(buf_size
, granularity
);
602 s
->dirty_bitmap
= bdrv_create_dirty_bitmap(bs
, granularity
);
603 bdrv_set_enable_write_cache(s
->target
, true);
604 bdrv_set_on_error(s
->target
, on_target_error
, on_target_error
);
605 bdrv_iostatus_enable(s
->target
);
606 s
->common
.co
= qemu_coroutine_create(mirror_run
);
607 trace_mirror_start(bs
, s
, s
->common
.co
, opaque
);
608 qemu_coroutine_enter(s
->common
.co
, s
);
/* Public entry point that starts a mirror job from bs to target.
 *
 * mode selects how much is copied: MIRROR_SYNC_MODE_NONE sets is_none_mode,
 * and MIRROR_SYNC_MODE_TOP uses bs->backing_hd as the base (NULL for every
 * other mode).  All remaining arguments are forwarded unchanged to
 * mirror_start_job() together with mirror_job_driver. */
611 void mirror_start(BlockDriverState
*bs
, BlockDriverState
*target
,
612 int64_t speed
, int64_t granularity
, int64_t buf_size
,
613 MirrorSyncMode mode
, BlockdevOnError on_source_error
,
614 BlockdevOnError on_target_error
,
615 BlockDriverCompletionFunc
*cb
,
616 void *opaque
, Error
**errp
)
619 BlockDriverState
*base
;
621 is_none_mode
= mode
== MIRROR_SYNC_MODE_NONE
;
622 base
= mode
== MIRROR_SYNC_MODE_TOP
? bs
->backing_hd
: NULL
;
623 mirror_start_job(bs
, target
, speed
, granularity
, buf_size
,
624 on_source_error
, on_target_error
, cb
, opaque
, errp
,
625 &mirror_job_driver
, is_none_mode
, base
);
/* Public entry point that starts a commit of the active layer bs into base,
 * implemented as a mirror job whose target is base.
 *
 * bs         - top (active) device
 * base       - device that receives the data
 * on_error   - used as both the source and the target error policy
 * cb, opaque - completion callback and its argument
 * errp       - receives errors
 *
 * A speed value is forwarded to mirror_start_job(), but its parameter
 * declaration is not visible here.
 *
 * Visible steps: remember base's current flags and reopen base with
 * bs->open_flags; query the length of both devices and report an error if
 * either query fails; if bs is longer than base, grow base with
 * bdrv_truncate(); then call mirror_start_job() with granularity 0,
 * buf_size 0 and commit_active_job_driver.  On the failures visible after
 * the reopen, control jumps to error_restore_flags, where base is reopened
 * with the saved flags and the result of that reopen is deliberately
 * ignored so the original error is the one reported. */
628 void commit_active_start(BlockDriverState
*bs
, BlockDriverState
*base
,
630 BlockdevOnError on_error
,
631 BlockDriverCompletionFunc
*cb
,
632 void *opaque
, Error
**errp
)
634 int64_t length
, base_length
;
637 Error
*local_err
= NULL
;
639 orig_base_flags
= bdrv_get_flags(base
);
641 if (bdrv_reopen(base
, bs
->open_flags
, errp
)) {
645 length
= bdrv_getlength(bs
);
647 error_setg_errno(errp
, -length
,
648 "Unable to determine length of %s", bs
->filename
);
649 goto error_restore_flags
;
652 base_length
= bdrv_getlength(base
);
653 if (base_length
< 0) {
654 error_setg_errno(errp
, -base_length
,
655 "Unable to determine length of %s", base
->filename
);
656 goto error_restore_flags
;
659 if (length
> base_length
) {
660 ret
= bdrv_truncate(base
, length
);
662 error_setg_errno(errp
, -ret
,
663 "Top image %s is larger than base image %s, and "
664 "resize of base image failed",
665 bs
->filename
, base
->filename
);
666 goto error_restore_flags
;
671 mirror_start_job(bs
, base
, speed
, 0, 0,
672 on_error
, on_error
, cb
, opaque
, &local_err
,
673 &commit_active_job_driver
, false, base
);
674 if (error_is_set(&local_err
)) {
675 error_propagate(errp
, local_err
);
676 goto error_restore_flags
;
682 /* ignore error and errp for bdrv_reopen, because we want to propagate
683 * the original error */
684 bdrv_reopen(base
, orig_base_flags
, NULL
);