4 * Copyright Red Hat, Inc. 2012
7 * Paolo Bonzini <pbonzini@redhat.com>
9 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
10 * See the COPYING.LIB file in the top-level directory.
15 #include "block/blockjob.h"
16 #include "block/block_int.h"
17 #include "qemu/ratelimit.h"
18 #include "qemu/bitmap.h"
// SLICE_TIME: length of one time slice, in nanoseconds. mirror_run compares
// the time elapsed since its last pause against it and also uses it as the
// idle sleep delay; mirror_set_speed passes it to ratelimit_set_speed().
// MAX_IN_FLIGHT: value that s->in_flight is compared against in mirror_run;
// when they are equal the job coroutine yields instead of continuing.
20 #define SLICE_TIME 100000000ULL /* ns */
21 #define MAX_IN_FLIGHT 16
// List node for one free chunk of the mirroring buffer. The only member
// visible here is the QSIMPLEQ link `next`. Elsewhere in this file, chunk
// addresses (iov_base entries and offsets into s->buf) are cast to
// MirrorBuffer pointers and queued on s->buf_free.
23 /* The mirroring buffer is a list of granularity-sized chunks.
24 * Free chunks are organized in a list.
26 typedef struct MirrorBuffer
{
27 QSIMPLEQ_ENTRY(MirrorBuffer
) next
;
// Per-job state of a mirror block job.
// Members visible in this listing:
//   target            - destination BlockDriverState that data is written to
//   on_source_error,
//   on_target_error   - error policies handed to block_job_error_action()
//   cow_bitmap        - per-chunk bitmap, allocated in mirror_run only when the
//                       granularity is smaller than the target cluster size
//   dirty_bitmap      - dirty bitmap created on the source in mirror_start
//   buf_free          - queue of free MirrorBuffer chunks
//   in_flight_bitmap  - per-chunk bitmap of chunks with I/O pending
// NOTE(review): the functions in this file also use s->common, s->limit,
// s->mode, s->synced, s->should_complete, s->sector_num, s->granularity,
// s->buf_size, s->hbi, s->buf, s->buf_free_count, s->in_flight and s->ret;
// their declarations are missing from this listing - confirm against the
// complete file.
30 typedef struct MirrorBlockJob
{
33 BlockDriverState
*target
;
35 BlockdevOnError on_source_error
, on_target_error
;
41 unsigned long *cow_bitmap
;
42 BdrvDirtyBitmap
*dirty_bitmap
;
45 QSIMPLEQ_HEAD(, MirrorBuffer
) buf_free
;
48 unsigned long *in_flight_bitmap
;
// State of one in-flight copy operation. It is allocated with g_slice_new()
// in mirror_iteration, passed as the opaque pointer to the AIO callbacks, and
// released with g_slice_free() in mirror_iteration_done.
// NOTE(review): the member list is missing from this listing; the code in
// this file accesses op->s, op->qiov, op->sector_num and op->nb_sectors.
53 typedef struct MirrorOp
{
// Ask the block-job layer how to react to an I/O error.
// Two block_job_error_action() calls are visible: one for the source
// (s->common.bs with s->on_source_error and `true`) and one for the target
// (s->target with s->on_target_error and `false`).
// Callers in this file pass read=true from mirror_read_complete and
// read=false from mirror_write_complete and after bdrv_flush(); all of them
// pass the error as -ret.
// NOTE(review): the statement that selects between the two calls and the
// declaration of `error` are missing from this listing; presumably `read`
// selects the source call - confirm.
60 static BlockErrorAction
mirror_error_action(MirrorBlockJob
*s
, bool read
,
65 return block_job_error_action(&s
->common
, s
->common
.bs
,
66 s
->on_source_error
, true, error
);
68 return block_job_error_action(&s
->common
, s
->target
,
69 s
->on_target_error
, false, error
);
// Finish one copy operation, successful or not.
// Visible steps: trace the completion, append every buffer referenced by the
// operation's I/O vector to the tail of s->buf_free, clear the operation's
// chunks in s->in_flight_bitmap, mark them in s->cow_bitmap when that bitmap
// exists and ret >= 0, free the MirrorOp and re-enter the job coroutine.
// NOTE(review): the declarations of `iov` and `chunk_num`, and any update of
// s->in_flight or s->buf_free_count, are missing from this listing;
// presumably `iov` refers to op->qiov.iov - confirm.
73 static void mirror_iteration_done(MirrorOp
*op
, int ret
)
75 MirrorBlockJob
*s
= op
->s
;
78 int i
, nb_chunks
, sectors_per_chunk
;
80 trace_mirror_iteration_done(s
, op
->sector_num
, op
->nb_sectors
, ret
);
// Give every chunk used by this operation back to the free list.
84 for (i
= 0; i
< op
->qiov
.niov
; i
++) {
85 MirrorBuffer
*buf
= (MirrorBuffer
*) iov
[i
].iov_base
;
86 QSIMPLEQ_INSERT_TAIL(&s
->buf_free
, buf
, next
);
// Convert the sector range into chunk units; both divisions truncate.
90 sectors_per_chunk
= s
->granularity
>> BDRV_SECTOR_BITS
;
91 chunk_num
= op
->sector_num
/ sectors_per_chunk
;
92 nb_chunks
= op
->nb_sectors
/ sectors_per_chunk
;
93 bitmap_clear(s
->in_flight_bitmap
, chunk_num
, nb_chunks
);
// Only a non-negative ret records the chunks in the COW bitmap.
94 if (s
->cow_bitmap
&& ret
>= 0) {
95 bitmap_set(s
->cow_bitmap
, chunk_num
, nb_chunks
);
98 g_slice_free(MirrorOp
, op
);
// Resume the job coroutine, which may be waiting for buffers or chunks.
99 qemu_coroutine_enter(s
->common
.co
, NULL
);
// AIO completion callback for the write to the target; `opaque` is the
// MirrorOp and `ret` the result of the write.
// Visible statements: the operation's sector range is marked dirty again on
// the source, the error policy is queried with read=false, s->ret is tested
// when the action is BDRV_ACTION_REPORT, and mirror_iteration_done() is
// called with ret.
// NOTE(review): the condition guarding the error handling (presumably
// ret < 0) and the body of the inner `if` are missing from this listing -
// confirm.
102 static void mirror_write_complete(void *opaque
, int ret
)
104 MirrorOp
*op
= opaque
;
105 MirrorBlockJob
*s
= op
->s
;
107 BlockDriverState
*source
= s
->common
.bs
;
108 BlockErrorAction action
;
110 bdrv_set_dirty(source
, op
->sector_num
, op
->nb_sectors
);
111 action
= mirror_error_action(s
, false, -ret
);
112 if (action
== BDRV_ACTION_REPORT
&& s
->ret
>= 0) {
116 mirror_iteration_done(op
, ret
);
// AIO completion callback for the read from the source; `opaque` is the
// MirrorOp and `ret` the result of the read.
// Visible statements: an error path that marks the operation's range dirty
// again on the source, queries the error policy with read=true and calls
// mirror_iteration_done(); and a bdrv_aio_writev() that writes the same I/O
// vector to s->target at the same sector, with mirror_write_complete as the
// callback.
// NOTE(review): the condition guarding the error path (presumably ret < 0),
// the body of the inner `if` and the return that separates the error path
// from the write are missing from this listing - confirm.
119 static void mirror_read_complete(void *opaque
, int ret
)
121 MirrorOp
*op
= opaque
;
122 MirrorBlockJob
*s
= op
->s
;
124 BlockDriverState
*source
= s
->common
.bs
;
125 BlockErrorAction action
;
127 bdrv_set_dirty(source
, op
->sector_num
, op
->nb_sectors
);
128 action
= mirror_error_action(s
, true, -ret
);
129 if (action
== BDRV_ACTION_REPORT
&& s
->ret
>= 0) {
133 mirror_iteration_done(op
, ret
);
// Chain the write of the data just read to the mirror target.
136 bdrv_aio_writev(s
->target
, op
->sector_num
, &op
->qiov
, op
->nb_sectors
,
137 mirror_write_complete
, op
);
// One step of the copy loop: choose the next dirty range of the source and
// start a single asynchronous read for it.
// Visible flow:
//   1. Take the next dirty sector from s->hbi; when the iterator returns a
//      negative value, re-initialise it from s->dirty_bitmap and try again.
//   2. Yield while the first chunk is marked in s->in_flight_bitmap.
//   3. Grow the request one step at a time. A step stops at a sector that is
//      not dirty or whose chunk is in flight; when the chunk is not yet set in
//      s->cow_bitmap the step is rounded to target clusters; the step is
//      clamped to the end of the device; the coroutine yields while nothing
//      has been gathered yet and too few buffers are free.
//   4. Mark the chunks in s->in_flight_bitmap, allocate a MirrorOp, build its
//      QEMUIOVector from s->buf_free entries of s->granularity bytes each,
//      clear the dirty bits for the range and call bdrv_aio_readv() with
//      mirror_read_complete as the callback.
// NOTE(review): many lines are missing from this listing - braces, the `do`
// that pairs with the `while (next_sector < end)`, the exits from the
// growing loop, the declaration of `op`, the initial values of nb_sectors and
// nb_chunks, and any update of s->in_flight or s->buf_free_count. Confirm
// against the complete file before relying on details.
140 static void coroutine_fn
mirror_iteration(MirrorBlockJob
*s
)
142 BlockDriverState
*source
= s
->common
.bs
;
143 int nb_sectors
, sectors_per_chunk
, nb_chunks
;
144 int64_t end
, sector_num
, next_chunk
, next_sector
, hbitmap_next_sector
;
147 s
->sector_num
= hbitmap_iter_next(&s
->hbi
);
// A negative result restarts the iterator; the assert below then requires
// that a dirty sector is found.
148 if (s
->sector_num
< 0) {
149 bdrv_dirty_iter_init(source
, s
->dirty_bitmap
, &s
->hbi
);
150 s
->sector_num
= hbitmap_iter_next(&s
->hbi
);
151 trace_mirror_restart_iter(s
,
152 bdrv_get_dirty_count(source
, s
->dirty_bitmap
));
153 assert(s
->sector_num
>= 0);
156 hbitmap_next_sector
= s
->sector_num
;
157 sector_num
= s
->sector_num
;
158 sectors_per_chunk
= s
->granularity
>> BDRV_SECTOR_BITS
;
159 end
= s
->common
.len
>> BDRV_SECTOR_BITS
;
161 /* Extend the QEMUIOVector to include all adjacent blocks that will
162 * be copied in this operation.
164 * We have to do this if we have no backing file yet in the destination,
165 * and the cluster size is very large. Then we need to do COW ourselves.
166 * The first time a cluster is copied, copy it entirely. Note that,
167 * because both the granularity and the cluster size are powers of two,
168 * the number of sectors to copy cannot exceed one cluster.
170 * We also want to extend the QEMUIOVector to include more adjacent
171 * dirty blocks if possible, to limit the number of I/O operations and
172 * run efficiently even with a small granularity.
176 next_sector
= sector_num
;
177 next_chunk
= sector_num
/ sectors_per_chunk
;
179 /* Wait for I/O to this cluster (from a previous iteration) to be done. */
180 while (test_bit(next_chunk
, s
->in_flight_bitmap
)) {
181 trace_mirror_yield_in_flight(s
, sector_num
, s
->in_flight
);
182 qemu_coroutine_yield();
186 int added_sectors
, added_chunks
;
188 if (!bdrv_get_dirty(source
, s
->dirty_bitmap
, next_sector
) ||
189 test_bit(next_chunk
, s
->in_flight_bitmap
)) {
190 assert(nb_sectors
> 0);
194 added_sectors
= sectors_per_chunk
;
195 if (s
->cow_bitmap
&& !test_bit(next_chunk
, s
->cow_bitmap
)) {
196 bdrv_round_to_clusters(s
->target
,
197 next_sector
, added_sectors
,
198 &next_sector
, &added_sectors
);
200 /* On the first iteration, the rounding may make us copy
201 * sectors before the first dirty one.
203 if (next_sector
< sector_num
) {
204 assert(nb_sectors
== 0);
205 sector_num
= next_sector
;
206 next_chunk
= next_sector
/ sectors_per_chunk
;
210 added_sectors
= MIN(added_sectors
, end
- (sector_num
+ nb_sectors
));
211 added_chunks
= (added_sectors
+ sectors_per_chunk
- 1) / sectors_per_chunk
;
213 /* When doing COW, it may happen that there is not enough space for
214 * a full cluster. Wait if that is the case.
216 while (nb_chunks
== 0 && s
->buf_free_count
< added_chunks
) {
217 trace_mirror_yield_buf_busy(s
, nb_chunks
, s
->in_flight
);
218 qemu_coroutine_yield();
220 if (s
->buf_free_count
< nb_chunks
+ added_chunks
) {
221 trace_mirror_break_buf_busy(s
, nb_chunks
, s
->in_flight
);
225 /* We have enough free space to copy these sectors. */
226 bitmap_set(s
->in_flight_bitmap
, next_chunk
, added_chunks
);
228 nb_sectors
+= added_sectors
;
229 nb_chunks
+= added_chunks
;
230 next_sector
+= added_sectors
;
231 next_chunk
+= added_chunks
;
232 } while (next_sector
< end
);
234 /* Allocate a MirrorOp that is used as an AIO callback. */
235 op
= g_slice_new(MirrorOp
);
237 op
->sector_num
= sector_num
;
238 op
->nb_sectors
= nb_sectors
;
240 /* Now make a QEMUIOVector taking enough granularity-sized chunks
243 qemu_iovec_init(&op
->qiov
, nb_chunks
);
244 next_sector
= sector_num
;
245 while (nb_chunks
-- > 0) {
246 MirrorBuffer
*buf
= QSIMPLEQ_FIRST(&s
->buf_free
);
247 QSIMPLEQ_REMOVE_HEAD(&s
->buf_free
, next
);
249 qemu_iovec_add(&op
->qiov
, buf
, s
->granularity
);
251 /* Advance the HBitmapIter in parallel, so that we do not examine
252 * the same sector twice.
254 if (next_sector
> hbitmap_next_sector
255 && bdrv_get_dirty(source
, s
->dirty_bitmap
, next_sector
)) {
256 hbitmap_next_sector
= hbitmap_iter_next(&s
->hbi
);
259 next_sector
+= sectors_per_chunk
;
262 bdrv_reset_dirty(source
, sector_num
, nb_sectors
);
264 /* Copy the dirty cluster. */
266 trace_mirror_one_iteration(s
, sector_num
, nb_sectors
);
267 bdrv_aio_readv(source
, sector_num
, &op
->qiov
, nb_sectors
,
268 mirror_read_complete
, op
);
// Build the free-chunk list over the job's buffer.
// Asserts that s->buf_free_count is 0, initialises the s->buf_free queue, then
// loops while buf_size is non-zero, queueing the current buffer position as a
// MirrorBuffer and subtracting one granularity from buf_size per iteration.
// NOTE(review): the statements that advance `buf` and increment
// s->buf_free_count are missing from this listing. As written the loop ends
// only when buf_size reaches exactly 0, i.e. it relies on s->buf_size being a
// multiple of s->granularity - confirm the callers guarantee that.
271 static void mirror_free_init(MirrorBlockJob
*s
)
273 int granularity
= s
->granularity
;
274 size_t buf_size
= s
->buf_size
;
275 uint8_t *buf
= s
->buf
;
277 assert(s
->buf_free_count
== 0);
278 QSIMPLEQ_INIT(&s
->buf_free
);
279 while (buf_size
!= 0) {
280 MirrorBuffer
*cur
= (MirrorBuffer
*)buf
;
281 QSIMPLEQ_INSERT_TAIL(&s
->buf_free
, cur
, next
);
283 buf_size
-= granularity
;
// Wait until no copy operation is in flight: yields the current coroutine
// repeatedly while s->in_flight is greater than zero.
288 static void mirror_drain(MirrorBlockJob
*s
)
290 while (s
->in_flight
> 0) {
291 qemu_coroutine_yield();
// Coroutine entry point of the mirror job; `opaque` is the MirrorBlockJob.
// Visible phases:
//   1. Setup: check for cancellation; read the source length into
//      s->common.len and complete the job with that value when it is <= 0;
//      allocate s->in_flight_bitmap with one bit per granularity-sized chunk;
//      when the target names a backing file but has none opened and the
//      granularity is smaller than the target cluster size, raise s->buf_size
//      to at least one cluster and allocate s->cow_bitmap; allocate s->buf.
//   2. Unless the mode is MIRROR_SYNC_MODE_NONE, walk the device with
//      bdrv_is_allocated_above() (base is NULL for MIRROR_SYNC_MODE_FULL,
//      otherwise the source's backing_hd) and mark ranges dirty.
//   3. Main loop: read the dirty count; yield when in-flight I/O, free
//      buffers or the dirty count require it; once nothing is in flight and
//      nothing is dirty, flush the target and signal readiness with
//      block_job_ready(); publish progress in s->common.offset; sleep
//      according to the rate limit or SLICE_TIME; leave when the disks are in
//      sync and completion or cancellation was requested.
//   4. Teardown: free both bitmaps, release the dirty bitmap, disable
//      iostatus on the target; when completion was requested and ret is 0,
//      reopen the target with the source's flags if they differ and swap it
//      with the source; then close and unref the target and report ret via
//      block_job_completed().
// NOTE(review): many lines are missing from this listing - declarations of
// ret, n, cnt, delay_ns and bdi, the loop header, goto/break/return
// statements, the calls to mirror_free_init(), mirror_iteration() and
// mirror_drain(), and updates of s->synced and s->ret. Confirm against the
// complete file before relying on details.
295 static void coroutine_fn
mirror_run(void *opaque
)
297 MirrorBlockJob
*s
= opaque
;
298 BlockDriverState
*bs
= s
->common
.bs
;
299 int64_t sector_num
, end
, sectors_per_chunk
, length
;
300 uint64_t last_pause_ns
;
302 char backing_filename
[1024];
306 if (block_job_is_cancelled(&s
->common
)) {
310 s
->common
.len
= bdrv_getlength(bs
);
311 if (s
->common
.len
<= 0) {
312 block_job_completed(&s
->common
, s
->common
.len
);
// Number of granularity-sized chunks, rounded up.
316 length
= (bdrv_getlength(bs
) + s
->granularity
- 1) / s
->granularity
;
317 s
->in_flight_bitmap
= bitmap_new(length
);
319 /* If we have no backing file yet in the destination, we cannot let
320 * the destination do COW. Instead, we copy sectors around the
321 * dirty data if needed. We need a bitmap to do that.
323 bdrv_get_backing_filename(s
->target
, backing_filename
,
324 sizeof(backing_filename
));
325 if (backing_filename
[0] && !s
->target
->backing_hd
) {
326 bdrv_get_info(s
->target
, &bdi
);
327 if (s
->granularity
< bdi
.cluster_size
) {
328 s
->buf_size
= MAX(s
->buf_size
, bdi
.cluster_size
);
329 s
->cow_bitmap
= bitmap_new(length
);
333 end
= s
->common
.len
>> BDRV_SECTOR_BITS
;
334 s
->buf
= qemu_blockalign(bs
, s
->buf_size
);
335 sectors_per_chunk
= s
->granularity
>> BDRV_SECTOR_BITS
;
338 if (s
->mode
!= MIRROR_SYNC_MODE_NONE
) {
339 /* First part, loop on the sectors and initialize the dirty bitmap. */
340 BlockDriverState
*base
;
341 base
= s
->mode
== MIRROR_SYNC_MODE_FULL
? NULL
: bs
->backing_hd
;
342 for (sector_num
= 0; sector_num
< end
; ) {
// `next` is the first sector of the following chunk.
343 int64_t next
= (sector_num
| (sectors_per_chunk
- 1)) + 1;
344 ret
= bdrv_is_allocated_above(bs
, base
,
345 sector_num
, next
- sector_num
, &n
);
353 bdrv_set_dirty(bs
, sector_num
, n
);
361 bdrv_dirty_iter_init(bs
, s
->dirty_bitmap
, &s
->hbi
);
362 last_pause_ns
= qemu_clock_get_ns(QEMU_CLOCK_REALTIME
);
366 bool should_complete
;
373 cnt
= bdrv_get_dirty_count(bs
, s
->dirty_bitmap
);
375 /* Note that even when no rate limit is applied we need to yield
376 * periodically with no pending I/O so that qemu_aio_flush() returns.
377 * We do so every SLICE_TIME nanoseconds, or when there is an error,
378 * or when the source is clean, whichever comes first.
380 if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME
) - last_pause_ns
< SLICE_TIME
&&
381 s
->common
.iostatus
== BLOCK_DEVICE_IO_STATUS_OK
) {
382 if (s
->in_flight
== MAX_IN_FLIGHT
|| s
->buf_free_count
== 0 ||
383 (cnt
== 0 && s
->in_flight
> 0)) {
384 trace_mirror_yield(s
, s
->in_flight
, s
->buf_free_count
, cnt
);
385 qemu_coroutine_yield();
387 } else if (cnt
!= 0) {
393 should_complete
= false;
394 if (s
->in_flight
== 0 && cnt
== 0) {
395 trace_mirror_before_flush(s
);
396 ret
= bdrv_flush(s
->target
);
398 if (mirror_error_action(s
, false, -ret
) == BDRV_ACTION_REPORT
) {
402 /* We're out of the streaming phase. From now on, if the job
403 * is cancelled we will actually complete all pending I/O and
404 * report completion. This way, block-job-cancel will leave
405 * the target in a consistent state.
407 s
->common
.offset
= end
* BDRV_SECTOR_SIZE
;
409 block_job_ready(&s
->common
);
413 should_complete
= s
->should_complete
||
414 block_job_is_cancelled(&s
->common
);
415 cnt
= bdrv_get_dirty_count(bs
, s
->dirty_bitmap
);
419 if (cnt
== 0 && should_complete
) {
420 /* The dirty bitmap is not updated while operations are pending.
421 * If we're about to exit, wait for pending operations before
422 * calling bdrv_get_dirty_count(bs), or we may exit while the
423 * source has dirty data to copy!
425 * Note that I/O can be submitted by the guest while
426 * mirror_populate runs.
428 trace_mirror_before_drain(s
, cnt
);
430 cnt
= bdrv_get_dirty_count(bs
, s
->dirty_bitmap
);
434 trace_mirror_before_sleep(s
, cnt
, s
->synced
);
436 /* Publish progress */
437 s
->common
.offset
= (end
- cnt
) * BDRV_SECTOR_SIZE
;
439 if (s
->common
.speed
) {
440 delay_ns
= ratelimit_calculate_delay(&s
->limit
, sectors_per_chunk
);
445 block_job_sleep_ns(&s
->common
, QEMU_CLOCK_REALTIME
, delay_ns
);
446 if (block_job_is_cancelled(&s
->common
)) {
449 } else if (!should_complete
) {
// With nothing in flight and nothing dirty, sleep a full slice;
// otherwise sleep for 0 ns.
450 delay_ns
= (s
->in_flight
== 0 && cnt
== 0 ? SLICE_TIME
: 0);
451 block_job_sleep_ns(&s
->common
, QEMU_CLOCK_REALTIME
, delay_ns
);
452 } else if (cnt
== 0) {
453 /* The two disks are in sync. Exit and report successful
456 assert(QLIST_EMPTY(&bs
->tracked_requests
));
457 s
->common
.cancelled
= false;
460 last_pause_ns
= qemu_clock_get_ns(QEMU_CLOCK_REALTIME
);
464 if (s
->in_flight
> 0) {
465 /* We get here only if something went wrong. Either the job failed,
466 * or it was cancelled prematurely so that we do not guarantee that
467 * the target is a copy of the source.
469 assert(ret
< 0 || (!s
->synced
&& block_job_is_cancelled(&s
->common
)));
473 assert(s
->in_flight
== 0);
475 g_free(s
->cow_bitmap
);
476 g_free(s
->in_flight_bitmap
);
477 bdrv_release_dirty_bitmap(bs
, s
->dirty_bitmap
);
478 bdrv_iostatus_disable(s
->target
);
// The swap happens only when completion was requested and ret is 0.
479 if (s
->should_complete
&& ret
== 0) {
480 if (bdrv_get_flags(s
->target
) != bdrv_get_flags(s
->common
.bs
)) {
481 bdrv_reopen(s
->target
, bdrv_get_flags(s
->common
.bs
), NULL
);
483 bdrv_swap(s
->target
, s
->common
.bs
);
485 bdrv_close(s
->target
);
486 bdrv_unref(s
->target
);
487 block_job_completed(&s
->common
, ret
);
// BlockJobDriver .set_speed callback.
// Visible statements: an error_set() reporting QERR_INVALID_PARAMETER for
// "speed", and a ratelimit_set_speed() call that configures s->limit with
// speed / BDRV_SECTOR_SIZE per SLICE_TIME.
// NOTE(review): the condition that triggers the error (presumably a negative
// speed) and the return after it are missing from this listing - confirm.
490 static void mirror_set_speed(BlockJob
*job
, int64_t speed
, Error
**errp
)
492 MirrorBlockJob
*s
= container_of(job
, MirrorBlockJob
, common
);
495 error_set(errp
, QERR_INVALID_PARAMETER
, "speed");
498 ratelimit_set_speed(&s
->limit
, speed
/ BDRV_SECTOR_SIZE
, SLICE_TIME
);
// BlockJobDriver .iostatus_reset callback: recovers the MirrorBlockJob from
// the embedded BlockJob and calls bdrv_iostatus_reset() on its target.
501 static void mirror_iostatus_reset(BlockJob
*job
)
503 MirrorBlockJob
*s
= container_of(job
, MirrorBlockJob
, common
);
505 bdrv_iostatus_reset(s
->target
);
// BlockJobDriver .complete callback: request that the job finish.
// Visible statements: open the target's backing file with
// bdrv_open_backing_file(); an error path that fetches the full backing file
// name and propagates local_err to errp; an error_set() reporting
// QERR_BLOCK_JOB_NOT_READY with the device name; and, finally, setting
// s->should_complete to true and resuming the job with block_job_resume().
// NOTE(review): the declaration of `ret`, the conditions guarding both error
// paths (presumably ret < 0 and the job not yet being synced) and their
// returns are missing from this listing - confirm.
508 static void mirror_complete(BlockJob
*job
, Error
**errp
)
510 MirrorBlockJob
*s
= container_of(job
, MirrorBlockJob
, common
);
511 Error
*local_err
= NULL
;
514 ret
= bdrv_open_backing_file(s
->target
, NULL
, &local_err
);
516 char backing_filename
[PATH_MAX
];
517 bdrv_get_full_backing_filename(s
->target
, backing_filename
,
518 sizeof(backing_filename
));
519 error_propagate(errp
, local_err
);
523 error_set(errp
, QERR_BLOCK_JOB_NOT_READY
, job
->bs
->device_name
);
// mirror_run reads this flag when deciding whether to exit its loop.
527 s
->should_complete
= true;
528 block_job_resume(job
);
// Driver table for mirror jobs, passed to block_job_create() in mirror_start.
// It sets the instance size to sizeof(MirrorBlockJob), the job type to
// BLOCK_JOB_TYPE_MIRROR, and wires the set_speed, iostatus_reset and complete
// callbacks to the functions defined in this file.
531 static const BlockJobDriver mirror_job_driver
= {
532 .instance_size
= sizeof(MirrorBlockJob
),
533 .job_type
= BLOCK_JOB_TYPE_MIRROR
,
534 .set_speed
= mirror_set_speed
,
535 .iostatus_reset
= mirror_iostatus_reset
,
536 .complete
= mirror_complete
,
// Create and start a mirror job copying `bs` to `target`.
// Parameters, as used by the visible code:
//   bs               - source device; the job and its dirty bitmap are created on it
//   target           - destination device
//   speed            - forwarded to block_job_create()
//   granularity      - chunk size in bytes; 0 selects a default, and the
//                      resulting value is asserted to be a power of two
//   buf_size         - requested buffer size; raised to at least granularity
//   mode             - sync mode (its use is not visible in this function)
//   on_source_error,
//   on_target_error  - error policies stored in the job
//   cb, opaque       - completion callback and its argument, forwarded to
//                      block_job_create()
//   errp             - receives errors
// Visible flow: choose a default granularity from the target cluster size
// clamped to the range 4096..65536 when bdrv_get_info() succeeds and reports
// a non-zero cluster size; reject the STOP and ENOSPC source policies with
// QERR_INVALID_PARAMETER "on-source-error" when iostatus is not enabled on
// bs; create the job; fill in its fields; create the dirty bitmap; enable the
// write cache, set the error policy and enable iostatus on the target; then
// create and enter the mirror_run coroutine.
// NOTE(review): the declarations of `s` and `bdi`, the fallback granularity
// used when the target gives no cluster size, the check of
// block_job_create()'s result, and the assignments of s->target and s->mode
// are missing from this listing - confirm against the complete file.
539 void mirror_start(BlockDriverState
*bs
, BlockDriverState
*target
,
540 int64_t speed
, int64_t granularity
, int64_t buf_size
,
541 MirrorSyncMode mode
, BlockdevOnError on_source_error
,
542 BlockdevOnError on_target_error
,
543 BlockDriverCompletionFunc
*cb
,
544 void *opaque
, Error
**errp
)
548 if (granularity
== 0) {
549 /* Choose the default granularity based on the target file's cluster
550 * size, clamped between 4k and 64k. */
552 if (bdrv_get_info(target
, &bdi
) >= 0 && bdi
.cluster_size
!= 0) {
553 granularity
= MAX(4096, bdi
.cluster_size
);
554 granularity
= MIN(65536, granularity
);
// Power-of-two check: exactly one bit set (also satisfied by 0).
560 assert ((granularity
& (granularity
- 1)) == 0);
562 if ((on_source_error
== BLOCKDEV_ON_ERROR_STOP
||
563 on_source_error
== BLOCKDEV_ON_ERROR_ENOSPC
) &&
564 !bdrv_iostatus_is_enabled(bs
)) {
565 error_set(errp
, QERR_INVALID_PARAMETER
, "on-source-error");
569 s
= block_job_create(&mirror_job_driver
, bs
, speed
, cb
, opaque
, errp
);
574 s
->on_source_error
= on_source_error
;
575 s
->on_target_error
= on_target_error
;
578 s
->granularity
= granularity
;
579 s
->buf_size
= MAX(buf_size
, granularity
);
581 s
->dirty_bitmap
= bdrv_create_dirty_bitmap(bs
, granularity
);
582 bdrv_set_enable_write_cache(s
->target
, true);
583 bdrv_set_on_error(s
->target
, on_target_error
, on_target_error
);
584 bdrv_iostatus_enable(s
->target
);
585 s
->common
.co
= qemu_coroutine_create(mirror_run
);
586 trace_mirror_start(bs
, s
, s
->common
.co
, opaque
);
// The job pointer is handed to mirror_run as its opaque argument.
587 qemu_coroutine_enter(s
->common
.co
, s
);