4 * Copyright Red Hat, Inc. 2012
7 * Paolo Bonzini <pbonzini@redhat.com>
9 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
10 * See the COPYING.LIB file in the top-level directory.
15 #include "block/blockjob.h"
16 #include "block/block_int.h"
17 #include "qemu/ratelimit.h"
18 #include "qemu/bitmap.h"
/* Length of one time slice, in nanoseconds.  In this file it bounds the time
 * between two pauses of the job's main loop, is used as the sleep delay when
 * the job is idle, and is the slice length passed to ratelimit_set_speed().
 */
20 #define SLICE_TIME 100000000ULL /* ns */
/* Number of in-flight operations at which the main loop yields rather than
 * starting another iteration.
 */
21 #define MAX_IN_FLIGHT 16
23 /* The mirroring buffer is a list of granularity-sized chunks.
24 * Free chunks are organized in a list.
26 typedef struct MirrorBuffer
{
27 QSIMPLEQ_ENTRY(MirrorBuffer
) next
;
/* Per-job state of a mirror block job.
 *
 * NOTE(review): only some members are visible in this listing.  Code in this
 * file also accesses common, limit, mode, synced, should_complete,
 * sector_num, granularity, buf_size, hbi, buf, buf_free_count, in_flight and
 * ret through a MirrorBlockJob pointer; their declarations are not shown.
 *
 * Visible members:
 *   target            - BlockDriverState the copied data is written to
 *   on_source_error,
 *   on_target_error   - error policies handed to block_job_error_action()
 *   cow_bitmap        - one bit per chunk, set after a chunk was copied
 *                       successfully; allocated only when the target names a
 *                       backing file that is not open and the granularity is
 *                       smaller than the target cluster size
 *   buf_free          - queue of currently unused MirrorBuffer chunks
 *   in_flight_bitmap  - one bit per chunk, set while that chunk is being
 *                       copied
 */
30 typedef struct MirrorBlockJob
{
33 BlockDriverState
*target
;
35 BlockdevOnError on_source_error
, on_target_error
;
41 unsigned long *cow_bitmap
;
44 QSIMPLEQ_HEAD(, MirrorBuffer
) buf_free
;
47 unsigned long *in_flight_bitmap
;
/* State of one copy operation; a pointer to it is passed to the AIO
 * completion callbacks as their opaque argument.
 *
 * NOTE(review): the member declarations are not visible in this listing.
 * Code in this file accesses op->s, op->qiov, op->sector_num and
 * op->nb_sectors.
 */
52 typedef struct MirrorOp
{
/* Decide how the job reacts to an I/O error.
 *
 * Forwards to block_job_error_action() in one of two forms: with the source
 * device (s->common.bs), s->on_source_error and `true`, or with the target
 * device, s->on_target_error and `false`.
 *
 * NOTE(review): the condition selecting between the two calls and the
 * declaration of `error` are not visible in this listing; presumably `read`
 * selects the source variant -- confirm against the full file.
 *
 * @s:    the mirror job
 * @read: callers in this file pass true after a failed read and false after
 *        a failed write or flush
 *
 * Returns the BlockErrorAction produced by block_job_error_action().
 */
59 static BlockErrorAction
mirror_error_action(MirrorBlockJob
*s
, bool read
,
64 return block_job_error_action(&s
->common
, s
->common
.bs
,
65 s
->on_source_error
, true, error
);
67 return block_job_error_action(&s
->common
, s
->target
,
68 s
->on_target_error
, false, error
);
/* Finish one copy operation and wake up the job coroutine.
 *
 * Visible steps: every buffer referenced by the operation's I/O vector is
 * appended to s->buf_free again; the chunks covered by the operation are
 * cleared in s->in_flight_bitmap and, when a cow_bitmap exists and ret >= 0,
 * set in s->cow_bitmap; the MirrorOp is released with g_slice_free() and the
 * job coroutine (s->common.co) is entered.
 *
 * NOTE(review): the declarations of `iov` and `chunk_num`, the assignment of
 * `iov`, and any counter updates are not visible in this listing.
 *
 * NOTE(review): nb_chunks is computed with a truncating division, whereas
 * mirror_iteration() rounds added_chunks up before setting in-flight bits.
 * If nb_sectors is not a multiple of sectors_per_chunk, the bit of the
 * trailing partial chunk would stay set -- confirm.
 *
 * @op:  operation being completed; freed by this function
 * @ret: result of the operation; values >= 0 count as success for cow_bitmap
 */
72 static void mirror_iteration_done(MirrorOp
*op
, int ret
)
74 MirrorBlockJob
*s
= op
->s
;
77 int i
, nb_chunks
, sectors_per_chunk
;
79 trace_mirror_iteration_done(s
, op
->sector_num
, op
->nb_sectors
, ret
);
83 for (i
= 0; i
< op
->qiov
.niov
; i
++) {
84 MirrorBuffer
*buf
= (MirrorBuffer
*) iov
[i
].iov_base
;
85 QSIMPLEQ_INSERT_TAIL(&s
->buf_free
, buf
, next
);
89 sectors_per_chunk
= s
->granularity
>> BDRV_SECTOR_BITS
;
90 chunk_num
= op
->sector_num
/ sectors_per_chunk
;
91 nb_chunks
= op
->nb_sectors
/ sectors_per_chunk
;
92 bitmap_clear(s
->in_flight_bitmap
, chunk_num
, nb_chunks
);
93 if (s
->cow_bitmap
&& ret
>= 0) {
94 bitmap_set(s
->cow_bitmap
, chunk_num
, nb_chunks
);
97 g_slice_free(MirrorOp
, op
);
98 qemu_coroutine_enter(s
->common
.co
, NULL
);
/* AIO completion callback for the write to the target.
 *
 * Visible error handling: the operation's sector range is marked dirty on the
 * source again, mirror_error_action() is consulted with read == false and
 * -ret, and a branch is taken when the action is BDRV_ACTION_REPORT and
 * s->ret is still >= 0.  The operation is then finished through
 * mirror_iteration_done().
 *
 * NOTE(review): the guard around the error handling (presumably ret < 0) and
 * the body of the BDRV_ACTION_REPORT branch are not visible in this listing.
 *
 * @opaque: the MirrorOp of the completed write
 * @ret:    result of the write
 */
101 static void mirror_write_complete(void *opaque
, int ret
)
103 MirrorOp
*op
= opaque
;
104 MirrorBlockJob
*s
= op
->s
;
106 BlockDriverState
*source
= s
->common
.bs
;
107 BlockErrorAction action
;
109 bdrv_set_dirty(source
, op
->sector_num
, op
->nb_sectors
);
110 action
= mirror_error_action(s
, false, -ret
);
111 if (action
== BDRV_ACTION_REPORT
&& s
->ret
>= 0) {
115 mirror_iteration_done(op
, ret
);
/* AIO completion callback for the read from the source.
 *
 * Visible error handling: the operation's sector range is marked dirty on the
 * source again, mirror_error_action() is consulted with read == true and
 * -ret, a branch is taken when the action is BDRV_ACTION_REPORT and s->ret is
 * still >= 0, and the operation is finished through mirror_iteration_done().
 * Otherwise the data in op->qiov is written to s->target with
 * bdrv_aio_writev(), using mirror_write_complete() as completion callback.
 *
 * NOTE(review): the guard around the error handling (presumably ret < 0), the
 * body of the BDRV_ACTION_REPORT branch and the return that must separate
 * the error path from the write are not visible in this listing.
 *
 * @opaque: the MirrorOp of the completed read
 * @ret:    result of the read
 */
118 static void mirror_read_complete(void *opaque
, int ret
)
120 MirrorOp
*op
= opaque
;
121 MirrorBlockJob
*s
= op
->s
;
123 BlockDriverState
*source
= s
->common
.bs
;
124 BlockErrorAction action
;
126 bdrv_set_dirty(source
, op
->sector_num
, op
->nb_sectors
);
127 action
= mirror_error_action(s
, true, -ret
);
128 if (action
== BDRV_ACTION_REPORT
&& s
->ret
>= 0) {
132 mirror_iteration_done(op
, ret
);
135 bdrv_aio_writev(s
->target
, op
->sector_num
, &op
->qiov
, op
->nb_sectors
,
136 mirror_write_complete
, op
);
/* Start one copy operation for the next dirty region of the source.
 *
 * NOTE(review): several lines of this function are not visible in this
 * listing (initialisation of nb_sectors and nb_chunks, the `do` keyword and
 * `break` statements of the extension loop, the declaration of `op`, and
 * counter updates).  The outline below covers only what can be read here.
 *
 * Visible outline:
 *  1. Fetch the next dirty sector from the bitmap iterator s->hbi; when the
 *     iterator is exhausted, re-initialise it on the source and fetch again
 *     (the result is asserted to be >= 0).
 *  2. Yield while the chunk containing that sector is marked in
 *     s->in_flight_bitmap.
 *  3. Extend the request chunk by chunk while next_sector < end.  The
 *     extension stops at a sector that is clean or whose chunk is in flight.
 *     For chunks not yet set in cow_bitmap the range is rounded to target
 *     clusters, which may move sector_num backwards on the first pass.  The
 *     range is clamped to the end of the device, and added_chunks is rounded
 *     up.  If no chunk has been collected yet, the coroutine yields until
 *     enough free buffers exist.  Accepted chunks are set in
 *     s->in_flight_bitmap.
 *  4. Allocate a MirrorOp, record sector_num and nb_sectors, and build its
 *     I/O vector from nb_chunks buffers taken from the head of s->buf_free,
 *     each added with a length of s->granularity.  The bitmap iterator is
 *     advanced in parallel so that a sector is not examined twice.
 *  5. Clear the dirty bits of the range on the source and submit the read
 *     with bdrv_aio_readv(); mirror_read_complete() is the callback.
 *
 * @s: the mirror job
 */
139 static void coroutine_fn
mirror_iteration(MirrorBlockJob
*s
)
141 BlockDriverState
*source
= s
->common
.bs
;
142 int nb_sectors
, sectors_per_chunk
, nb_chunks
;
143 int64_t end
, sector_num
, next_chunk
, next_sector
, hbitmap_next_sector
;
146 s
->sector_num
= hbitmap_iter_next(&s
->hbi
);
147 if (s
->sector_num
< 0) {
148 bdrv_dirty_iter_init(source
, &s
->hbi
);
149 s
->sector_num
= hbitmap_iter_next(&s
->hbi
);
150 trace_mirror_restart_iter(s
, bdrv_get_dirty_count(source
));
151 assert(s
->sector_num
>= 0);
154 hbitmap_next_sector
= s
->sector_num
;
155 sector_num
= s
->sector_num
;
156 sectors_per_chunk
= s
->granularity
>> BDRV_SECTOR_BITS
;
157 end
= s
->common
.len
>> BDRV_SECTOR_BITS
;
159 /* Extend the QEMUIOVector to include all adjacent blocks that will
160 * be copied in this operation.
162 * We have to do this if we have no backing file yet in the destination,
163 * and the cluster size is very large. Then we need to do COW ourselves.
164 * The first time a cluster is copied, copy it entirely. Note that,
165 * because both the granularity and the cluster size are powers of two,
166 * the number of sectors to copy cannot exceed one cluster.
168 * We also want to extend the QEMUIOVector to include more adjacent
169 * dirty blocks if possible, to limit the number of I/O operations and
170 * run efficiently even with a small granularity.
174 next_sector
= sector_num
;
175 next_chunk
= sector_num
/ sectors_per_chunk
;
177 /* Wait for I/O to this cluster (from a previous iteration) to be done. */
178 while (test_bit(next_chunk
, s
->in_flight_bitmap
)) {
179 trace_mirror_yield_in_flight(s
, sector_num
, s
->in_flight
);
180 qemu_coroutine_yield();
184 int added_sectors
, added_chunks
;
186 if (!bdrv_get_dirty(source
, next_sector
) ||
187 test_bit(next_chunk
, s
->in_flight_bitmap
)) {
188 assert(nb_sectors
> 0);
192 added_sectors
= sectors_per_chunk
;
193 if (s
->cow_bitmap
&& !test_bit(next_chunk
, s
->cow_bitmap
)) {
194 bdrv_round_to_clusters(s
->target
,
195 next_sector
, added_sectors
,
196 &next_sector
, &added_sectors
);
198 /* On the first iteration, the rounding may make us copy
199 * sectors before the first dirty one.
201 if (next_sector
< sector_num
) {
202 assert(nb_sectors
== 0);
203 sector_num
= next_sector
;
204 next_chunk
= next_sector
/ sectors_per_chunk
;
208 added_sectors
= MIN(added_sectors
, end
- (sector_num
+ nb_sectors
));
209 added_chunks
= (added_sectors
+ sectors_per_chunk
- 1) / sectors_per_chunk
;
211 /* When doing COW, it may happen that there is not enough space for
212 * a full cluster. Wait if that is the case.
214 while (nb_chunks
== 0 && s
->buf_free_count
< added_chunks
) {
215 trace_mirror_yield_buf_busy(s
, nb_chunks
, s
->in_flight
);
216 qemu_coroutine_yield();
218 if (s
->buf_free_count
< nb_chunks
+ added_chunks
) {
219 trace_mirror_break_buf_busy(s
, nb_chunks
, s
->in_flight
);
223 /* We have enough free space to copy these sectors. */
224 bitmap_set(s
->in_flight_bitmap
, next_chunk
, added_chunks
);
226 nb_sectors
+= added_sectors
;
227 nb_chunks
+= added_chunks
;
228 next_sector
+= added_sectors
;
229 next_chunk
+= added_chunks
;
230 } while (next_sector
< end
);
232 /* Allocate a MirrorOp that is used as an AIO callback. */
233 op
= g_slice_new(MirrorOp
);
235 op
->sector_num
= sector_num
;
236 op
->nb_sectors
= nb_sectors
;
238 /* Now make a QEMUIOVector taking enough granularity-sized chunks
241 qemu_iovec_init(&op
->qiov
, nb_chunks
);
242 next_sector
= sector_num
;
243 while (nb_chunks
-- > 0) {
244 MirrorBuffer
*buf
= QSIMPLEQ_FIRST(&s
->buf_free
);
245 QSIMPLEQ_REMOVE_HEAD(&s
->buf_free
, next
);
247 qemu_iovec_add(&op
->qiov
, buf
, s
->granularity
);
249 /* Advance the HBitmapIter in parallel, so that we do not examine
250 * the same sector twice.
252 if (next_sector
> hbitmap_next_sector
&& bdrv_get_dirty(source
, next_sector
)) {
253 hbitmap_next_sector
= hbitmap_iter_next(&s
->hbi
);
256 next_sector
+= sectors_per_chunk
;
259 bdrv_reset_dirty(source
, sector_num
, nb_sectors
);
261 /* Copy the dirty cluster. */
263 trace_mirror_one_iteration(s
, sector_num
, nb_sectors
);
264 bdrv_aio_readv(source
, sector_num
, &op
->qiov
, nb_sectors
,
265 mirror_read_complete
, op
);
/* Build the list of free chunks over the job's copy buffer.
 *
 * Asserts that s->buf_free_count is 0, initialises s->buf_free, and then
 * links MirrorBuffer headers located inside s->buf onto the tail of the list,
 * reducing the remaining size by one granularity per pass.
 *
 * NOTE(review): the statements that advance `buf` and update
 * s->buf_free_count are not visible in this listing.
 *
 * NOTE(review): the loop ends only when buf_size reaches exactly 0, so it
 * relies on s->buf_size being a multiple of s->granularity.  mirror_start()
 * only guarantees buf_size >= granularity (MAX), not a multiple; with an
 * unsigned size_t a non-multiple would wrap around -- confirm that callers
 * round buf_size.
 *
 * @s: the mirror job; s->buf, s->buf_size and s->granularity must be set
 */
268 static void mirror_free_init(MirrorBlockJob
*s
)
270 int granularity
= s
->granularity
;
271 size_t buf_size
= s
->buf_size
;
272 uint8_t *buf
= s
->buf
;
274 assert(s
->buf_free_count
== 0);
275 QSIMPLEQ_INIT(&s
->buf_free
);
276 while (buf_size
!= 0) {
277 MirrorBuffer
*cur
= (MirrorBuffer
*)buf
;
278 QSIMPLEQ_INSERT_TAIL(&s
->buf_free
, cur
, next
);
280 buf_size
-= granularity
;
285 static void mirror_drain(MirrorBlockJob
*s
)
287 while (s
->in_flight
> 0) {
288 qemu_coroutine_yield();
/* Coroutine entry point of the mirror job.
 *
 * @opaque: the MirrorBlockJob; mirror_start() enters this coroutine with it
 *
 * NOTE(review): many lines of this function (loop headers, several local
 * declarations, gotos/returns and some bookkeeping) are not visible in this
 * listing.  The outline below covers only what can be read here.
 *
 * Visible outline:
 *  - test block_job_is_cancelled(); store the source length in
 *    s->common.len and call block_job_completed() with it when it is <= 0;
 *  - allocate in_flight_bitmap with one bit per granularity-sized chunk;
 *  - when the target names a backing file that is not open and the
 *    granularity is smaller than the target cluster size, raise buf_size to
 *    at least one cluster and allocate cow_bitmap;
 *  - allocate the copy buffer with qemu_blockalign();
 *  - unless mode is MIRROR_SYNC_MODE_NONE, walk the device with
 *    bdrv_is_allocated_above() (base is NULL for MIRROR_SYNC_MODE_FULL,
 *    otherwise bs->backing_hd) and mark ranges dirty with bdrv_set_dirty();
 *  - main loop: depending on in_flight, buf_free_count and the dirty count,
 *    yield or continue; in the branch taken when nothing is dirty or in
 *    flight, flush the target, call block_job_ready() and evaluate
 *    should_complete; publish progress in s->common.offset; sleep through
 *    block_job_sleep_ns();
 *  - teardown: free both bitmaps, switch dirty tracking off, disable
 *    iostatus on the target and, if completion was requested and ret == 0,
 *    reopen the target with the source's flags when they differ and
 *    bdrv_swap() the two; finally close and unref the target and report ret
 *    through block_job_completed().
 *
 * NOTE(review): bdrv_getlength(bs) is called a second time to size the
 * bitmaps instead of reusing s->common.len, and that second result is not
 * checked in the visible code -- confirm this is intentional.
 */
292 static void coroutine_fn
mirror_run(void *opaque
)
294 MirrorBlockJob
*s
= opaque
;
295 BlockDriverState
*bs
= s
->common
.bs
;
296 int64_t sector_num
, end
, sectors_per_chunk
, length
;
297 uint64_t last_pause_ns
;
299 char backing_filename
[1024];
303 if (block_job_is_cancelled(&s
->common
)) {
307 s
->common
.len
= bdrv_getlength(bs
);
308 if (s
->common
.len
<= 0) {
309 block_job_completed(&s
->common
, s
->common
.len
);
313 length
= (bdrv_getlength(bs
) + s
->granularity
- 1) / s
->granularity
;
314 s
->in_flight_bitmap
= bitmap_new(length
);
316 /* If we have no backing file yet in the destination, we cannot let
317 * the destination do COW. Instead, we copy sectors around the
318 * dirty data if needed. We need a bitmap to do that.
320 bdrv_get_backing_filename(s
->target
, backing_filename
,
321 sizeof(backing_filename
));
322 if (backing_filename
[0] && !s
->target
->backing_hd
) {
323 bdrv_get_info(s
->target
, &bdi
);
324 if (s
->granularity
< bdi
.cluster_size
) {
325 s
->buf_size
= MAX(s
->buf_size
, bdi
.cluster_size
);
326 s
->cow_bitmap
= bitmap_new(length
);
330 end
= s
->common
.len
>> BDRV_SECTOR_BITS
;
331 s
->buf
= qemu_blockalign(bs
, s
->buf_size
);
332 sectors_per_chunk
= s
->granularity
>> BDRV_SECTOR_BITS
;
335 if (s
->mode
!= MIRROR_SYNC_MODE_NONE
) {
336 /* First part, loop on the sectors and initialize the dirty bitmap. */
337 BlockDriverState
*base
;
338 base
= s
->mode
== MIRROR_SYNC_MODE_FULL
? NULL
: bs
->backing_hd
;
339 for (sector_num
= 0; sector_num
< end
; ) {
340 int64_t next
= (sector_num
| (sectors_per_chunk
- 1)) + 1;
341 ret
= bdrv_is_allocated_above(bs
, base
,
342 sector_num
, next
- sector_num
, &n
);
350 bdrv_set_dirty(bs
, sector_num
, n
);
358 bdrv_dirty_iter_init(bs
, &s
->hbi
);
359 last_pause_ns
= qemu_clock_get_ns(QEMU_CLOCK_REALTIME
);
363 bool should_complete
;
370 cnt
= bdrv_get_dirty_count(bs
);
372 /* Note that even when no rate limit is applied we need to yield
373 * periodically with no pending I/O so that qemu_aio_flush() returns.
374 * We do so every SLICE_TIME nanoseconds, or when there is an error,
375 * or when the source is clean, whichever comes first.
377 if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME
) - last_pause_ns
< SLICE_TIME
&&
378 s
->common
.iostatus
== BLOCK_DEVICE_IO_STATUS_OK
) {
379 if (s
->in_flight
== MAX_IN_FLIGHT
|| s
->buf_free_count
== 0 ||
380 (cnt
== 0 && s
->in_flight
> 0)) {
381 trace_mirror_yield(s
, s
->in_flight
, s
->buf_free_count
, cnt
);
382 qemu_coroutine_yield();
384 } else if (cnt
!= 0) {
390 should_complete
= false;
391 if (s
->in_flight
== 0 && cnt
== 0) {
392 trace_mirror_before_flush(s
);
393 ret
= bdrv_flush(s
->target
);
395 if (mirror_error_action(s
, false, -ret
) == BDRV_ACTION_REPORT
) {
399 /* We're out of the streaming phase. From now on, if the job
400 * is cancelled we will actually complete all pending I/O and
401 * report completion. This way, block-job-cancel will leave
402 * the target in a consistent state.
404 s
->common
.offset
= end
* BDRV_SECTOR_SIZE
;
406 block_job_ready(&s
->common
);
410 should_complete
= s
->should_complete
||
411 block_job_is_cancelled(&s
->common
);
412 cnt
= bdrv_get_dirty_count(bs
);
416 if (cnt
== 0 && should_complete
) {
417 /* The dirty bitmap is not updated while operations are pending.
418 * If we're about to exit, wait for pending operations before
419 * calling bdrv_get_dirty_count(bs), or we may exit while the
420 * source has dirty data to copy!
422 * Note that I/O can be submitted by the guest while
423 * mirror_populate runs.
425 trace_mirror_before_drain(s
, cnt
);
427 cnt
= bdrv_get_dirty_count(bs
);
431 trace_mirror_before_sleep(s
, cnt
, s
->synced
);
433 /* Publish progress */
434 s
->common
.offset
= (end
- cnt
) * BDRV_SECTOR_SIZE
;
436 if (s
->common
.speed
) {
437 delay_ns
= ratelimit_calculate_delay(&s
->limit
, sectors_per_chunk
);
442 block_job_sleep_ns(&s
->common
, QEMU_CLOCK_REALTIME
, delay_ns
);
443 if (block_job_is_cancelled(&s
->common
)) {
446 } else if (!should_complete
) {
447 delay_ns
= (s
->in_flight
== 0 && cnt
== 0 ? SLICE_TIME
: 0);
448 block_job_sleep_ns(&s
->common
, QEMU_CLOCK_REALTIME
, delay_ns
);
449 } else if (cnt
== 0) {
450 /* The two disks are in sync. Exit and report successful
453 assert(QLIST_EMPTY(&bs
->tracked_requests
));
454 s
->common
.cancelled
= false;
457 last_pause_ns
= qemu_clock_get_ns(QEMU_CLOCK_REALTIME
);
461 if (s
->in_flight
> 0) {
462 /* We get here only if something went wrong. Either the job failed,
463 * or it was cancelled prematurely so that we do not guarantee that
464 * the target is a copy of the source.
466 assert(ret
< 0 || (!s
->synced
&& block_job_is_cancelled(&s
->common
)));
470 assert(s
->in_flight
== 0);
472 g_free(s
->cow_bitmap
);
473 g_free(s
->in_flight_bitmap
);
474 bdrv_set_dirty_tracking(bs
, 0);
475 bdrv_iostatus_disable(s
->target
);
476 if (s
->should_complete
&& ret
== 0) {
477 if (bdrv_get_flags(s
->target
) != bdrv_get_flags(s
->common
.bs
)) {
478 bdrv_reopen(s
->target
, bdrv_get_flags(s
->common
.bs
), NULL
);
480 bdrv_swap(s
->target
, s
->common
.bs
);
482 bdrv_close(s
->target
);
483 bdrv_unref(s
->target
);
484 block_job_completed(&s
->common
, ret
);
/* BlockJob .set_speed hook of the mirror job.
 *
 * Visible behaviour: an error of class QERR_INVALID_PARAMETER naming "speed"
 * can be stored in errp; otherwise the job's rate limiter is programmed with
 * speed / BDRV_SECTOR_SIZE per SLICE_TIME.
 *
 * NOTE(review): the condition guarding the error (presumably a negative
 * speed) and the early return are not visible in this listing.
 *
 * @job:   the mirror job's BlockJob (embedded as `common`)
 * @speed: requested speed; divided by BDRV_SECTOR_SIZE before use
 * @errp:  receives the error, if any
 */
487 static void mirror_set_speed(BlockJob
*job
, int64_t speed
, Error
**errp
)
489 MirrorBlockJob
*s
= container_of(job
, MirrorBlockJob
, common
);
492 error_set(errp
, QERR_INVALID_PARAMETER
, "speed");
495 ratelimit_set_speed(&s
->limit
, speed
/ BDRV_SECTOR_SIZE
, SLICE_TIME
);
498 static void mirror_iostatus_reset(BlockJob
*job
)
500 MirrorBlockJob
*s
= container_of(job
, MirrorBlockJob
, common
);
502 bdrv_iostatus_reset(s
->target
);
/* BlockJob .complete hook of the mirror job.
 *
 * Visible behaviour: the target's backing file is opened with
 * bdrv_open_backing_file(); on the failure path the full backing file name is
 * looked up and reported through error_setg_file_open() with -ret.  A
 * QERR_BLOCK_JOB_NOT_READY error naming the job's device can also be
 * reported.  On success s->should_complete is set and the job is resumed
 * with block_job_resume().
 *
 * NOTE(review): the conditions selecting the two error paths (presumably
 * ret < 0 and the job not being synced yet), the declaration of `ret` and the
 * early returns are not visible in this listing.
 *
 * @job:  the mirror job's BlockJob (embedded as `common`)
 * @errp: receives the error, if any
 */
505 static void mirror_complete(BlockJob
*job
, Error
**errp
)
507 MirrorBlockJob
*s
= container_of(job
, MirrorBlockJob
, common
);
510 ret
= bdrv_open_backing_file(s
->target
, NULL
);
512 char backing_filename
[PATH_MAX
];
513 bdrv_get_full_backing_filename(s
->target
, backing_filename
,
514 sizeof(backing_filename
));
515 error_setg_file_open(errp
, -ret
, backing_filename
);
519 error_set(errp
, QERR_BLOCK_JOB_NOT_READY
, job
->bs
->device_name
);
523 s
->should_complete
= true;
524 block_job_resume(job
);
527 static const BlockJobType mirror_job_type
= {
528 .instance_size
= sizeof(MirrorBlockJob
),
529 .job_type
= "mirror",
530 .set_speed
= mirror_set_speed
,
531 .iostatus_reset
= mirror_iostatus_reset
,
532 .complete
= mirror_complete
,
/* Create and start a mirror job that copies @bs to @target.
 *
 * Visible behaviour:
 *  - a granularity of 0 selects a default derived from the target's cluster
 *    size, clamped between 4k and 64k, when bdrv_get_info() succeeds and
 *    reports a non-zero cluster size;
 *  - the granularity is asserted to be a power of two;
 *  - an on_source_error policy of STOP or ENOSPC is rejected with
 *    QERR_INVALID_PARAMETER ("on-source-error") when iostatus is not enabled
 *    on @bs;
 *  - the job is created with block_job_create(); the error policies, the
 *    granularity and buf_size (at least one granularity) are stored in it;
 *  - dirty tracking is enabled on @bs with the chosen granularity, the
 *    target's write cache is enabled, the target's error policy is set to
 *    on_target_error for both arguments of bdrv_set_on_error(), and iostatus
 *    is enabled on the target;
 *  - a coroutine running mirror_run() is created and entered with the job.
 *
 * NOTE(review): local declarations, the fallback granularity, the handling of
 * a failed block_job_create() and the assignments of s->target and s->mode
 * are not visible in this listing.
 *
 * NOTE(review): buf_size is only raised to at least one granularity, not
 * rounded to a multiple of it, while mirror_free_init() loops until the
 * remaining size is exactly 0 -- confirm callers pass a suitable size.
 *
 * @bs:              source device
 * @target:          destination device
 * @speed:           passed to block_job_create()
 * @granularity:     chunk size in bytes, or 0 for the default
 * @buf_size:        requested size of the copy buffer in bytes
 * @mode:            sync mode
 * @on_source_error: policy for errors on the source
 * @on_target_error: policy for errors on the target
 * @cb, @opaque:     completion callback and its argument, passed to
 *                   block_job_create()
 * @errp:            receives the error, if any
 */
535 void mirror_start(BlockDriverState
*bs
, BlockDriverState
*target
,
536 int64_t speed
, int64_t granularity
, int64_t buf_size
,
537 MirrorSyncMode mode
, BlockdevOnError on_source_error
,
538 BlockdevOnError on_target_error
,
539 BlockDriverCompletionFunc
*cb
,
540 void *opaque
, Error
**errp
)
544 if (granularity
== 0) {
545 /* Choose the default granularity based on the target file's cluster
546 * size, clamped between 4k and 64k. */
548 if (bdrv_get_info(target
, &bdi
) >= 0 && bdi
.cluster_size
!= 0) {
549 granularity
= MAX(4096, bdi
.cluster_size
);
550 granularity
= MIN(65536, granularity
);
556 assert ((granularity
& (granularity
- 1)) == 0);
558 if ((on_source_error
== BLOCKDEV_ON_ERROR_STOP
||
559 on_source_error
== BLOCKDEV_ON_ERROR_ENOSPC
) &&
560 !bdrv_iostatus_is_enabled(bs
)) {
561 error_set(errp
, QERR_INVALID_PARAMETER
, "on-source-error");
565 s
= block_job_create(&mirror_job_type
, bs
, speed
, cb
, opaque
, errp
);
570 s
->on_source_error
= on_source_error
;
571 s
->on_target_error
= on_target_error
;
574 s
->granularity
= granularity
;
575 s
->buf_size
= MAX(buf_size
, granularity
);
577 bdrv_set_dirty_tracking(bs
, granularity
);
578 bdrv_set_enable_write_cache(s
->target
, true);
579 bdrv_set_on_error(s
->target
, on_target_error
, on_target_error
);
580 bdrv_iostatus_enable(s
->target
);
581 s
->common
.co
= qemu_coroutine_create(mirror_run
);
582 trace_mirror_start(bs
, s
, s
->common
.co
, opaque
);
583 qemu_coroutine_enter(s
->common
.co
, s
);