/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "qemu-common.h"
#include "block_int.h"
#include "hw/hw.h"
#include "qemu-queue.h"
#include "qemu-timer.h"
#include "monitor.h"
#include "block-migration.h"
#include "migration.h"
#include <assert.h>

#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04

#define MAX_IS_ALLOCATED_SEARCH 65536

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct BlkMigDevState {
    BlockDriverState *bs;
    int bulk_completed;
    int shared_base;
    int64_t cur_sector;
    int64_t cur_dirty;
    int64_t completed_sectors;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
} BlkMigDevState;

typedef struct BlkMigBlock {
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;
    int ret;
    int64_t time;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    int blk_enable;
    int shared_base;
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
    int submitted;
    int read_done;
    int transferred;
    int64_t total_sector_sum;
    int prev_progress;
    int bulk_completed;
    long double total_time;
    int reads;
} BlkMigState;

static BlkMigState block_mig_state;

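/* Wire format produced by blk_send() and consumed by block_load():
 *   be64  (sector << BDRV_SECTOR_BITS) | flags  (the flags fit in the
 *         low bits that the sector shift leaves free)
 *   byte  device name length            (BLK_MIG_FLAG_DEVICE_BLOCK only)
 *   ...   device name, not NUL-terminated
 *   ...   BLOCK_SIZE bytes of block data
 */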
static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* device name */
    len = strlen(blk->bmds->bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}

static inline void add_avg_read_time(int64_t time)
{
    block_mig_state.reads++;
    block_mig_state.total_time += time;
}

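/* Average read bandwidth in bytes per nanosecond (blk->time deltas come
 * from qemu_get_clock_ns()); is_stage2_completed() divides the dirty
 * backlog by this to estimate the remaining transfer time. */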
static inline long double compute_read_bwidth(void)
{
    assert(block_mig_state.total_time != 0);
    return (block_mig_state.reads * BLOCK_SIZE) / block_mig_state.total_time;
}

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk->ret = ret;

    blk->time = qemu_get_clock_ns(rt_clock) - blk->time;

    add_avg_read_time(blk->time);

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
}

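/* Queue one asynchronous read of up to BDRV_SECTORS_PER_DIRTY_CHUNK
 * sectors at this device's bulk cursor.  With a shared base image,
 * extents that are unallocated in the overlay are skipped first.
 * Returns 1 once the cursor has reached the end of the device. */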
static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
                                BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    if (bmds->shared_base) {
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = qemu_malloc(sizeof(BlkMigBlock));
    blk->buf = qemu_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk->time = qemu_get_clock_ns(rt_clock);

    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);
    if (!blk->aiocb) {
        goto error;
    }
    block_mig_state.submitted++;

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);
    bmds->cur_sector = cur_sector + nr_sectors;

    return (bmds->cur_sector >= total_sectors);

error:
    monitor_printf(mon, "Error reading sector %" PRId64 "\n", cur_sector);
    qemu_file_set_error(f);
    qemu_free(blk->buf);
    qemu_free(blk);
    return 0;
}

static void set_dirty_tracking(int enable)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_set_dirty_tracking(bmds->bs, enable);
    }
}

static void init_blk_migration(Monitor *mon, QEMUFile *f)
{
    BlkMigDevState *bmds;
    BlockDriverState *bs;
    int64_t sectors;

    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.total_time = 0;
    block_mig_state.reads = 0;

    for (bs = bdrv_first; bs != NULL; bs = bs->next) {
        if (bs->type == BDRV_TYPE_HD) {
            sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
            if (sectors <= 0) {
                continue;
            }

            bmds = qemu_mallocz(sizeof(BlkMigDevState));
            bmds->bs = bs;
            bmds->bulk_completed = 0;
            bmds->total_sectors = sectors;
            bmds->completed_sectors = 0;
            bmds->shared_base = block_mig_state.shared_base;

            block_mig_state.total_sector_sum += sectors;

            if (bmds->shared_base) {
                monitor_printf(mon, "Start migration for %s with shared base "
                                    "image\n",
                               bs->device_name);
            } else {
                monitor_printf(mon, "Start full migration for %s\n",
                               bs->device_name);
            }

            QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
        }
    }
}

static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(mon, f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    progress = completed_sector_sum * 100 / block_mig_state.total_sector_sum;
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        monitor_printf(mon, "Completed %d %%\r", progress);
        monitor_flush(mon);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

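/* Send the first dirty chunk found at or after bmds->cur_dirty: via
 * bdrv_aio_readv() during the iterative stage (is_async), or with a
 * blocking bdrv_read() during the final stage.  Returns 1 when the
 * dirty cursor has swept past the end of the device. */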
static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
                                 BlkMigDevState *bmds, int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        if (bdrv_get_dirty(bmds->bs, sector)) {

            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            blk = qemu_malloc(sizeof(BlkMigBlock));
            blk->buf = qemu_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;

            if (is_async) {
                blk->iov.iov_base = blk->buf;
                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

                blk->time = qemu_get_clock_ns(rt_clock);

                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                            nr_sectors, blk_mig_read_cb, blk);
                if (!blk->aiocb) {
                    goto error;
                }
                block_mig_state.submitted++;
            } else {
                if (bdrv_read(bmds->bs, sector, blk->buf,
                              nr_sectors) < 0) {
                    goto error;
                }
                blk_send(f, blk);

                qemu_free(blk->buf);
                qemu_free(blk);
            }

            bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
            break;
        }
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    monitor_printf(mon, "Error reading sector %" PRId64 "\n", sector);
    qemu_file_set_error(f);
    qemu_free(blk->buf);
    qemu_free(blk);
    return 0;
}

static int blk_mig_save_dirty_block(Monitor *mon, QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (mig_save_device_dirty(mon, f, bmds, is_async) == 0) {
            ret = 1;
            break;
        }
    }

    return ret;
}

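/* Push completed reads from blk_list onto the migration stream until
 * the rate limiter pushes back or the list runs dry. */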
static void flush_blks(QEMUFile* f)
{
    BlkMigBlock *blk;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            qemu_file_set_error(f);
            break;
        }
        blk_send(f, blk);

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        qemu_free(blk->buf);
        qemu_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
}

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        dirty += bdrv_get_dirty_count(bmds->bs);
    }

    return dirty * BLOCK_SIZE;
}

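/* Stage 2 converges when the bulk phase is done and the estimated time
 * to read the remaining dirty data (remaining bytes / measured read
 * bandwidth) fits within migrate_max_downtime(), i.e. stage 3 should be
 * able to flush the rest while the guest is stopped. */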
static int is_stage2_completed(void)
{
    int64_t remaining_dirty;
    long double bwidth;

    if (block_mig_state.bulk_completed == 1) {

        remaining_dirty = get_remaining_dirty();
        if (remaining_dirty == 0) {
            return 1;
        }

        bwidth = compute_read_bwidth();

        if ((remaining_dirty / bwidth) <=
            migrate_max_downtime()) {
            /* finish stage2 because we think that we can finish remaining
               work below max_downtime */
            return 1;
        }
    }

    return 0;
}

static void blk_mig_cleanup(Monitor *mon)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        qemu_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        qemu_free(blk->buf);
        qemu_free(blk);
    }

    set_dirty_tracking(0);

    monitor_printf(mon, "\n");
}

static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
{
    DPRINTF("Enter save live stage %d submitted %d transferred %d\n",
            stage, block_mig_state.submitted, block_mig_state.transferred);

    if (stage < 0) {
        blk_mig_cleanup(mon);
        return 0;
    }

    if (block_mig_state.blk_enable != 1) {
        /* no need to migrate storage */
        qemu_put_be64(f, BLK_MIG_FLAG_EOS);
        return 1;
    }

    if (stage == 1) {
        init_blk_migration(mon, f);

        /* start track dirty blocks */
        set_dirty_tracking(1);
    }

    flush_blks(f);

    if (qemu_file_has_error(f)) {
        blk_mig_cleanup(mon);
        return 0;
    }

    blk_mig_reset_dirty_cursor();

    if (stage == 2) {
        /* control the rate of transfer */
        while ((block_mig_state.submitted +
                block_mig_state.read_done) * BLOCK_SIZE <
               qemu_file_get_rate_limit(f)) {
            if (block_mig_state.bulk_completed == 0) {
                /* first finish the bulk phase */
                if (blk_mig_save_bulked_block(mon, f) == 0) {
                    /* finished saving bulk on all devices */
                    block_mig_state.bulk_completed = 1;
                }
            } else {
                if (blk_mig_save_dirty_block(mon, f, 1) == 0) {
                    /* no more dirty blocks */
                    break;
                }
            }
        }

        flush_blks(f);

        if (qemu_file_has_error(f)) {
            blk_mig_cleanup(mon);
            return 0;
        }
    }

    if (stage == 3) {
        /* we know for sure that save bulk is completed and
           all async read completed */
        assert(block_mig_state.submitted == 0);

        while (blk_mig_save_dirty_block(mon, f, 0) != 0);
        blk_mig_cleanup(mon);

        /* report completion */
        qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

        if (qemu_file_has_error(f)) {
            return 0;
        }

        monitor_printf(mon, "Block migration completed\n");
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ((stage == 2) && is_stage2_completed());
}

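/* Destination side: consume the stream produced by block_save_live(),
 * writing each received chunk to the device named in the stream, until
 * the EOS flag arrives. */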
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs;
    uint8_t *buf;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);
            if (!bs) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            buf = qemu_malloc(BLOCK_SIZE);

            qemu_get_buffer(f, buf, BLOCK_SIZE);
            bdrv_write(bs, addr, buf, BDRV_SECTORS_PER_DIRTY_CHUNK);

            qemu_free(buf);
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown flags\n");
            return -EINVAL;
        }
        if (qemu_file_has_error(f)) {
            return -EIO;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static void block_set_params(int blk_enable, int shared_base, void *opaque)
{
    block_mig_state.blk_enable = blk_enable;
    block_mig_state.shared_base = shared_base;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable |= shared_base;
}

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);

    register_savevm_live("block", 0, 1, block_set_params, block_save_live,
                         NULL, block_load, &block_mig_state);
}