/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "qemu-common.h"
#include "block_int.h"
#include "hw/hw.h"
#include "qemu-queue.h"
#include "qemu-timer.h"
#include "monitor.h"
#include "block-migration.h"
#include "migration.h"
#include <assert.h>

#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04

#define MAX_IS_ALLOCATED_SEARCH 65536

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct BlkMigDevState {
    BlockDriverState *bs;
    int bulk_completed;
    int shared_base;
    int64_t cur_sector;
    int64_t cur_dirty;
    int64_t completed_sectors;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
} BlkMigDevState;

typedef struct BlkMigBlock {
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;
    int ret;
    int64_t time;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    int blk_enable;
    int shared_base;
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
    int submitted;
    int read_done;
    int transferred;
    int64_t total_sector_sum;
    int prev_progress;
    int bulk_completed;
    long double total_time;
    int reads;
} BlkMigState;

static BlkMigState block_mig_state;

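/* Send one block to the migration stream: a 64-bit header carrying the
   sector number shifted by BDRV_SECTOR_BITS and OR'ed with the
   BLK_MIG_FLAG_DEVICE_BLOCK flag, then the length-prefixed device name,
   then BLOCK_SIZE bytes of data.  block_load() below parses this layout. */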
static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* device name */
    len = strlen(blk->bmds->bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}

static inline void add_avg_read_time(int64_t time)
{
    block_mig_state.reads++;
    block_mig_state.total_time += time;
}

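/* Estimate read bandwidth from the completed reads: each read moved
   BLOCK_SIZE bytes and total_time accumulates their latencies in
   nanoseconds, so the result is in bytes per nanosecond.  Used by
   is_stage2_completed() to predict the remaining transfer time. */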
static inline long double compute_read_bwidth(void)
{
    assert(block_mig_state.total_time != 0);
    return (block_mig_state.reads * BLOCK_SIZE) / block_mig_state.total_time;
}

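/* Completion callback for the bdrv_aio_readv() requests issued below:
   record the read latency, queue the filled block for sending, and move
   the in-flight accounting from submitted to read_done. */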
static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk->ret = ret;

    blk->time = qemu_get_clock_ns(rt_clock) - blk->time;

    add_avg_read_time(blk->time);

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
}

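/* Bulk phase: read the next BDRV_SECTORS_PER_DIRTY_CHUNK-sized chunk of
   one device asynchronously, skipping unallocated extents when migrating
   on top of a shared base image.  Returns 1 once the device's bulk pass
   has reached the end of the device. */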
static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
                                BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    if (bmds->shared_base) {
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = qemu_malloc(sizeof(BlkMigBlock));
    blk->buf = qemu_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk->time = qemu_get_clock_ns(rt_clock);

    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);
    if (!blk->aiocb) {
        goto error;
    }
    block_mig_state.submitted++;

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);
    bmds->cur_sector = cur_sector + nr_sectors;

    return (bmds->cur_sector >= total_sectors);

error:
    monitor_printf(mon, "Error reading sector %" PRId64 "\n", cur_sector);
    qemu_file_set_error(f);
    qemu_free(blk->buf);
    qemu_free(blk);
    return 0;
}

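/* Toggle block-dirty tracking on every device queued for migration. */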
static void set_dirty_tracking(int enable)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_set_dirty_tracking(bmds->bs, enable);
    }
}

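/* bdrv_iterate() callback: queue every writable block device for
   migration and add its size to the running total used for progress
   reporting. */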
static void init_blk_migration_it(void *opaque, BlockDriverState *bs)
{
    Monitor *mon = opaque;
    BlkMigDevState *bmds;
    int64_t sectors;

    if (!bdrv_is_read_only(bs)) {
        sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
        if (sectors <= 0) {
            return;
        }

        bmds = qemu_mallocz(sizeof(BlkMigDevState));
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = block_mig_state.shared_base;

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            monitor_printf(mon, "Start migration for %s with shared base "
                                "image\n",
                           bs->device_name);
        } else {
            monitor_printf(mon, "Start full migration for %s\n",
                           bs->device_name);
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }
}

static void init_blk_migration(Monitor *mon, QEMUFile *f)
{
    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.total_time = 0;
    block_mig_state.reads = 0;

    bdrv_iterate(init_blk_migration_it, mon);
}

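/* Advance the bulk phase by one chunk on the first device that still has
   bulk work left, then emit a BLK_MIG_FLAG_PROGRESS record whenever the
   overall percentage changes.  Returns 0 once the bulk pass is complete
   on all devices. */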
static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(mon, f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    progress = completed_sector_sum * 100 / block_mig_state.total_sector_sum;
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        monitor_printf(mon, "Completed %d %%\r", progress);
        monitor_flush(mon);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

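/* Dirty phase: scan from the device's dirty cursor for the next chunk
   that the guest has written since it was last sent, and resend it.
   During stage 2 (is_async != 0) the read is submitted asynchronously;
   during the final stage 3 it is read synchronously and sent at once.
   Returns 1 when the cursor has passed the end of the device. */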
static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
                                 BlkMigDevState *bmds, int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        if (bdrv_get_dirty(bmds->bs, sector)) {

            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            blk = qemu_malloc(sizeof(BlkMigBlock));
            blk->buf = qemu_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;

            if (is_async) {
                blk->iov.iov_base = blk->buf;
                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

                blk->time = qemu_get_clock_ns(rt_clock);

                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                            nr_sectors, blk_mig_read_cb, blk);
                if (!blk->aiocb) {
                    goto error;
                }
                block_mig_state.submitted++;
            } else {
                if (bdrv_read(bmds->bs, sector, blk->buf,
                              nr_sectors) < 0) {
                    goto error;
                }
                blk_send(f, blk);

                qemu_free(blk->buf);
                qemu_free(blk);
            }

            bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
            break;
        }
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    monitor_printf(mon, "Error reading sector %" PRId64 "\n", sector);
    qemu_file_set_error(f);
    qemu_free(blk->buf);
    qemu_free(blk);
    return 0;
}

static int blk_mig_save_dirty_block(Monitor *mon, QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (mig_save_device_dirty(mon, f, bmds, is_async) == 0) {
            ret = 1;
            break;
        }
    }

    return ret;
}

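/* Drain completed reads from blk_list into the migration stream, stopping
   early if the rate limit is reached or a read came back with an error. */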
static void flush_blks(QEMUFile* f)
{
    BlkMigBlock *blk;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            qemu_file_set_error(f);
            break;
        }
        blk_send(f, blk);

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        qemu_free(blk->buf);
        qemu_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
}

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        dirty += bdrv_get_dirty_count(bmds->bs);
    }

    return dirty * BLOCK_SIZE;
}

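/* Stage 2 is considered complete once the bulk pass has finished and the
   estimated time to transfer the remaining dirty data, at the measured
   read bandwidth, fits within the configured maximum downtime. */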
static int is_stage2_completed(void)
{
    int64_t remaining_dirty;
    long double bwidth;

    if (block_mig_state.bulk_completed == 1) {

        remaining_dirty = get_remaining_dirty();
        if (remaining_dirty == 0) {
            return 1;
        }

        bwidth = compute_read_bwidth();

        if ((remaining_dirty / bwidth) <=
            migrate_max_downtime()) {
            /* finish stage2 because we think that we can finish remaining
               work below max_downtime */

            return 1;
        }
    }

    return 0;
}

static void blk_mig_cleanup(Monitor *mon)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        qemu_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        qemu_free(blk->buf);
        qemu_free(blk);
    }

    set_dirty_tracking(0);

    monitor_printf(mon, "\n");
}

static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
{
    DPRINTF("Enter save live stage %d submitted %d transferred %d\n",
            stage, block_mig_state.submitted, block_mig_state.transferred);

    if (stage < 0) {
        blk_mig_cleanup(mon);
        return 0;
    }

    if (block_mig_state.blk_enable != 1) {
        /* no need to migrate storage */
        qemu_put_be64(f, BLK_MIG_FLAG_EOS);
        return 1;
    }

    if (stage == 1) {
        init_blk_migration(mon, f);

        /* start tracking dirty blocks */
        set_dirty_tracking(1);
    }

    flush_blks(f);

    if (qemu_file_has_error(f)) {
        blk_mig_cleanup(mon);
        return 0;
    }

    blk_mig_reset_dirty_cursor();

    if (stage == 2) {
        /* control the rate of transfer */
        while ((block_mig_state.submitted +
                block_mig_state.read_done) * BLOCK_SIZE <
               qemu_file_get_rate_limit(f)) {
            if (block_mig_state.bulk_completed == 0) {
                /* first finish the bulk phase */
                if (blk_mig_save_bulked_block(mon, f) == 0) {
                    /* finished saving bulk on all devices */
                    block_mig_state.bulk_completed = 1;
                }
            } else {
                if (blk_mig_save_dirty_block(mon, f, 1) == 0) {
                    /* no more dirty blocks */
                    break;
                }
            }
        }

        flush_blks(f);

        if (qemu_file_has_error(f)) {
            blk_mig_cleanup(mon);
            return 0;
        }
    }

    if (stage == 3) {
        /* we know for sure that save bulk is completed and
           all async reads have completed */
        assert(block_mig_state.submitted == 0);

        while (blk_mig_save_dirty_block(mon, f, 0) != 0) {
            /* drain all remaining dirty blocks synchronously */
        }
        blk_mig_cleanup(mon);

        /* report completion */
        qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

        if (qemu_file_has_error(f)) {
            return 0;
        }

        monitor_printf(mon, "Block migration completed\n");
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ((stage == 2) && is_stage2_completed());
}

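/* Load handler: parse the stream written by the save side.  Each record
   starts with a 64-bit word whose low bits are flags; device blocks carry
   a device name and BLOCK_SIZE of data to write back, progress records
   carry a percentage in the upper bits, and EOS ends the stream. */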
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs;
    uint8_t *buf;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            int ret;
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);
            if (!bs) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            buf = qemu_malloc(BLOCK_SIZE);

            qemu_get_buffer(f, buf, BLOCK_SIZE);
            ret = bdrv_write(bs, addr, buf, BDRV_SECTORS_PER_DIRTY_CHUNK);

            qemu_free(buf);
            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown flags\n");
            return -EINVAL;
        }
        if (qemu_file_has_error(f)) {
            return -EIO;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static void block_set_params(int blk_enable, int shared_base, void *opaque)
{
    block_mig_state.blk_enable = blk_enable;
    block_mig_state.shared_base = shared_base;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable |= shared_base;
}

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);

    register_savevm_live(NULL, "block", 0, 1, block_set_params,
                         block_save_live, NULL, block_load, &block_mig_state);
}