/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include "qemu-common.h"
#include "block_int.h"
#include "hw/hw.h"            /* QEMUFile, register_savevm_live() */
#include "block-migration.h"
#include <assert.h>
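
/* BlkMigDevState (per-device migration state: bs, cur_sector, total_sectors,
 * shared_base, bulk_completed, next) is used throughout this file without a
 * local definition; it is presumably declared in block-migration.h. */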

#define SECTOR_BITS 9          /* 512-byte sectors, as elsewhere in QEMU */
#define SECTOR_SIZE (1 << SECTOR_BITS)
#define SECTOR_MASK ~(SECTOR_SIZE - 1)

#define BLOCK_SIZE (block_mig_state->sectors_per_block << SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK 0x01
#define BLK_MIG_FLAG_EOS          0x02
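
/* On-the-wire format, as produced by send_blk()/mig_save_device_bulk() and
 * consumed by block_load():
 *
 *   be64: (sector_num << SECTOR_BITS) | flags
 *   if BLK_MIG_FLAG_DEVICE_BLOCK is set:
 *       byte:  length of the device name
 *       bytes: device name (not NUL-terminated)
 *       bytes: BLOCK_SIZE of block data
 *
 * A record carrying BLK_MIG_FLAG_EOS terminates the section. */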

#define MAX_IS_ALLOCATED_SEARCH 65536
#define MAX_BLOCKS_READ 10000
#define BLOCKS_READ_CHANGE 100
#define INITIAL_BLOCKS_READ 100

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define dprintf(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define dprintf(fmt, ...) \
    do { } while (0)
#endif

typedef struct BlkMigBlock {
    uint8_t *buf;                /* one BLOCK_SIZE chunk of guest disk data */
    BlkMigDevState *bmds;        /* device this block belongs to */
    int64_t sector;              /* first sector covered by buf */
    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;     /* in-flight read request, if any */
    struct BlkMigBlock *next;    /* singly linked read-done queue */
} BlkMigBlock;

typedef struct BlkMigState {
    int blk_enable;              /* migrate storage at all? */
    int shared_base;             /* skip sectors backed by a shared base image */
    int bulk_completed;          /* bulk phase done for every device */
    BlkMigDevState *bmds_first;  /* list of devices being migrated */
    int sectors_per_block;
    BlkMigBlock *first_blk;      /* queue of completed reads, FIFO order */
    BlkMigBlock *last_blk;
    int submitted;               /* AIO reads in flight */
    int read_done;               /* reads completed, waiting to be sent */
    int transferred;             /* blocks written to the migration stream */
    int64_t print_completion;    /* next progress-report threshold, in sectors */
} BlkMigState;
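
/* Each bulk block moves through a small pipeline: mig_read_device_bulk()
 * submits an AIO read (submitted++); blk_mig_read_cb() appends the finished
 * block to the first_blk/last_blk queue (submitted--, read_done++); and
 * flush_blks() drains the queue into the migration stream, subject to the
 * rate limit (read_done--, transferred++). */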

static BlkMigState *block_mig_state = NULL;

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    /* note: ret is currently ignored; read errors are not propagated */

    /* insert at the end of the read-done queue */
    if (block_mig_state->last_blk == NULL) {
        block_mig_state->first_blk = blk;
        block_mig_state->last_blk = blk;
    } else {
        block_mig_state->last_blk->next = blk;
        block_mig_state->last_blk = blk;
    }

    block_mig_state->submitted--;
    block_mig_state->read_done++;
    assert(block_mig_state->submitted >= 0);
}

static int mig_read_device_bulk(QEMUFile *f, BlkMigDevState *bms)
{
    int nr_sectors;
    int64_t total_sectors, cur_sector = 0;
    BlockDriverState *bs = bms->bs;
    BlkMigBlock *blk;

    blk = qemu_malloc(sizeof(BlkMigBlock));
    blk->buf = qemu_malloc(BLOCK_SIZE);

    cur_sector = bms->cur_sector;
    total_sectors = bdrv_getlength(bs) >> SECTOR_BITS;

    if (bms->shared_base) {
        /* skip over sectors that are backed by the shared base image */
        while (cur_sector < bms->total_sectors &&
               !bdrv_is_allocated(bms->bs, cur_sector,
                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        bms->cur_sector = total_sectors;
        qemu_free(blk->buf);
        qemu_free(blk);
        return 1;
    }

    if (cur_sector >= block_mig_state->print_completion) {
        printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
        fflush(stdout);
        block_mig_state->print_completion +=
            (block_mig_state->sectors_per_block * 10000);
    }

    /* we are going to transfer a full BLOCK_SIZE chunk anyway,
       even if not all of it is allocated */
    nr_sectors = block_mig_state->sectors_per_block;

    /* align to the start of the block */
    cur_sector &= ~((int64_t)block_mig_state->sectors_per_block - 1);

    if (total_sectors - cur_sector < block_mig_state->sectors_per_block) {
        nr_sectors = total_sectors - cur_sector;
    }

    bms->cur_sector = cur_sector + nr_sectors;
    blk->sector = cur_sector;
    blk->bmds = bms;
    blk->next = NULL;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);
    if (!blk->aiocb) {
        printf("Error reading sector %" PRId64 "\n", cur_sector);
        qemu_free(blk->buf);
        qemu_free(blk);
        return 0;
    }

    bdrv_reset_dirty(bms->bs, cur_sector, nr_sectors);
    block_mig_state->submitted++;

    return (bms->cur_sector >= total_sectors);
}
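
/* Synchronous counterpart of mig_read_device_bulk(): reads one block with
 * bdrv_read() and writes it to the stream immediately, bypassing the AIO
 * queue. blk_mig_save_bulked_block() selects it when is_async == 0, i.e.
 * during the final (stage 3) drain. */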
static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int len, nr_sectors;
    int64_t total_sectors = bmds->total_sectors, cur_sector = 0;
    uint8_t *tmp_buf = NULL;
    BlockDriverState *bs = bmds->bs;

    tmp_buf = qemu_malloc(BLOCK_SIZE);

    cur_sector = bmds->cur_sector;

    if (bmds->shared_base) {
        /* skip over sectors that are backed by the shared base image */
        while (cur_sector < bmds->total_sectors &&
               !bdrv_is_allocated(bmds->bs, cur_sector,
                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = total_sectors;
        qemu_free(tmp_buf);
        return 1;
    }

    if (cur_sector >= block_mig_state->print_completion) {
        printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
        fflush(stdout);
        block_mig_state->print_completion +=
            (block_mig_state->sectors_per_block * 10000);
    }

    /* align to the start of the block */
    cur_sector &= ~((int64_t)block_mig_state->sectors_per_block - 1);

    /* we are going to transfer a full BLOCK_SIZE chunk anyway,
       even if not all of it is allocated */
    nr_sectors = block_mig_state->sectors_per_block;

    if (total_sectors - cur_sector < block_mig_state->sectors_per_block) {
        nr_sectors = total_sectors - cur_sector;
    }

    if (bdrv_read(bs, cur_sector, tmp_buf, nr_sectors) < 0) {
        printf("Error reading sector %" PRId64 "\n", cur_sector);
    }

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);

    /* sector number and flags */
    qemu_put_be64(f, (cur_sector << SECTOR_BITS) | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* device name */
    len = strlen(bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)bs->device_name, len);

    /* block data */
    qemu_put_buffer(f, tmp_buf, BLOCK_SIZE);

    bmds->cur_sector = cur_sector + block_mig_state->sectors_per_block;

    qemu_free(tmp_buf);

    return (bmds->cur_sector >= total_sectors);
}

static void send_blk(QEMUFile *f, BlkMigBlock *blk)
{
    int len;

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << SECTOR_BITS) | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* device name */
    len = strlen(blk->bmds->bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);

    /* block data */
    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

static void blk_mig_save_dev_info(QEMUFile *f, BlkMigDevState *bmds)
{
    /* placeholder: no per-device header is sent yet */
}

static void set_dirty_tracking(int enable)
{
    BlkMigDevState *bmds;

    for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
        bdrv_set_dirty_tracking(bmds->bs, enable);
    }
}

static void init_blk_migration(QEMUFile *f)
{
    BlkMigDevState **pbmds, *bmds;
    BlockDriverState *bs;

    for (bs = bdrv_first; bs != NULL; bs = bs->next) {
        if (bs->type == BDRV_TYPE_HD) {
            bmds = qemu_mallocz(sizeof(BlkMigDevState));
            bmds->bs = bs;
            bmds->bulk_completed = 0;
            bmds->total_sectors = bdrv_getlength(bs) >> SECTOR_BITS;
            bmds->shared_base = block_mig_state->shared_base;

            if (bmds->shared_base) {
                printf("Start migration for %s with shared base image\n",
                       bs->device_name);
            } else {
                printf("Start full migration for %s\n", bs->device_name);
            }

            /* insert at the end of the device list */
            pbmds = &block_mig_state->bmds_first;
            while (*pbmds != NULL) {
                pbmds = &(*pbmds)->next;
            }
            *pbmds = bmds;

            blk_mig_save_dev_info(f, bmds);
        }
    }

    block_mig_state->sectors_per_block = bdrv_get_sectors_per_chunk();
}

static int blk_mig_save_bulked_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;

    for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
        if (bmds->bulk_completed == 0) {
            if (is_async) {
                if (mig_read_device_bulk(f, bmds) == 1) {
                    /* completed bulk section for this device */
                    bmds->bulk_completed = 1;
                }
            } else {
                if (mig_save_device_bulk(f, bmds) == 1) {
                    /* completed bulk section for this device */
                    bmds->bulk_completed = 1;
                }
            }
            return 1;
        }
    }

    /* reaching here means the bulk phase is completed for all devices */
    block_mig_state->bulk_completed = 1;

    return 0;
}

#define MAX_NUM_BLOCKS 4

static void blk_mig_save_dirty_blocks(QEMUFile *f)
{
    BlkMigDevState *bmds;
    uint8_t buf[BLOCK_SIZE];   /* variable-length array: BLOCK_SIZE is not a compile-time constant */
    int64_t sector;
    int len;

    for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
        for (sector = 0; sector < bmds->cur_sector;) {
            if (bdrv_get_dirty(bmds->bs, sector)) {
                if (bdrv_read(bmds->bs, sector, buf,
                              block_mig_state->sectors_per_block) < 0) {
                    printf("Error reading sector %" PRId64 "\n", sector);
                }

                /* sector number and flags */
                qemu_put_be64(f, (sector << SECTOR_BITS)
                                 | BLK_MIG_FLAG_DEVICE_BLOCK);

                /* device name */
                len = strlen(bmds->bs->device_name);
                qemu_put_byte(f, len);
                qemu_put_buffer(f, (uint8_t *)bmds->bs->device_name, len);

                /* block data */
                qemu_put_buffer(f, buf,
                                (block_mig_state->sectors_per_block *
                                 SECTOR_SIZE));

                bdrv_reset_dirty(bmds->bs, sector,
                                 block_mig_state->sectors_per_block);
            }
            /* advance to the next block, dirty or clean */
            sector += block_mig_state->sectors_per_block;
        }
    }
}

static void flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk, *tmp;

    dprintf("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state->submitted,
            block_mig_state->read_done, block_mig_state->transferred);

    for (blk = block_mig_state->first_blk;
         blk != NULL && !qemu_file_rate_limit(f); blk = tmp) {
        send_blk(f, blk);

        tmp = blk->next;
        qemu_free(blk->buf);
        qemu_free(blk);

        block_mig_state->read_done--;
        block_mig_state->transferred++;
        assert(block_mig_state->read_done >= 0);
    }
    block_mig_state->first_blk = blk;

    if (block_mig_state->first_blk == NULL) {
        block_mig_state->last_blk = NULL;
    }

    dprintf("%s Exit submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state->submitted,
            block_mig_state->read_done, block_mig_state->transferred);
}
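
/* Live-savevm stages, as used by block_save_live() below: stage 1 is setup
 * (start dirty-block tracking), stage 2 is the iterative phase while the
 * guest keeps running, and stage 3 is the final pass after the guest has
 * been stopped. Stage 2 is complete once no reads are in flight and every
 * device has finished its bulk transfer. */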

static int is_stage2_completed(void)
{
    BlkMigDevState *bmds;

    if (block_mig_state->submitted > 0) {
        return 0;
    }

    for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
        if (bmds->bulk_completed == 0) {
            return 0;
        }
    }

    return 1;
}

static int block_save_live(QEMUFile *f, int stage, void *opaque)
{
    int ret;

    dprintf("Enter save live stage %d submitted %d transferred %d\n",
            stage, block_mig_state->submitted, block_mig_state->transferred);

    if (block_mig_state->blk_enable != 1) {
        /* no need to migrate storage */
        qemu_put_be64(f, BLK_MIG_FLAG_EOS);
        return 1;
    }

    if (stage == 1) {
        init_blk_migration(f);

        /* start tracking dirty blocks */
        set_dirty_tracking(1);
    }

    flush_blks(f);

    /* control the rate of transfer: don't keep more data in flight or
       queued than the file's rate limit allows */
    while ((block_mig_state->submitted + block_mig_state->read_done) *
           (BLOCK_SIZE) < qemu_file_get_rate_limit(f)) {
        ret = blk_mig_save_bulked_block(f, 1);
        if (ret == 0) { /* no more bulk blocks for now */
            break;
        }
    }

    flush_blks(f);

    if (stage == 3) {
        /* finish the bulk phase synchronously */
        while (blk_mig_save_bulked_block(f, 0) != 0) {
            /* each call sends one block */
        }

        /* then send all blocks dirtied since tracking started */
        blk_mig_save_dirty_blocks(f);

        /* stop tracking dirty blocks */
        set_dirty_tracking(0);

        printf("\nBlock migration completed\n");
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ((stage == 2) && is_stage2_completed());
}

static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    int len, flags;
    uint8_t *buf;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs;

    block_mig_state->sectors_per_block = bdrv_get_sectors_per_chunk();
    buf = qemu_malloc(BLOCK_SIZE);

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~SECTOR_MASK;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);

            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);

            qemu_get_buffer(f, buf, BLOCK_SIZE);

            if (bs != NULL) {
                bdrv_write(bs, (addr >> SECTOR_BITS),
                           buf, block_mig_state->sectors_per_block);
            } else {
                printf("Error unknown block device %s\n", device_name);
            }
        } else if (flags & BLK_MIG_FLAG_EOS) {
            /* end of section; fall out of the loop */
        } else {
            printf("Unknown flags\n");
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    qemu_free(buf);

    return 0;
}

static void block_set_params(int blk_enable, int shared_base, void *opaque)
{
    assert(opaque == block_mig_state);

    block_mig_state->blk_enable = blk_enable;
    block_mig_state->shared_base = shared_base;

    /* a shared base implies that block migration is enabled */
    block_mig_state->blk_enable |= shared_base;
}
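
/* In the patch series this file belongs to, blk_enable and shared_base
 * correspond to the 'migrate' monitor command's -b (migrate the full disks)
 * and -i (incremental: skip sectors backed by a shared base image) options. */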

void blk_mig_info(void)
{
    BlockDriverState *bs;

    for (bs = bdrv_first; bs != NULL; bs = bs->next) {
        printf("Device %s\n", bs->device_name);
        if (bs->type == BDRV_TYPE_HD) {
            printf("device %s format %s\n",
                   bs->device_name, bs->drv->format_name);
        }
    }
}
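
/* Overall wiring, assuming the rest of this patch series: blk_mig_init()
 * runs once at startup and registers the live "block" savevm section;
 * block_set_params() is invoked when an outgoing migration starts;
 * block_save_live() is then driven through stages 1/2/3 on the source,
 * while block_load() consumes the section on the destination. */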
void blk_mig_init(void)
{
    block_mig_state = qemu_mallocz(sizeof(BlkMigState));

    register_savevm_live("block", 0, 1, block_set_params, block_save_live,
                         NULL, block_load, block_mig_state);
}