/*
 * Extraction residue from gitweb, preserved for provenance:
 * commit subject: "Provide marshalling mechanism for json"
 * repo: qemu/aliguori-queue.git — file: block-migration.c
 * blob: 4b4eddf62ade1be548ce9b69323cad466516f501
 */
/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */
#include "qemu-common.h"
#include "block_int.h"
#include "hw/hw.h"
#include "block-migration.h"
#include <assert.h>
#include <pthread.h>
/* Sector geometry of the migration stream. */
#define SECTOR_BITS 9
#define SECTOR_SIZE (1 << SECTOR_BITS)
/* Mask selecting the sector-aligned part of a stream address.
 * BUG FIX: the previous definition carried a stray trailing ';' which only
 * compiled at its use sites by accident ('~SECTOR_MASK' expanded to a double
 * negation followed by an empty statement). Removed, and the expansion is
 * fully parenthesized. */
#define SECTOR_MASK (~(SECTOR_SIZE - 1))

/* Bytes carried by one migrated block (runtime-sized chunk). */
#define BLOCK_SIZE (block_mig_state->sectors_per_block << SECTOR_BITS)

/* Flags stored in the low (sub-sector) bits of the 64-bit address word. */
#define BLK_MIG_FLAG_DEVICE_BLOCK 0x01
#define BLK_MIG_FLAG_EOS 0x02

#define MAX_IS_ALLOCATED_SEARCH 65536
#define MAX_BLOCKS_READ 10000
#define BLOCKS_READ_CHANGE 100
#define INITIAL_BLOCKS_READ 100

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define dprintf(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define dprintf(fmt, ...) \
    do { } while (0)
#endif
45 typedef struct BlkMigBlock {
46 uint8_t *buf;
47 BlkMigDevState *bmds;
48 int64_t sector;
49 struct iovec iov;
50 QEMUIOVector qiov;
51 BlockDriverAIOCB *aiocb;
52 int ret;
53 struct BlkMigBlock *next;
54 } BlkMigBlock;
56 typedef struct BlkMigState {
57 int bulk_completed;
58 int blk_enable;
59 int shared_base;
60 int no_dirty;
61 QEMUFile *load_file;
62 BlkMigDevState *bmds_first;
63 int sectors_per_block;
64 BlkMigBlock *first_blk;
65 BlkMigBlock *last_blk;
66 int submitted;
67 int read_done;
68 int transferred;
69 int64_t print_completion;
70 } BlkMigState;
72 static BlkMigState *block_mig_state = NULL;
74 static void blk_mig_read_cb(void *opaque, int ret)
76 BlkMigBlock *blk = opaque;
78 blk->ret = ret;
80 /* insert at the end */
81 if(block_mig_state->last_blk == NULL) {
82 block_mig_state->first_blk = blk;
83 block_mig_state->last_blk = blk;
84 } else {
85 block_mig_state->last_blk->next = blk;
86 block_mig_state->last_blk = blk;
89 block_mig_state->submitted--;
90 block_mig_state->read_done++;
91 assert(block_mig_state->submitted >= 0);
93 return;
96 static int mig_read_device_bulk(QEMUFile *f, BlkMigDevState *bms)
98 int nr_sectors;
99 int64_t total_sectors, cur_sector = 0;
100 BlockDriverState *bs = bms->bs;
101 BlkMigBlock *blk;
103 blk = qemu_malloc(sizeof(BlkMigBlock));
104 blk->buf = qemu_malloc(BLOCK_SIZE);
106 cur_sector = bms->cur_sector;
107 total_sectors = bdrv_getlength(bs) >> SECTOR_BITS;
109 if(bms->shared_base) {
110 while(cur_sector < bms->total_sectors &&
111 !bdrv_is_allocated(bms->bs, cur_sector,
112 MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
113 cur_sector += nr_sectors;
117 if(cur_sector >= total_sectors) {
118 bms->cur_sector = total_sectors;
119 qemu_free(blk->buf);
120 qemu_free(blk);
121 return 1;
124 if(cur_sector >= block_mig_state->print_completion) {
125 printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
126 fflush(stdout);
127 block_mig_state->print_completion +=
128 (block_mig_state->sectors_per_block * 10000);
131 /* we going to transfder BLOCK_SIZE any way even if it is not allocated */
132 nr_sectors = block_mig_state->sectors_per_block;
134 cur_sector &= ~((int64_t)block_mig_state->sectors_per_block -1);
136 if(total_sectors - cur_sector < block_mig_state->sectors_per_block) {
137 nr_sectors = (total_sectors - cur_sector);
140 bms->cur_sector = cur_sector + nr_sectors;
141 blk->sector = cur_sector;
142 blk->bmds = bms;
143 blk->next = NULL;
145 blk->iov.iov_base = blk->buf;
146 blk->iov.iov_len = nr_sectors * SECTOR_SIZE;
147 qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
149 blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
150 nr_sectors, blk_mig_read_cb, blk);
152 if(!blk->aiocb) {
153 printf("Error reading sector %" PRId64 "\n", cur_sector);
154 qemu_free(blk->buf);
155 qemu_free(blk);
156 return 0;
159 bdrv_reset_dirty(bms->bs, cur_sector, nr_sectors);
160 block_mig_state->submitted++;
162 return (bms->cur_sector >= total_sectors);
165 static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
167 int len, nr_sectors;
168 int64_t total_sectors = bmds->total_sectors, cur_sector = 0;
169 uint8_t *tmp_buf = NULL;
170 BlockDriverState *bs = bmds->bs;
172 tmp_buf = qemu_malloc(BLOCK_SIZE);
174 cur_sector = bmds->cur_sector;
176 if(bmds->shared_base) {
177 while(cur_sector < bmds->total_sectors &&
178 !bdrv_is_allocated(bmds->bs, cur_sector,
179 MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
180 cur_sector += nr_sectors;
184 if(cur_sector >= total_sectors) {
185 bmds->cur_sector = total_sectors;
186 qemu_free(tmp_buf);
187 return 1;
190 if(cur_sector >= block_mig_state->print_completion) {
191 printf("Completed %" PRId64 " %%\r", cur_sector * 100 / total_sectors);
192 fflush(stdout);
193 block_mig_state->print_completion +=
194 (block_mig_state->sectors_per_block * 10000);
197 cur_sector &= ~((int64_t)block_mig_state->sectors_per_block -1);
199 /* we going to transfer
200 BLOCK_SIZE
201 any way even if it is not allocated */
202 nr_sectors = block_mig_state->sectors_per_block;
204 if(total_sectors - cur_sector < block_mig_state->sectors_per_block) {
205 nr_sectors = (total_sectors - cur_sector);
208 if(bdrv_read(bs, cur_sector, tmp_buf, nr_sectors) < 0) {
209 printf("Error reading sector %" PRId64 "\n", cur_sector);
212 bdrv_reset_dirty(bs, cur_sector, nr_sectors);
214 /* Device name */
215 qemu_put_be64(f,(cur_sector << SECTOR_BITS) | BLK_MIG_FLAG_DEVICE_BLOCK);
217 len = strlen(bs->device_name);
218 qemu_put_byte(f, len);
219 qemu_put_buffer(f, (uint8_t *)bs->device_name, len);
221 qemu_put_buffer(f, tmp_buf,
222 BLOCK_SIZE);
224 bmds->cur_sector = cur_sector + block_mig_state->sectors_per_block;
226 qemu_free(tmp_buf);
228 return (bmds->cur_sector >= total_sectors);
231 static void send_blk(QEMUFile *f, BlkMigBlock * blk)
233 int len;
235 /* Device name */
236 qemu_put_be64(f,(blk->sector << SECTOR_BITS) | BLK_MIG_FLAG_DEVICE_BLOCK);
238 len = strlen(blk->bmds->bs->device_name);
239 qemu_put_byte(f, len);
240 qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);
242 qemu_put_buffer(f, blk->buf,
243 BLOCK_SIZE);
245 return;
248 static void blk_mig_save_dev_info(QEMUFile *f, BlkMigDevState *bmds)
252 static void set_dirty_tracking(int enable)
254 BlkMigDevState *bmds;
255 for(bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
256 bdrv_set_dirty_tracking(bmds->bs,enable);
259 return;
262 static void init_blk_migration(QEMUFile *f)
264 BlkMigDevState **pbmds, *bmds;
265 BlockDriverState *bs;
267 for (bs = bdrv_first; bs != NULL; bs = bs->next) {
268 if(bs->type == BDRV_TYPE_HD) {
269 bmds = qemu_mallocz(sizeof(BlkMigDevState));
270 bmds->bs = bs;
271 bmds->bulk_completed = 0;
272 bmds->total_sectors = bdrv_getlength(bs) >> SECTOR_BITS;
273 bmds->shared_base = block_mig_state->shared_base;
275 if(bmds->shared_base) {
276 printf("Start migration for %s with shared base image\n",
277 bs->device_name);
278 } else {
279 printf("Start full migration for %s\n", bs->device_name);
282 /* insert at the end */
283 pbmds = &block_mig_state->bmds_first;
284 while (*pbmds != NULL)
285 pbmds = &(*pbmds)->next;
286 *pbmds = bmds;
288 blk_mig_save_dev_info(f, bmds);
293 block_mig_state->sectors_per_block = bdrv_get_sectors_per_chunk();
295 return;
298 static int blk_mig_save_bulked_block(QEMUFile *f, int is_async)
300 BlkMigDevState *bmds;
302 for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
303 if(bmds->bulk_completed == 0) {
304 if(is_async) {
305 if(mig_read_device_bulk(f, bmds) == 1) {
306 /* completed bulk section for this device */
307 bmds->bulk_completed = 1;
309 } else {
310 if(mig_save_device_bulk(f,bmds) == 1) {
311 /* completed bulk section for this device */
312 bmds->bulk_completed = 1;
315 return 1;
319 /* we reached here means bulk is completed */
320 block_mig_state->bulk_completed = 1;
322 return 0;
326 #define MAX_NUM_BLOCKS 4
328 static void blk_mig_save_dirty_blocks(QEMUFile *f)
330 BlkMigDevState *bmds;
331 uint8_t buf[BLOCK_SIZE];
332 int64_t sector;
333 int len;
335 for(bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
336 for(sector = 0; sector < bmds->cur_sector;) {
338 if(bdrv_get_dirty(bmds->bs,sector)) {
340 if(bdrv_read(bmds->bs, sector, buf,
341 block_mig_state->sectors_per_block) < 0) {
344 /* device name */
345 qemu_put_be64(f,(sector << SECTOR_BITS)
346 | BLK_MIG_FLAG_DEVICE_BLOCK);
348 len = strlen(bmds->bs->device_name);
350 qemu_put_byte(f, len);
351 qemu_put_buffer(f, (uint8_t *)bmds->bs->device_name, len);
353 qemu_put_buffer(f, buf,
354 (block_mig_state->sectors_per_block *
355 SECTOR_SIZE));
357 bdrv_reset_dirty(bmds->bs, sector,
358 block_mig_state->sectors_per_block);
360 sector += block_mig_state->sectors_per_block;
361 } else {
362 /* sector is clean */
363 sector += block_mig_state->sectors_per_block;
368 return;
371 static void flush_blks(QEMUFile* f)
373 BlkMigBlock *blk, *tmp;
375 dprintf("%s Enter submitted %d read_done %d transfered\n", __FUNCTION__,
376 submitted, read_done, transfered);
378 for(blk = block_mig_state->first_blk;
379 blk != NULL && !qemu_file_rate_limit(f); blk = tmp) {
380 send_blk(f, blk);
382 tmp = blk->next;
383 qemu_free(blk->buf);
384 qemu_free(blk);
386 block_mig_state->read_done--;
387 block_mig_state->transferred++;
388 assert(block_mig_state->read_done >= 0);
390 block_mig_state->first_blk = blk;
392 if(block_mig_state->first_blk == NULL) {
393 block_mig_state->last_blk = NULL;
396 dprintf("%s Exit submitted %d read_done %d transferred%d\n", __FUNCTION__,
397 block_mig_state->submitted, block_mig_state->read_done,
398 block_mig_state->transferred);
400 return;
403 static int is_stage2_completed(void)
405 BlkMigDevState *bmds;
407 if(block_mig_state->submitted > 0) {
408 return 0;
411 for (bmds = block_mig_state->bmds_first; bmds != NULL; bmds = bmds->next) {
412 if(bmds->bulk_completed == 0) {
413 return 0;
417 return 1;
420 static int block_save_live(QEMUFile *f, int stage, void *opaque)
422 int ret = 1;
424 dprintf("Enter save live stage %d submitted %d transferred %d\n", stage,
425 submitted, transferred);
427 if(block_mig_state->blk_enable != 1) {
428 /* no need to migrate storage */
430 qemu_put_be64(f,BLK_MIG_FLAG_EOS);
431 return 1;
434 if(stage == 1) {
435 init_blk_migration(f);
437 /* start track dirty blocks */
438 set_dirty_tracking(1);
442 flush_blks(f);
444 /* control the rate of transfer */
445 while ((block_mig_state->submitted + block_mig_state->read_done) *
446 (BLOCK_SIZE) <
447 (qemu_file_get_rate_limit(f))) {
449 ret = blk_mig_save_bulked_block(f, 1);
451 if (ret == 0) /* no more bulk blocks for now*/
452 break;
455 flush_blks(f);
457 if(stage == 3) {
459 while(blk_mig_save_bulked_block(f, 0) != 0);
461 blk_mig_save_dirty_blocks(f);
463 /* stop track dirty blocks */
464 set_dirty_tracking(0);;
466 printf("\nBlock migration completed\n");
469 qemu_put_be64(f,BLK_MIG_FLAG_EOS);
471 return ((stage == 2) && is_stage2_completed());
474 static int block_load(QEMUFile *f, void *opaque, int version_id)
476 int len, flags;
477 char device_name[256];
478 int64_t addr;
479 BlockDriverState *bs;
480 uint8_t *buf;
482 block_mig_state->sectors_per_block = bdrv_get_sectors_per_chunk();
483 buf = qemu_malloc(BLOCK_SIZE);
485 do {
487 addr = qemu_get_be64(f);
489 flags = addr & ~SECTOR_MASK;
490 addr &= SECTOR_MASK;
492 if(flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
494 /* get device name */
495 len = qemu_get_byte(f);
497 qemu_get_buffer(f, (uint8_t *)device_name, len);
498 device_name[len] = '\0';
500 bs = bdrv_find(device_name);
502 qemu_get_buffer(f, buf,
503 BLOCK_SIZE);
504 if(bs != NULL) {
506 bdrv_write(bs, (addr >> SECTOR_BITS),
507 buf, block_mig_state->sectors_per_block);
508 } else {
509 printf("Error unknown block device %s\n", device_name);
511 } else if(flags & BLK_MIG_FLAG_EOS) {
513 } else {
514 printf("Unknown flags\n");
516 } while(!(flags & BLK_MIG_FLAG_EOS));
518 qemu_free(buf);
520 return 0;
523 static void block_set_params(int blk_enable, int shared_base, void *opaque)
525 assert(opaque == block_mig_state);
527 block_mig_state->blk_enable = blk_enable;
528 block_mig_state->shared_base = shared_base;
530 /* shared base means that blk_enable = 1 */
531 block_mig_state->blk_enable |= shared_base;
533 return;
536 void blk_mig_info(void)
538 BlockDriverState *bs;
540 for (bs = bdrv_first; bs != NULL; bs = bs->next) {
541 printf("Device %s\n", bs->device_name);
542 if(bs->type == BDRV_TYPE_HD) {
543 printf("device %s format %s\n",
544 bs->device_name, bs->drv->format_name);
549 void blk_mig_init(void)
552 block_mig_state = qemu_mallocz(sizeof(BlkMigState));
554 register_savevm_live("block", 0, 1, block_set_params, block_save_live,
555 NULL, block_load, block_mig_state);