migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28 #include "qemu/osdep.h"
  29 #include "cpu.h"
  30 #include <zlib.h>
  31 #include "qapi-event.h"
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "migration/register.h"
  40 #include "migration/misc.h"
  41 #include "qemu-file.h"
  42 #include "postcopy-ram.h"
  43 #include "migration/page_cache.h"
  44 #include "qemu/error-report.h"
  45 #include "trace.h"
  46 #include "exec/ram_addr.h"
  47 #include "qemu/rcu_queue.h"
  48 #include "migration/colo.h"
  49
  50 /***********************************************************/
  51 /* ram save/restore */
  52
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 *
 * In this file the flags are OR-ed into the page offset that
 * save_page_header() puts on the wire.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  68
  69 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  70 {
  71     return buffer_is_zero(p, size);
  72 }
  73
/* XBZRLE statistics: updated by save_xbzrle_page() and read back by
 * migration_bitmap_sync() to derive the cache miss rate. */
XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken through XBZRLE_cache_lock()/XBZRLE_cache_unlock(), which only
     * touch it when migrate_use_xbzrle() is true. */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
  91
  92 static void XBZRLE_cache_lock(void)
  93 {
  94     if (migrate_use_xbzrle())
  95         qemu_mutex_lock(&XBZRLE.lock);
  96 }
  97
  98 static void XBZRLE_cache_unlock(void)
  99 {
 100     if (migrate_use_xbzrle())
 101         qemu_mutex_unlock(&XBZRLE.lock);
 102 }
 103
 104 /**
 105  * xbzrle_cache_resize: resize the xbzrle cache
 106  *
 107  * This function is called from qmp_migrate_set_cache_size in main
 108  * thread, possibly while a migration is in progress.  A running
 109  * migration may be using the cache and might finish during this call,
 110  * hence changes to the cache are protected by XBZRLE.lock().
 111  *
 112  * Returns the new_size or negative in case of error.
 113  *
 114  * @new_size: new cache size
 115  */
 116 int64_t xbzrle_cache_resize(int64_t new_size)
 117 {
 118     PageCache *new_cache;
 119     int64_t ret;
 120
 121     if (new_size < TARGET_PAGE_SIZE) {
 122         return -1;
 123     }
 124
 125     XBZRLE_cache_lock();
 126
 127     if (XBZRLE.cache != NULL) {
 128         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 129             goto out_new_size;
 130         }
 131         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 132                                         TARGET_PAGE_SIZE);
 133         if (!new_cache) {
 134             error_report("Error creating cache");
 135             ret = -1;
 136             goto out;
 137         }
 138
 139         cache_fini(XBZRLE.cache);
 140         XBZRLE.cache = new_cache;
 141     }
 142
 143 out_new_size:
 144     ret = pow2floor(new_size);
 145 out:
 146     XBZRLE_cache_unlock();
 147     return ret;
 148 }
 149
/*
 * An outstanding page request, on the source, having been received
 * and queued
 *
 * Entries are linked through next_req; RAMState keeps the queue head in
 * src_page_requests.
 */
struct RAMSrcPageRequest {
    /* NOTE(review): presumably the block, start offset and length of the
     * requested range -- the code filling these in is not visible here */
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 161
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data; save_page_header() compares
     * against it to decide whether the block name has to be sent again */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirtied too many pages; reset once
     * migration_bitmap_sync() triggers throttling */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* The RAM migration state used by this file; read by ram_bytes_remaining().
 * NOTE(review): where it is allocated and freed is not visible here. */
static RAMState *ram_state;
 204
 205 uint64_t ram_bytes_remaining(void)
 206 {
 207     return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
 208 }
 209
/* Global RAM migration counters (bytes transferred, duplicate pages,
 * dirty sync count, ...), updated by the save functions in this file */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from; ram_save_page() shifts it left by
     * TARGET_PAGE_BITS to get the byte offset inside the block */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 222
/* Per-thread state of one compression thread (see do_data_compress()) */
struct CompressParam {
    /* Set to true by the thread, under comp_done_lock, once it has
     * finished compressing the page it was handed */
    bool done;
    /* Set to true, under mutex, to make the thread leave its loop */
    bool quit;
    /* File the thread compresses into; created with empty ops, so it only
     * serves as a buffer */
    QEMUFile *file;
    /* mutex/cond: the thread waits on cond with mutex held until it is
     * given a block or told to quit */
    QemuMutex mutex;
    QemuCond cond;
    /* Page to compress; block == NULL means there is no pending work */
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;
 233
/*
 * Per-thread state of one decompression thread.
 *
 * NOTE(review): the code using this structure is outside the visible part
 * of the file; the field notes below are assumptions to be confirmed.
 */
struct DecompressParam {
    /* presumably the same done/quit handshake as CompressParam */
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination of the decompressed data */
    void *des;
    /* presumably the compressed input and its length */
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;
 244
/* One entry per compression thread; both arrays are allocated by
 * compress_threads_save_setup() and freed (and reset to NULL) by
 * compress_threads_save_cleanup() */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts; their users are outside the visible
 * part of the file */
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration: called by do_data_compress() below */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
 263
 264 static void *do_data_compress(void *opaque)
 265 {
 266     CompressParam *param = opaque;
 267     RAMBlock *block;
 268     ram_addr_t offset;
 269
 270     qemu_mutex_lock(&param->mutex);
 271     while (!param->quit) {
 272         if (param->block) {
 273             block = param->block;
 274             offset = param->offset;
 275             param->block = NULL;
 276             qemu_mutex_unlock(&param->mutex);
 277
 278             do_compress_ram_page(param->file, block, offset);
 279
 280             qemu_mutex_lock(&comp_done_lock);
 281             param->done = true;
 282             qemu_cond_signal(&comp_done_cond);
 283             qemu_mutex_unlock(&comp_done_lock);
 284
 285             qemu_mutex_lock(&param->mutex);
 286         } else {
 287             qemu_cond_wait(&param->cond, &param->mutex);
 288         }
 289     }
 290     qemu_mutex_unlock(&param->mutex);
 291
 292     return NULL;
 293 }
 294
 295 static inline void terminate_compression_threads(void)
 296 {
 297     int idx, thread_count;
 298
 299     thread_count = migrate_compress_threads();
 300
 301     for (idx = 0; idx < thread_count; idx++) {
 302         qemu_mutex_lock(&comp_param[idx].mutex);
 303         comp_param[idx].quit = true;
 304         qemu_cond_signal(&comp_param[idx].cond);
 305         qemu_mutex_unlock(&comp_param[idx].mutex);
 306     }
 307 }
 308
 309 static void compress_threads_save_cleanup(void)
 310 {
 311     int i, thread_count;
 312
 313     if (!migrate_use_compression()) {
 314         return;
 315     }
 316     terminate_compression_threads();
 317     thread_count = migrate_compress_threads();
 318     for (i = 0; i < thread_count; i++) {
 319         qemu_thread_join(compress_threads + i);
 320         qemu_fclose(comp_param[i].file);
 321         qemu_mutex_destroy(&comp_param[i].mutex);
 322         qemu_cond_destroy(&comp_param[i].cond);
 323     }
 324     qemu_mutex_destroy(&comp_done_lock);
 325     qemu_cond_destroy(&comp_done_cond);
 326     g_free(compress_threads);
 327     g_free(comp_param);
 328     compress_threads = NULL;
 329     comp_param = NULL;
 330 }
 331
 332 static void compress_threads_save_setup(void)
 333 {
 334     int i, thread_count;
 335
 336     if (!migrate_use_compression()) {
 337         return;
 338     }
 339     thread_count = migrate_compress_threads();
 340     compress_threads = g_new0(QemuThread, thread_count);
 341     comp_param = g_new0(CompressParam, thread_count);
 342     qemu_cond_init(&comp_done_cond);
 343     qemu_mutex_init(&comp_done_lock);
 344     for (i = 0; i < thread_count; i++) {
 345         /* comp_param[i].file is just used as a dummy buffer to save data,
 346          * set its ops to empty.
 347          */
 348         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 349         comp_param[i].done = true;
 350         comp_param[i].quit = false;
 351         qemu_mutex_init(&comp_param[i].mutex);
 352         qemu_cond_init(&comp_param[i].cond);
 353         qemu_thread_create(compress_threads + i, "compress",
 354                            do_data_compress, comp_param + i,
 355                            QEMU_THREAD_JOINABLE);
 356     }
 357 }
 358
/* Multiple fd's */

/* Per-channel state of one multifd send thread */
struct MultiFDSendParams {
    /* Channel index, assigned in multifd_save_setup() */
    uint8_t id;
    /* Thread name; allocated in multifd_save_setup(), freed in
     * multifd_save_cleanup() */
    char *name;
    QemuThread thread;
    /* The thread sleeps on this semaphore; posted to wake it up */
    QemuSemaphore sem;
    /* Held while reading or writing quit */
    QemuMutex mutex;
    /* Set to true to make the thread leave its loop */
    bool quit;
};
typedef struct MultiFDSendParams MultiFDSendParams;

/* Send-side multifd state: allocated by multifd_save_setup(), freed and
 * reset to NULL by multifd_save_cleanup() */
struct {
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
} *multifd_send_state;
 376
 377 static void terminate_multifd_send_threads(Error *errp)
 378 {
 379     int i;
 380
 381     for (i = 0; i < multifd_send_state->count; i++) {
 382         MultiFDSendParams *p = &multifd_send_state->params[i];
 383
 384         qemu_mutex_lock(&p->mutex);
 385         p->quit = true;
 386         qemu_sem_post(&p->sem);
 387         qemu_mutex_unlock(&p->mutex);
 388     }
 389 }
 390
 391 int multifd_save_cleanup(Error **errp)
 392 {
 393     int i;
 394     int ret = 0;
 395
 396     if (!migrate_use_multifd()) {
 397         return 0;
 398     }
 399     terminate_multifd_send_threads(NULL);
 400     for (i = 0; i < multifd_send_state->count; i++) {
 401         MultiFDSendParams *p = &multifd_send_state->params[i];
 402
 403         qemu_thread_join(&p->thread);
 404         qemu_mutex_destroy(&p->mutex);
 405         qemu_sem_destroy(&p->sem);
 406         g_free(p->name);
 407         p->name = NULL;
 408     }
 409     g_free(multifd_send_state->params);
 410     multifd_send_state->params = NULL;
 411     g_free(multifd_send_state);
 412     multifd_send_state = NULL;
 413     return ret;
 414 }
 415
 416 static void *multifd_send_thread(void *opaque)
 417 {
 418     MultiFDSendParams *p = opaque;
 419
 420     while (true) {
 421         qemu_mutex_lock(&p->mutex);
 422         if (p->quit) {
 423             qemu_mutex_unlock(&p->mutex);
 424             break;
 425         }
 426         qemu_mutex_unlock(&p->mutex);
 427         qemu_sem_wait(&p->sem);
 428     }
 429
 430     return NULL;
 431 }
 432
 433 int multifd_save_setup(void)
 434 {
 435     int thread_count;
 436     uint8_t i;
 437
 438     if (!migrate_use_multifd()) {
 439         return 0;
 440     }
 441     thread_count = migrate_multifd_channels();
 442     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
 443     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
 444     multifd_send_state->count = 0;
 445     for (i = 0; i < thread_count; i++) {
 446         MultiFDSendParams *p = &multifd_send_state->params[i];
 447
 448         qemu_mutex_init(&p->mutex);
 449         qemu_sem_init(&p->sem, 0);
 450         p->quit = false;
 451         p->id = i;
 452         p->name = g_strdup_printf("multifdsend_%d", i);
 453         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
 454                            QEMU_THREAD_JOINABLE);
 455
 456         multifd_send_state->count++;
 457     }
 458     return 0;
 459 }
 460
/* Per-channel state of one multifd receive thread */
struct MultiFDRecvParams {
    /* Channel index, assigned in multifd_load_setup() */
    uint8_t id;
    /* Thread name; allocated in multifd_load_setup(), freed in
     * multifd_load_cleanup() */
    char *name;
    QemuThread thread;
    /* The thread sleeps on this semaphore; posted to wake it up */
    QemuSemaphore sem;
    /* Held while reading or writing quit */
    QemuMutex mutex;
    /* Set to true to make the thread leave its loop */
    bool quit;
};
typedef struct MultiFDRecvParams MultiFDRecvParams;

/* Receive-side multifd state: allocated by multifd_load_setup(), freed and
 * reset to NULL by multifd_load_cleanup() */
struct {
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
} *multifd_recv_state;
 476
 477 static void terminate_multifd_recv_threads(Error *errp)
 478 {
 479     int i;
 480
 481     for (i = 0; i < multifd_recv_state->count; i++) {
 482         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 483
 484         qemu_mutex_lock(&p->mutex);
 485         p->quit = true;
 486         qemu_sem_post(&p->sem);
 487         qemu_mutex_unlock(&p->mutex);
 488     }
 489 }
 490
 491 int multifd_load_cleanup(Error **errp)
 492 {
 493     int i;
 494     int ret = 0;
 495
 496     if (!migrate_use_multifd()) {
 497         return 0;
 498     }
 499     terminate_multifd_recv_threads(NULL);
 500     for (i = 0; i < multifd_recv_state->count; i++) {
 501         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 502
 503         qemu_thread_join(&p->thread);
 504         qemu_mutex_destroy(&p->mutex);
 505         qemu_sem_destroy(&p->sem);
 506         g_free(p->name);
 507         p->name = NULL;
 508     }
 509     g_free(multifd_recv_state->params);
 510     multifd_recv_state->params = NULL;
 511     g_free(multifd_recv_state);
 512     multifd_recv_state = NULL;
 513
 514     return ret;
 515 }
 516
 517 static void *multifd_recv_thread(void *opaque)
 518 {
 519     MultiFDRecvParams *p = opaque;
 520
 521     while (true) {
 522         qemu_mutex_lock(&p->mutex);
 523         if (p->quit) {
 524             qemu_mutex_unlock(&p->mutex);
 525             break;
 526         }
 527         qemu_mutex_unlock(&p->mutex);
 528         qemu_sem_wait(&p->sem);
 529     }
 530
 531     return NULL;
 532 }
 533
 534 int multifd_load_setup(void)
 535 {
 536     int thread_count;
 537     uint8_t i;
 538
 539     if (!migrate_use_multifd()) {
 540         return 0;
 541     }
 542     thread_count = migrate_multifd_channels();
 543     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
 544     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
 545     multifd_recv_state->count = 0;
 546     for (i = 0; i < thread_count; i++) {
 547         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 548
 549         qemu_mutex_init(&p->mutex);
 550         qemu_sem_init(&p->sem, 0);
 551         p->quit = false;
 552         p->id = i;
 553         p->name = g_strdup_printf("multifdrecv_%d", i);
 554         qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
 555                            QEMU_THREAD_JOINABLE);
 556         multifd_recv_state->count++;
 557     }
 558     return 0;
 559 }
 560
 561 /**
 562  * save_page_header: write page header to wire
 563  *
 564  * If this is the 1st block, it also writes the block identification
 565  *
 566  * Returns the number of bytes written
 567  *
 568  * @f: QEMUFile where to send the data
 569  * @block: block that contains the page we want to send
 570  * @offset: offset inside the block for the page
 571  *          in the lower bits, it contains flags
 572  */
 573 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 574                                ram_addr_t offset)
 575 {
 576     size_t size, len;
 577
 578     if (block == rs->last_sent_block) {
 579         offset |= RAM_SAVE_FLAG_CONTINUE;
 580     }
 581     qemu_put_be64(f, offset);
 582     size = 8;
 583
 584     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 585         len = strlen(block->idstr);
 586         qemu_put_byte(f, len);
 587         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 588         size += 1 + len;
 589         rs->last_sent_block = block;
 590     }
 591     return size;
 592 }
 593
 594 /**
 595  * mig_throttle_guest_down: throotle down the guest
 596  *
 597  * Reduce amount of guest cpu execution to hopefully slow down memory
 598  * writes. If guest dirty memory rate is reduced below the rate at
 599  * which we can transfer pages to the destination then we should be
 600  * able to complete migration. Some workloads dirty memory way too
 601  * fast and will not effectively converge, even with auto-converge.
 602  */
 603 static void mig_throttle_guest_down(void)
 604 {
 605     MigrationState *s = migrate_get_current();
 606     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 607     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 608
 609     /* We have not started throttling yet. Let's start it. */
 610     if (!cpu_throttle_active()) {
 611         cpu_throttle_set(pct_initial);
 612     } else {
 613         /* Throttling already on, just increase the rate */
 614         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 615     }
 616 }
 617
 618 /**
 619  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 620  *
 621  * @rs: current RAM state
 622  * @current_addr: address for the zero page
 623  *
 624  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 625  * The important thing is that a stale (not-yet-0'd) page be replaced
 626  * by the new data.
 627  * As a bonus, if the page wasn't in the cache it gets added so that
 628  * when a small write is made into the 0'd page it gets XBZRLE sent.
 629  */
 630 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 631 {
 632     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 633         return;
 634     }
 635
 636     /* We don't care if this fails to allocate a new cache page
 637      * as long as it updated an old one */
 638     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 639                  ram_counters.dirty_sync_count);
 640 }
 641
 642 #define ENCODING_FLAG_XBZRLE 0x1
 643
 644 /**
 645  * save_xbzrle_page: compress and send current page
 646  *
 647  * Returns: 1 means that we wrote the page
 648  *          0 means that page is identical to the one already sent
 649  *          -1 means that xbzrle would be longer than normal
 650  *
 651  * @rs: current RAM state
 652  * @current_data: pointer to the address of the page contents
 653  * @current_addr: addr of the page
 654  * @block: block that contains the page we want to send
 655  * @offset: offset inside the block for the page
 656  * @last_stage: if we are at the completion stage
 657  */
 658 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 659                             ram_addr_t current_addr, RAMBlock *block,
 660                             ram_addr_t offset, bool last_stage)
 661 {
 662     int encoded_len = 0, bytes_xbzrle;
 663     uint8_t *prev_cached_page;
 664
 665     if (!cache_is_cached(XBZRLE.cache, current_addr,
 666                          ram_counters.dirty_sync_count)) {
 667         xbzrle_counters.cache_miss++;
 668         if (!last_stage) {
 669             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 670                              ram_counters.dirty_sync_count) == -1) {
 671                 return -1;
 672             } else {
 673                 /* update *current_data when the page has been
 674                    inserted into cache */
 675                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 676             }
 677         }
 678         return -1;
 679     }
 680
 681     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 682
 683     /* save current buffer into memory */
 684     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 685
 686     /* XBZRLE encoding (if there is no overflow) */
 687     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 688                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 689                                        TARGET_PAGE_SIZE);
 690     if (encoded_len == 0) {
 691         trace_save_xbzrle_page_skipping();
 692         return 0;
 693     } else if (encoded_len == -1) {
 694         trace_save_xbzrle_page_overflow();
 695         xbzrle_counters.overflow++;
 696         /* update data in the cache */
 697         if (!last_stage) {
 698             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 699             *current_data = prev_cached_page;
 700         }
 701         return -1;
 702     }
 703
 704     /* we need to update the data in the cache, in order to get the same data */
 705     if (!last_stage) {
 706         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 707     }
 708
 709     /* Send XBZRLE based compressed page */
 710     bytes_xbzrle = save_page_header(rs, rs->f, block,
 711                                     offset | RAM_SAVE_FLAG_XBZRLE);
 712     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 713     qemu_put_be16(rs->f, encoded_len);
 714     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 715     bytes_xbzrle += encoded_len + 1 + 2;
 716     xbzrle_counters.pages++;
 717     xbzrle_counters.bytes += bytes_xbzrle;
 718     ram_counters.transferred += bytes_xbzrle;
 719
 720     return 1;
 721 }
 722
 723 /**
 724  * migration_bitmap_find_dirty: find the next dirty page from start
 725  *
 726  * Called with rcu_read_lock() to protect migration_bitmap
 727  *
 728  * Returns the byte offset within memory region of the start of a dirty page
 729  *
 730  * @rs: current RAM state
 731  * @rb: RAMBlock where to search for dirty pages
 732  * @start: page where we start the search
 733  */
 734 static inline
 735 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 736                                           unsigned long start)
 737 {
 738     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 739     unsigned long *bitmap = rb->bmap;
 740     unsigned long next;
 741
 742     if (rs->ram_bulk_stage && start > 0) {
 743         next = start + 1;
 744     } else {
 745         next = find_next_bit(bitmap, size, start);
 746     }
 747
 748     return next;
 749 }
 750
 751 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 752                                                 RAMBlock *rb,
 753                                                 unsigned long page)
 754 {
 755     bool ret;
 756
 757     ret = test_and_clear_bit(page, rb->bmap);
 758
 759     if (ret) {
 760         rs->migration_dirty_pages--;
 761     }
 762     return ret;
 763 }
 764
 765 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 766                                         ram_addr_t start, ram_addr_t length)
 767 {
 768     rs->migration_dirty_pages +=
 769         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 770                                               &rs->num_dirty_pages_period);
 771 }
 772
 773 /**
 774  * ram_pagesize_summary: calculate all the pagesizes of a VM
 775  *
 776  * Returns a summary bitmap of the page sizes of all RAMBlocks
 777  *
 778  * For VMs with just normal pages this is equivalent to the host page
 779  * size. If it's got some huge pages then it's the OR of all the
 780  * different page sizes.
 781  */
 782 uint64_t ram_pagesize_summary(void)
 783 {
 784     RAMBlock *block;
 785     uint64_t summary = 0;
 786
 787     RAMBLOCK_FOREACH(block) {
 788         summary |= block->page_size;
 789     }
 790
 791     return summary;
 792 }
 793
 794 static void migration_bitmap_sync(RAMState *rs)
 795 {
 796     RAMBlock *block;
 797     int64_t end_time;
 798     uint64_t bytes_xfer_now;
 799
 800     ram_counters.dirty_sync_count++;
 801
 802     if (!rs->time_last_bitmap_sync) {
 803         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 804     }
 805
 806     trace_migration_bitmap_sync_start();
 807     memory_global_dirty_log_sync();
 808
 809     qemu_mutex_lock(&rs->bitmap_mutex);
 810     rcu_read_lock();
 811     RAMBLOCK_FOREACH(block) {
 812         migration_bitmap_sync_range(rs, block, 0, block->used_length);
 813     }
 814     rcu_read_unlock();
 815     qemu_mutex_unlock(&rs->bitmap_mutex);
 816
 817     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 818
 819     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 820
 821     /* more than 1 second = 1000 millisecons */
 822     if (end_time > rs->time_last_bitmap_sync + 1000) {
 823         /* calculate period counters */
 824         ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 825             / (end_time - rs->time_last_bitmap_sync);
 826         bytes_xfer_now = ram_counters.transferred;
 827
 828         if (migrate_auto_converge()) {
 829             /* The following detection logic can be refined later. For now:
 830                Check to see if the dirtied bytes is 50% more than the approx.
 831                amount of bytes that just got transferred since the last time we
 832                were in this routine. If that happens twice, start or increase
 833                throttling */
 834
 835             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
 836                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
 837                 (++rs->dirty_rate_high_cnt >= 2)) {
 838                     trace_migration_throttle();
 839                     rs->dirty_rate_high_cnt = 0;
 840                     mig_throttle_guest_down();
 841             }
 842         }
 843
 844         if (migrate_use_xbzrle()) {
 845             if (rs->iterations_prev != rs->iterations) {
 846                 xbzrle_counters.cache_miss_rate =
 847                    (double)(xbzrle_counters.cache_miss -
 848                             rs->xbzrle_cache_miss_prev) /
 849                    (rs->iterations - rs->iterations_prev);
 850             }
 851             rs->iterations_prev = rs->iterations;
 852             rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 853         }
 854
 855         /* reset period counters */
 856         rs->time_last_bitmap_sync = end_time;
 857         rs->num_dirty_pages_period = 0;
 858         rs->bytes_xfer_prev = bytes_xfer_now;
 859     }
 860     if (migrate_use_events()) {
 861         qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
 862     }
 863 }
 864
 865 /**
 866  * save_zero_page: send the zero page to the stream
 867  *
 868  * Returns the number of pages written.
 869  *
 870  * @rs: current RAM state
 871  * @block: block that contains the page we want to send
 872  * @offset: offset inside the block for the page
 873  * @p: pointer to the page
 874  */
 875 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
 876                           uint8_t *p)
 877 {
 878     int pages = -1;
 879
 880     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 881         ram_counters.duplicate++;
 882         ram_counters.transferred +=
 883             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
 884         qemu_put_byte(rs->f, 0);
 885         ram_counters.transferred += 1;
 886         pages = 1;
 887     }
 888
 889     return pages;
 890 }
 891
 892 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 893 {
 894     if (!migrate_release_ram() || !migration_in_postcopy()) {
 895         return;
 896     }
 897
 898     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 899 }
 900
 901 /**
 902  * ram_save_page: send the given page to the stream
 903  *
 904  * Returns the number of pages written.
 905  *          < 0 - error
 906  *          >=0 - Number of pages written - this might legally be 0
 907  *                if xbzrle noticed the page was the same.
 908  *
 909  * @rs: current RAM state
 910  * @block: block that contains the page we want to send
 911  * @offset: offset inside the block for the page
 912  * @last_stage: if we are at the completion stage
 913  */
 914 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
 915 {
 916     int pages = -1;
 917     uint64_t bytes_xmit;
 918     ram_addr_t current_addr;
 919     uint8_t *p;
 920     int ret;
 921     bool send_async = true;
 922     RAMBlock *block = pss->block;
 923     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 924
 925     p = block->host + offset;
 926     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 927
 928     /* In doubt sent page as normal */
 929     bytes_xmit = 0;
 930     ret = ram_control_save_page(rs->f, block->offset,
 931                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
 932     if (bytes_xmit) {
 933         ram_counters.transferred += bytes_xmit;
 934         pages = 1;
 935     }
 936
 937     XBZRLE_cache_lock();
 938
 939     current_addr = block->offset + offset;
 940
 941     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 942         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 943             if (bytes_xmit > 0) {
 944                 ram_counters.normal++;
 945             } else if (bytes_xmit == 0) {
 946                 ram_counters.duplicate++;
 947             }
 948         }
 949     } else {
 950         pages = save_zero_page(rs, block, offset, p);
 951         if (pages > 0) {
 952             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 953              * page would be stale
 954              */
 955             xbzrle_cache_zero_page(rs, current_addr);
 956             ram_release_pages(block->idstr, offset, pages);
 957         } else if (!rs->ram_bulk_stage &&
 958                    !migration_in_postcopy() && migrate_use_xbzrle()) {
 959             pages = save_xbzrle_page(rs, &p, current_addr, block,
 960                                      offset, last_stage);
 961             if (!last_stage) {
 962                 /* Can't send this cached data async, since the cache page
 963                  * might get updated before it gets to the wire
 964                  */
 965                 send_async = false;
 966             }
 967         }
 968     }
 969
 970     /* XBZRLE overflow or normal page */
 971     if (pages == -1) {
 972         ram_counters.transferred +=
 973             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
 974         if (send_async) {
 975             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
 976                                   migrate_release_ram() &
 977                                   migration_in_postcopy());
 978         } else {
 979             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
 980         }
 981         ram_counters.transferred += TARGET_PAGE_SIZE;
 982         pages = 1;
 983         ram_counters.normal++;
 984     }
 985
 986     XBZRLE_cache_unlock();
 987
 988     return pages;
 989 }
 990
 991 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 992                                 ram_addr_t offset)
 993 {
 994     RAMState *rs = ram_state;
 995     int bytes_sent, blen;
 996     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 997
 998     bytes_sent = save_page_header(rs, f, block, offset |
 999                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
1000     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1001                                      migrate_compress_level());
1002     if (blen < 0) {
1003         bytes_sent = 0;
1004         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1005         error_report("compressed data failed!");
1006     } else {
1007         bytes_sent += blen;
1008         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1009     }
1010
1011     return bytes_sent;
1012 }
1013
1014 static void flush_compressed_data(RAMState *rs)
1015 {
1016     int idx, len, thread_count;
1017
1018     if (!migrate_use_compression()) {
1019         return;
1020     }
1021     thread_count = migrate_compress_threads();
1022
1023     qemu_mutex_lock(&comp_done_lock);
1024     for (idx = 0; idx < thread_count; idx++) {
1025         while (!comp_param[idx].done) {
1026             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1027         }
1028     }
1029     qemu_mutex_unlock(&comp_done_lock);
1030
1031     for (idx = 0; idx < thread_count; idx++) {
1032         qemu_mutex_lock(&comp_param[idx].mutex);
1033         if (!comp_param[idx].quit) {
1034             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1035             ram_counters.transferred += len;
1036         }
1037         qemu_mutex_unlock(&comp_param[idx].mutex);
1038     }
1039 }
1040
1041 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1042                                        ram_addr_t offset)
1043 {
1044     param->block = block;
1045     param->offset = offset;
1046 }
1047
1048 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1049                                            ram_addr_t offset)
1050 {
1051     int idx, thread_count, bytes_xmit = -1, pages = -1;
1052
1053     thread_count = migrate_compress_threads();
1054     qemu_mutex_lock(&comp_done_lock);
1055     while (true) {
1056         for (idx = 0; idx < thread_count; idx++) {
1057             if (comp_param[idx].done) {
1058                 comp_param[idx].done = false;
1059                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1060                 qemu_mutex_lock(&comp_param[idx].mutex);
1061                 set_compress_params(&comp_param[idx], block, offset);
1062                 qemu_cond_signal(&comp_param[idx].cond);
1063                 qemu_mutex_unlock(&comp_param[idx].mutex);
1064                 pages = 1;
1065                 ram_counters.normal++;
1066                 ram_counters.transferred += bytes_xmit;
1067                 break;
1068             }
1069         }
1070         if (pages > 0) {
1071             break;
1072         } else {
1073             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1074         }
1075     }
1076     qemu_mutex_unlock(&comp_done_lock);
1077
1078     return pages;
1079 }
1080
/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns the number of pages written.  The value stays at -1 when the
 * first page of a new block cannot be compressed (an error is then set on
 * rs->f), and also when ram_control_save_page() handles the page but
 * reports no transmitted bytes.
 *
 * @rs: current RAM state
 * @pss: search status; pss->block and pss->page identify the page to send
 * @last_stage: if we are at the completion stage (not used by this function)
 */
static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
                                    bool last_stage)
{
    int pages = -1;
    uint64_t bytes_xmit = 0;
    uint8_t *p;
    int ret, blen;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;

    p = block->host + offset;

    /* Give the control hook the first chance to transmit the page */
    ret = ram_control_save_page(rs->f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        pages = 1;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        /* The hook took the page; only the accounting is left to do */
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                ram_counters.normal++;
            } else if (bytes_xmit == 0) {
                ram_counters.duplicate++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out, keeping this order is important, because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != rs->last_sent_block) {
            /* Drain the worker threads before switching blocks */
            flush_compressed_data(rs);
            pages = save_zero_page(rs, block, offset, p);
            if (pages == -1) {
                /* Make sure the first page is sent out before other pages */
                bytes_xmit = save_page_header(rs, rs->f, block, offset |
                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
                blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
                                                 migrate_compress_level());
                if (blen > 0) {
                    ram_counters.transferred += bytes_xmit + blen;
                    ram_counters.normal++;
                    pages = 1;
                } else {
                    /* pages is left at -1 so the caller sees the failure */
                    qemu_file_set_error(rs->f, blen);
                    error_report("compressed data failed!");
                }
            }
            if (pages > 0) {
                ram_release_pages(block->idstr, offset, pages);
            }
        } else {
            /* Same block as before: non-zero pages go to a worker thread */
            pages = save_zero_page(rs, block, offset, p);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(rs, block, offset);
            } else {
                ram_release_pages(block->idstr, offset, pages);
            }
        }
    }

    return pages;
}
1157
1158 /**
1159  * find_dirty_block: find the next dirty page and update any state
1160  * associated with the search process.
1161  *
1162  * Returns if a page is found
1163  *
1164  * @rs: current RAM state
1165  * @pss: data about the state of the current dirty page scan
1166  * @again: set to false if the search has scanned the whole of RAM
1167  */
1168 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1169 {
1170     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1171     if (pss->complete_round && pss->block == rs->last_seen_block &&
1172         pss->page >= rs->last_page) {
1173         /*
1174          * We've been once around the RAM and haven't found anything.
1175          * Give up.
1176          */
1177         *again = false;
1178         return false;
1179     }
1180     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1181         /* Didn't find anything in this RAM Block */
1182         pss->page = 0;
1183         pss->block = QLIST_NEXT_RCU(pss->block, next);
1184         if (!pss->block) {
1185             /* Hit the end of the list */
1186             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1187             /* Flag that we've looped */
1188             pss->complete_round = true;
1189             rs->ram_bulk_stage = false;
1190             if (migrate_use_xbzrle()) {
1191                 /* If xbzrle is on, stop using the data compression at this
1192                  * point. In theory, xbzrle can do better than compression.
1193                  */
1194                 flush_compressed_data(rs);
1195             }
1196         }
1197         /* Didn't find anything this time, but try again on the new block */
1198         *again = true;
1199         return false;
1200     } else {
1201         /* Can go around again, but... */
1202         *again = true;
1203         /* We've found something so probably don't need to */
1204         return true;
1205     }
1206 }
1207
1208 /**
1209  * unqueue_page: gets a page of the queue
1210  *
1211  * Helper for 'get_queued_page' - gets a page off the queue
1212  *
1213  * Returns the block of the page (or NULL if none available)
1214  *
1215  * @rs: current RAM state
1216  * @offset: used to return the offset within the RAMBlock
1217  */
1218 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1219 {
1220     RAMBlock *block = NULL;
1221
1222     qemu_mutex_lock(&rs->src_page_req_mutex);
1223     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1224         struct RAMSrcPageRequest *entry =
1225                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1226         block = entry->rb;
1227         *offset = entry->offset;
1228
1229         if (entry->len > TARGET_PAGE_SIZE) {
1230             entry->len -= TARGET_PAGE_SIZE;
1231             entry->offset += TARGET_PAGE_SIZE;
1232         } else {
1233             memory_region_unref(block->mr);
1234             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1235             g_free(entry);
1236         }
1237     }
1238     qemu_mutex_unlock(&rs->src_page_req_mutex);
1239
1240     return block;
1241 }
1242
1243 /**
1244  * get_queued_page: unqueue a page from the postocpy requests
1245  *
1246  * Skips pages that are already sent (!dirty)
1247  *
1248  * Returns if a queued page is found
1249  *
1250  * @rs: current RAM state
1251  * @pss: data about the state of the current dirty page scan
1252  */
1253 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1254 {
1255     RAMBlock  *block;
1256     ram_addr_t offset;
1257     bool dirty;
1258
1259     do {
1260         block = unqueue_page(rs, &offset);
1261         /*
1262          * We're sending this page, and since it's postcopy nothing else
1263          * will dirty it, and we must make sure it doesn't get sent again
1264          * even if this queue request was received after the background
1265          * search already sent it.
1266          */
1267         if (block) {
1268             unsigned long page;
1269
1270             page = offset >> TARGET_PAGE_BITS;
1271             dirty = test_bit(page, block->bmap);
1272             if (!dirty) {
1273                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1274                        page, test_bit(page, block->unsentmap));
1275             } else {
1276                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1277             }
1278         }
1279
1280     } while (block && !dirty);
1281
1282     if (block) {
1283         /*
1284          * As soon as we start servicing pages out of order, then we have
1285          * to kill the bulk stage, since the bulk stage assumes
1286          * in (migration_bitmap_find_and_reset_dirty) that every page is
1287          * dirty, that's no longer true.
1288          */
1289         rs->ram_bulk_stage = false;
1290
1291         /*
1292          * We want the background search to continue from the queued page
1293          * since the guest is likely to want other pages near to the page
1294          * it just requested.
1295          */
1296         pss->block = block;
1297         pss->page = offset >> TARGET_PAGE_BITS;
1298     }
1299
1300     return !!block;
1301 }
1302
1303 /**
1304  * migration_page_queue_free: drop any remaining pages in the ram
1305  * request queue
1306  *
1307  * It should be empty at the end anyway, but in error cases there may
1308  * be some left.  in case that there is any page left, we drop it.
1309  *
1310  */
1311 static void migration_page_queue_free(RAMState *rs)
1312 {
1313     struct RAMSrcPageRequest *mspr, *next_mspr;
1314     /* This queue generally should be empty - but in the case of a failed
1315      * migration might have some droppings in.
1316      */
1317     rcu_read_lock();
1318     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1319         memory_region_unref(mspr->rb->mr);
1320         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1321         g_free(mspr);
1322     }
1323     rcu_read_unlock();
1324 }
1325
1326 /**
1327  * ram_save_queue_pages: queue the page for transmission
1328  *
1329  * A request from postcopy destination for example.
1330  *
1331  * Returns zero on success or negative on error
1332  *
1333  * @rbname: Name of the RAMBLock of the request. NULL means the
1334  *          same that last one.
1335  * @start: starting address from the start of the RAMBlock
1336  * @len: length (in bytes) to send
1337  */
1338 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1339 {
1340     RAMBlock *ramblock;
1341     RAMState *rs = ram_state;
1342
1343     ram_counters.postcopy_requests++;
1344     rcu_read_lock();
1345     if (!rbname) {
1346         /* Reuse last RAMBlock */
1347         ramblock = rs->last_req_rb;
1348
1349         if (!ramblock) {
1350             /*
1351              * Shouldn't happen, we can't reuse the last RAMBlock if
1352              * it's the 1st request.
1353              */
1354             error_report("ram_save_queue_pages no previous block");
1355             goto err;
1356         }
1357     } else {
1358         ramblock = qemu_ram_block_by_name(rbname);
1359
1360         if (!ramblock) {
1361             /* We shouldn't be asked for a non-existent RAMBlock */
1362             error_report("ram_save_queue_pages no block '%s'", rbname);
1363             goto err;
1364         }
1365         rs->last_req_rb = ramblock;
1366     }
1367     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1368     if (start+len > ramblock->used_length) {
1369         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1370                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1371                      __func__, start, len, ramblock->used_length);
1372         goto err;
1373     }
1374
1375     struct RAMSrcPageRequest *new_entry =
1376         g_malloc0(sizeof(struct RAMSrcPageRequest));
1377     new_entry->rb = ramblock;
1378     new_entry->offset = start;
1379     new_entry->len = len;
1380
1381     memory_region_ref(ramblock->mr);
1382     qemu_mutex_lock(&rs->src_page_req_mutex);
1383     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1384     qemu_mutex_unlock(&rs->src_page_req_mutex);
1385     rcu_read_unlock();
1386
1387     return 0;
1388
1389 err:
1390     rcu_read_unlock();
1391     return -1;
1392 }
1393
1394 /**
1395  * ram_save_target_page: save one target page
1396  *
1397  * Returns the number of pages written
1398  *
1399  * @rs: current RAM state
1400  * @ms: current migration state
1401  * @pss: data about the page we want to send
1402  * @last_stage: if we are at the completion stage
1403  */
1404 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1405                                 bool last_stage)
1406 {
1407     int res = 0;
1408
1409     /* Check the pages is dirty and if it is send it */
1410     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1411         /*
1412          * If xbzrle is on, stop using the data compression after first
1413          * round of migration even if compression is enabled. In theory,
1414          * xbzrle can do better than compression.
1415          */
1416         if (migrate_use_compression() &&
1417             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1418             res = ram_save_compressed_page(rs, pss, last_stage);
1419         } else {
1420             res = ram_save_page(rs, pss, last_stage);
1421         }
1422
1423         if (res < 0) {
1424             return res;
1425         }
1426         if (pss->block->unsentmap) {
1427             clear_bit(pss->page, pss->block->unsentmap);
1428         }
1429     }
1430
1431     return res;
1432 }
1433
1434 /**
1435  * ram_save_host_page: save a whole host page
1436  *
1437  * Starting at *offset send pages up to the end of the current host
1438  * page. It's valid for the initial offset to point into the middle of
1439  * a host page in which case the remainder of the hostpage is sent.
1440  * Only dirty target pages are sent. Note that the host page size may
1441  * be a huge page for this block.
1442  * The saving stops at the boundary of the used_length of the block
1443  * if the RAMBlock isn't a multiple of the host page size.
1444  *
1445  * Returns the number of pages written or negative on error
1446  *
1447  * @rs: current RAM state
1448  * @ms: current migration state
1449  * @pss: data about the page we want to send
1450  * @last_stage: if we are at the completion stage
1451  */
1452 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1453                               bool last_stage)
1454 {
1455     int tmppages, pages = 0;
1456     size_t pagesize_bits =
1457         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1458
1459     do {
1460         tmppages = ram_save_target_page(rs, pss, last_stage);
1461         if (tmppages < 0) {
1462             return tmppages;
1463         }
1464
1465         pages += tmppages;
1466         pss->page++;
1467     } while ((pss->page & (pagesize_bits - 1)) &&
1468              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1469
1470     /* The offset we leave with is the last one we looked at */
1471     pss->page--;
1472     return pages;
1473 }
1474
1475 /**
1476  * ram_find_and_save_block: finds a dirty page and sends it to f
1477  *
1478  * Called within an RCU critical section.
1479  *
1480  * Returns the number of pages written where zero means no dirty pages
1481  *
1482  * @rs: current RAM state
1483  * @last_stage: if we are at the completion stage
1484  *
1485  * On systems where host-page-size > target-page-size it will send all the
1486  * pages in a host page that are dirty.
1487  */
1488
1489 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1490 {
1491     PageSearchStatus pss;
1492     int pages = 0;
1493     bool again, found;
1494
1495     /* No dirty page as there is zero RAM */
1496     if (!ram_bytes_total()) {
1497         return pages;
1498     }
1499
1500     pss.block = rs->last_seen_block;
1501     pss.page = rs->last_page;
1502     pss.complete_round = false;
1503
1504     if (!pss.block) {
1505         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1506     }
1507
1508     do {
1509         again = true;
1510         found = get_queued_page(rs, &pss);
1511
1512         if (!found) {
1513             /* priority queue empty, so just search for something dirty */
1514             found = find_dirty_block(rs, &pss, &again);
1515         }
1516
1517         if (found) {
1518             pages = ram_save_host_page(rs, &pss, last_stage);
1519         }
1520     } while (!pages && again);
1521
1522     rs->last_seen_block = pss.block;
1523     rs->last_page = pss.page;
1524
1525     return pages;
1526 }
1527
1528 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1529 {
1530     uint64_t pages = size / TARGET_PAGE_SIZE;
1531
1532     if (zero) {
1533         ram_counters.duplicate += pages;
1534     } else {
1535         ram_counters.normal += pages;
1536         ram_counters.transferred += size;
1537         qemu_update_position(f, size);
1538     }
1539 }
1540
1541 uint64_t ram_bytes_total(void)
1542 {
1543     RAMBlock *block;
1544     uint64_t total = 0;
1545
1546     rcu_read_lock();
1547     RAMBLOCK_FOREACH(block) {
1548         total += block->used_length;
1549     }
1550     rcu_read_unlock();
1551     return total;
1552 }
1553
/* Allocate the one-target-page XBZRLE.decoded_buf buffer */
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}
1558
/* Free XBZRLE.decoded_buf and clear the pointer so it is not reused */
static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}
1564
1565 static void ram_save_cleanup(void *opaque)
1566 {
1567     RAMState **rsp = opaque;
1568     RAMBlock *block;
1569
1570     /* caller have hold iothread lock or is in a bh, so there is
1571      * no writing race against this migration_bitmap
1572      */
1573     memory_global_dirty_log_stop();
1574
1575     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1576         g_free(block->bmap);
1577         block->bmap = NULL;
1578         g_free(block->unsentmap);
1579         block->unsentmap = NULL;
1580     }
1581
1582     XBZRLE_cache_lock();
1583     if (XBZRLE.cache) {
1584         cache_fini(XBZRLE.cache);
1585         g_free(XBZRLE.encoded_buf);
1586         g_free(XBZRLE.current_buf);
1587         g_free(XBZRLE.zero_target_page);
1588         XBZRLE.cache = NULL;
1589         XBZRLE.encoded_buf = NULL;
1590         XBZRLE.current_buf = NULL;
1591         XBZRLE.zero_target_page = NULL;
1592     }
1593     XBZRLE_cache_unlock();
1594     migration_page_queue_free(*rsp);
1595     compress_threads_save_cleanup();
1596     g_free(*rsp);
1597     *rsp = NULL;
1598 }
1599
1600 static void ram_state_reset(RAMState *rs)
1601 {
1602     rs->last_seen_block = NULL;
1603     rs->last_sent_block = NULL;
1604     rs->last_page = 0;
1605     rs->last_version = ram_list.version;
1606     rs->ram_bulk_stage = true;
1607 }
1608
1609 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1610
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr, 128 bits per line,
 * using '1' for set bits and '.' for clear bits.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * @todump: bitmap to dump; a NULL bitmap is ignored (nothing is printed)
 * @expected: bit value that is not worth reporting
 * @pages: number of bits in @todump
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    unsigned long cur;
    unsigned long linelen = 128;
    char linebuf[129];

    /* There is no default bitmap to fall back to, so never touch NULL */
    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        unsigned long curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (pages - cur < linelen) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
1644
1645 /* **** functions for postcopy ***** */
1646
1647 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1648 {
1649     struct RAMBlock *block;
1650
1651     RAMBLOCK_FOREACH(block) {
1652         unsigned long *bitmap = block->bmap;
1653         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1654         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1655
1656         while (run_start < range) {
1657             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1658             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1659                               (run_end - run_start) << TARGET_PAGE_BITS);
1660             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1661         }
1662     }
1663 }
1664
1665 /**
1666  * postcopy_send_discard_bm_ram: discard a RAMBlock
1667  *
1668  * Returns zero on success
1669  *
1670  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1671  * Note: At this point the 'unsentmap' is the processed bitmap combined
1672  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1673  *
1674  * @ms: current migration state
1675  * @pds: state for postcopy
1676  * @start: RAMBlock starting page
1677  * @length: RAMBlock size
1678  */
1679 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1680                                         PostcopyDiscardState *pds,
1681                                         RAMBlock *block)
1682 {
1683     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1684     unsigned long current;
1685     unsigned long *unsentmap = block->unsentmap;
1686
1687     for (current = 0; current < end; ) {
1688         unsigned long one = find_next_bit(unsentmap, end, current);
1689
1690         if (one <= end) {
1691             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1692             unsigned long discard_length;
1693
1694             if (zero >= end) {
1695                 discard_length = end - one;
1696             } else {
1697                 discard_length = zero - one;
1698             }
1699             if (discard_length) {
1700                 postcopy_discard_send_range(ms, pds, one, discard_length);
1701             }
1702             current = one + discard_length;
1703         } else {
1704             current = one;
1705         }
1706     }
1707
1708     return 0;
1709 }
1710
1711 /**
1712  * postcopy_each_ram_send_discard: discard all RAMBlocks
1713  *
1714  * Returns 0 for success or negative for error
1715  *
1716  * Utility for the outgoing postcopy code.
1717  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1718  *   passing it bitmap indexes and name.
1719  * (qemu_ram_foreach_block ends up passing unscaled lengths
1720  *  which would mean postcopy code would have to deal with target page)
1721  *
1722  * @ms: current migration state
1723  */
1724 static int postcopy_each_ram_send_discard(MigrationState *ms)
1725 {
1726     struct RAMBlock *block;
1727     int ret;
1728
1729     RAMBLOCK_FOREACH(block) {
1730         PostcopyDiscardState *pds =
1731             postcopy_discard_send_init(ms, block->idstr);
1732
1733         /*
1734          * Postcopy sends chunks of bitmap over the wire, but it
1735          * just needs indexes at this point, avoids it having
1736          * target page specific code.
1737          */
1738         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1739         postcopy_discard_send_finish(ms, pds);
1740         if (ret) {
1741             return ret;
1742         }
1743     }
1744
1745     return 0;
1746 }
1747
1748 /**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1750  *
1751  * Helper for postcopy_chunk_hostpages; it's called twice to
1752  * canonicalize the two bitmaps, that are similar, but one is
1753  * inverted.
1754  *
1755  * Postcopy requires that all target pages in a hostpage are dirty or
1756  * clean, not a mix.  This function canonicalizes the bitmaps.
1757  *
1758  * @ms: current migration state
1759  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1760  *               otherwise we need to canonicalize partially dirty host pages
1761  * @block: block that contains the page we want to canonicalize
1762  * @pds: state for postcopy
1763  */
1764 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1765                                           RAMBlock *block,
1766                                           PostcopyDiscardState *pds)
1767 {
1768     RAMState *rs = ram_state;
1769     unsigned long *bitmap = block->bmap;
1770     unsigned long *unsentmap = block->unsentmap;
1771     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1772     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1773     unsigned long run_start;
1774
1775     if (block->page_size == TARGET_PAGE_SIZE) {
1776         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1777         return;
1778     }
1779
1780     if (unsent_pass) {
1781         /* Find a sent page */
1782         run_start = find_next_zero_bit(unsentmap, pages, 0);
1783     } else {
1784         /* Find a dirty page */
1785         run_start = find_next_bit(bitmap, pages, 0);
1786     }
1787
1788     while (run_start < pages) {
1789         bool do_fixup = false;
1790         unsigned long fixup_start_addr;
1791         unsigned long host_offset;
1792
1793         /*
1794          * If the start of this run of pages is in the middle of a host
1795          * page, then we need to fixup this host page.
1796          */
1797         host_offset = run_start % host_ratio;
1798         if (host_offset) {
1799             do_fixup = true;
1800             run_start -= host_offset;
1801             fixup_start_addr = run_start;
1802             /* For the next pass */
1803             run_start = run_start + host_ratio;
1804         } else {
1805             /* Find the end of this run */
1806             unsigned long run_end;
1807             if (unsent_pass) {
1808                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1809             } else {
1810                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1811             }
1812             /*
1813              * If the end isn't at the start of a host page, then the
1814              * run doesn't finish at the end of a host page
1815              * and we need to discard.
1816              */
1817             host_offset = run_end % host_ratio;
1818             if (host_offset) {
1819                 do_fixup = true;
1820                 fixup_start_addr = run_end - host_offset;
1821                 /*
1822                  * This host page has gone, the next loop iteration starts
1823                  * from after the fixup
1824                  */
1825                 run_start = fixup_start_addr + host_ratio;
1826             } else {
1827                 /*
1828                  * No discards on this iteration, next loop starts from
1829                  * next sent/dirty page
1830                  */
1831                 run_start = run_end + 1;
1832             }
1833         }
1834
1835         if (do_fixup) {
1836             unsigned long page;
1837
1838             /* Tell the destination to discard this page */
1839             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1840                 /* For the unsent_pass we:
1841                  *     discard partially sent pages
1842                  * For the !unsent_pass (dirty) we:
1843                  *     discard partially dirty pages that were sent
1844                  *     (any partially sent pages were already discarded
1845                  *     by the previous unsent_pass)
1846                  */
1847                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1848                                             host_ratio);
1849             }
1850
1851             /* Clean up the bitmap */
1852             for (page = fixup_start_addr;
1853                  page < fixup_start_addr + host_ratio; page++) {
1854                 /* All pages in this host page are now not sent */
1855                 set_bit(page, unsentmap);
1856
1857                 /*
1858                  * Remark them as dirty, updating the count for any pages
1859                  * that weren't previously dirty.
1860                  */
1861                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1862             }
1863         }
1864
1865         if (unsent_pass) {
1866             /* Find the next sent page for the next iteration */
1867             run_start = find_next_zero_bit(unsentmap, pages, run_start);
1868         } else {
1869             /* Find the next dirty page for the next iteration */
1870             run_start = find_next_bit(bitmap, pages, run_start);
1871         }
1872     }
1873 }
1874
1875 /**
1876  * postcopy_chuck_hostpages: discrad any partially sent host page
1877  *
1878  * Utility for the outgoing postcopy code.
1879  *
1880  * Discard any partially sent host-page size chunks, mark any partially
1881  * dirty host-page size chunks as all dirty.  In this case the host-page
1882  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1883  *
1884  * Returns zero on success
1885  *
1886  * @ms: current migration state
1887  * @block: block we want to work with
1888  */
1889 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1890 {
1891     PostcopyDiscardState *pds =
1892         postcopy_discard_send_init(ms, block->idstr);
1893
1894     /* First pass: Discard all partially sent host pages */
1895     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1896     /*
1897      * Second pass: Ensure that all partially dirty host pages are made
1898      * fully dirty.
1899      */
1900     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1901
1902     postcopy_discard_send_finish(ms, pds);
1903     return 0;
1904 }
1905
1906 /**
1907  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1908  *
1909  * Returns zero on success
1910  *
1911  * Transmit the set of pages to be discarded after precopy to the target
1912  * these are pages that:
1913  *     a) Have been previously transmitted but are now dirty again
1914  *     b) Pages that have never been transmitted, this ensures that
1915  *        any pages on the destination that have been mapped by background
1916  *        tasks get discarded (transparent huge pages is the specific concern)
1917  * Hopefully this is pretty sparse
1918  *
1919  * @ms: current migration state
1920  */
1921 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1922 {
1923     RAMState *rs = ram_state;
1924     RAMBlock *block;
1925     int ret;
1926
1927     rcu_read_lock();
1928
1929     /* This should be our last sync, the src is now paused */
1930     migration_bitmap_sync(rs);
1931
1932     /* Easiest way to make sure we don't resume in the middle of a host-page */
1933     rs->last_seen_block = NULL;
1934     rs->last_sent_block = NULL;
1935     rs->last_page = 0;
1936
1937     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1938         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1939         unsigned long *bitmap = block->bmap;
1940         unsigned long *unsentmap = block->unsentmap;
1941
1942         if (!unsentmap) {
1943             /* We don't have a safe way to resize the sentmap, so
1944              * if the bitmap was resized it will be NULL at this
1945              * point.
1946              */
1947             error_report("migration ram resized during precopy phase");
1948             rcu_read_unlock();
1949             return -EINVAL;
1950         }
1951         /* Deal with TPS != HPS and huge pages */
1952         ret = postcopy_chunk_hostpages(ms, block);
1953         if (ret) {
1954             rcu_read_unlock();
1955             return ret;
1956         }
1957
1958         /*
1959          * Update the unsentmap to be unsentmap = unsentmap | dirty
1960          */
1961         bitmap_or(unsentmap, unsentmap, bitmap, pages);
1962 #ifdef DEBUG_POSTCOPY
1963         ram_debug_dump_bitmap(unsentmap, true, pages);
1964 #endif
1965     }
1966     trace_ram_postcopy_send_discard_bitmap();
1967
1968     ret = postcopy_each_ram_send_discard(ms);
1969     rcu_read_unlock();
1970
1971     return ret;
1972 }
1973
1974 /**
1975  * ram_discard_range: discard dirtied pages at the beginning of postcopy
1976  *
1977  * Returns zero on success
1978  *
1979  * @rbname: name of the RAMBlock of the request. NULL means the
1980  *          same that last one.
1981  * @start: RAMBlock starting page
1982  * @length: RAMBlock size
1983  */
1984 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1985 {
1986     int ret = -1;
1987
1988     trace_ram_discard_range(rbname, start, length);
1989
1990     rcu_read_lock();
1991     RAMBlock *rb = qemu_ram_block_by_name(rbname);
1992
1993     if (!rb) {
1994         error_report("ram_discard_range: Failed to find block '%s'", rbname);
1995         goto err;
1996     }
1997
1998     ret = ram_block_discard_range(rb, start, length);
1999
2000 err:
2001     rcu_read_unlock();
2002
2003     return ret;
2004 }
2005
2006 static int ram_state_init(RAMState **rsp)
2007 {
2008     *rsp = g_new0(RAMState, 1);
2009
2010     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2011     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2012     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2013
2014     if (migrate_use_xbzrle()) {
2015         XBZRLE_cache_lock();
2016         XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
2017         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
2018                                   TARGET_PAGE_SIZE,
2019                                   TARGET_PAGE_SIZE);
2020         if (!XBZRLE.cache) {
2021             XBZRLE_cache_unlock();
2022             error_report("Error creating cache");
2023             g_free(*rsp);
2024             *rsp = NULL;
2025             return -1;
2026         }
2027         XBZRLE_cache_unlock();
2028
2029         /* We prefer not to abort if there is no memory */
2030         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2031         if (!XBZRLE.encoded_buf) {
2032             error_report("Error allocating encoded_buf");
2033             g_free(*rsp);
2034             *rsp = NULL;
2035             return -1;
2036         }
2037
2038         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2039         if (!XBZRLE.current_buf) {
2040             error_report("Error allocating current_buf");
2041             g_free(XBZRLE.encoded_buf);
2042             XBZRLE.encoded_buf = NULL;
2043             g_free(*rsp);
2044             *rsp = NULL;
2045             return -1;
2046         }
2047     }
2048
2049     /* For memory_global_dirty_log_start below.  */
2050     qemu_mutex_lock_iothread();
2051
2052     qemu_mutex_lock_ramlist();
2053     rcu_read_lock();
2054     ram_state_reset(*rsp);
2055
2056     /* Skip setting bitmap if there is no RAM */
2057     if (ram_bytes_total()) {
2058         RAMBlock *block;
2059
2060         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2061             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
2062
2063             block->bmap = bitmap_new(pages);
2064             bitmap_set(block->bmap, 0, pages);
2065             if (migrate_postcopy_ram()) {
2066                 block->unsentmap = bitmap_new(pages);
2067                 bitmap_set(block->unsentmap, 0, pages);
2068             }
2069         }
2070     }
2071
2072     /*
2073      * Count the total number of pages used by ram blocks not including any
2074      * gaps due to alignment or unplugs.
2075      */
2076     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2077
2078     memory_global_dirty_log_start();
2079     migration_bitmap_sync(*rsp);
2080     qemu_mutex_unlock_ramlist();
2081     qemu_mutex_unlock_iothread();
2082     rcu_read_unlock();
2083
2084     return 0;
2085 }
2086
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */
2093
2094 /**
2095  * ram_save_setup: Setup RAM for migration
2096  *
2097  * Returns zero to indicate success and negative for error
2098  *
2099  * @f: QEMUFile where to send the data
2100  * @opaque: RAMState pointer
2101  */
2102 static int ram_save_setup(QEMUFile *f, void *opaque)
2103 {
2104     RAMState **rsp = opaque;
2105     RAMBlock *block;
2106
2107     /* migration has already setup the bitmap, reuse it. */
2108     if (!migration_in_colo_state()) {
2109         if (ram_state_init(rsp) != 0) {
2110             return -1;
2111         }
2112     }
2113     (*rsp)->f = f;
2114
2115     rcu_read_lock();
2116
2117     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2118
2119     RAMBLOCK_FOREACH(block) {
2120         qemu_put_byte(f, strlen(block->idstr));
2121         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2122         qemu_put_be64(f, block->used_length);
2123         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2124             qemu_put_be64(f, block->page_size);
2125         }
2126     }
2127
2128     rcu_read_unlock();
2129     compress_threads_save_setup();
2130
2131     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2132     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2133
2134     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2135
2136     return 0;
2137 }
2138
2139 /**
2140  * ram_save_iterate: iterative stage for migration
2141  *
2142  * Returns zero to indicate success and negative for error
2143  *
2144  * @f: QEMUFile where to send the data
2145  * @opaque: RAMState pointer
2146  */
2147 static int ram_save_iterate(QEMUFile *f, void *opaque)
2148 {
2149     RAMState **temp = opaque;
2150     RAMState *rs = *temp;
2151     int ret;
2152     int i;
2153     int64_t t0;
2154     int done = 0;
2155
2156     rcu_read_lock();
2157     if (ram_list.version != rs->last_version) {
2158         ram_state_reset(rs);
2159     }
2160
2161     /* Read version before ram_list.blocks */
2162     smp_rmb();
2163
2164     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2165
2166     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2167     i = 0;
2168     while ((ret = qemu_file_rate_limit(f)) == 0) {
2169         int pages;
2170
2171         pages = ram_find_and_save_block(rs, false);
2172         /* no more pages to sent */
2173         if (pages == 0) {
2174             done = 1;
2175             break;
2176         }
2177         rs->iterations++;
2178
2179         /* we want to check in the 1st loop, just in case it was the 1st time
2180            and we had to sync the dirty bitmap.
2181            qemu_get_clock_ns() is a bit expensive, so we only check each some
2182            iterations
2183         */
2184         if ((i & 63) == 0) {
2185             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2186             if (t1 > MAX_WAIT) {
2187                 trace_ram_save_iterate_big_wait(t1, i);
2188                 break;
2189             }
2190         }
2191         i++;
2192     }
2193     flush_compressed_data(rs);
2194     rcu_read_unlock();
2195
2196     /*
2197      * Must occur before EOS (or any QEMUFile operation)
2198      * because of RDMA protocol.
2199      */
2200     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2201
2202     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2203     ram_counters.transferred += 8;
2204
2205     ret = qemu_file_get_error(f);
2206     if (ret < 0) {
2207         return ret;
2208     }
2209
2210     return done;
2211 }
2212
2213 /**
2214  * ram_save_complete: function called to send the remaining amount of ram
2215  *
2216  * Returns zero to indicate success
2217  *
2218  * Called with iothread lock
2219  *
2220  * @f: QEMUFile where to send the data
2221  * @opaque: RAMState pointer
2222  */
2223 static int ram_save_complete(QEMUFile *f, void *opaque)
2224 {
2225     RAMState **temp = opaque;
2226     RAMState *rs = *temp;
2227
2228     rcu_read_lock();
2229
2230     if (!migration_in_postcopy()) {
2231         migration_bitmap_sync(rs);
2232     }
2233
2234     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2235
2236     /* try transferring iterative blocks of memory */
2237
2238     /* flush all remaining blocks regardless of rate limiting */
2239     while (true) {
2240         int pages;
2241
2242         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2243         /* no more blocks to sent */
2244         if (pages == 0) {
2245             break;
2246         }
2247     }
2248
2249     flush_compressed_data(rs);
2250     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2251
2252     rcu_read_unlock();
2253
2254     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2255
2256     return 0;
2257 }
2258
2259 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2260                              uint64_t *non_postcopiable_pending,
2261                              uint64_t *postcopiable_pending)
2262 {
2263     RAMState **temp = opaque;
2264     RAMState *rs = *temp;
2265     uint64_t remaining_size;
2266
2267     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2268
2269     if (!migration_in_postcopy() &&
2270         remaining_size < max_size) {
2271         qemu_mutex_lock_iothread();
2272         rcu_read_lock();
2273         migration_bitmap_sync(rs);
2274         rcu_read_unlock();
2275         qemu_mutex_unlock_iothread();
2276         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2277     }
2278
2279     if (migrate_postcopy_ram()) {
2280         /* We can do postcopy, and all the data is postcopiable */
2281         *postcopiable_pending += remaining_size;
2282     } else {
2283         *non_postcopiable_pending += remaining_size;
2284     }
2285 }
2286
2287 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2288 {
2289     unsigned int xh_len;
2290     int xh_flags;
2291     uint8_t *loaded_data;
2292
2293     /* extract RLE header */
2294     xh_flags = qemu_get_byte(f);
2295     xh_len = qemu_get_be16(f);
2296
2297     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2298         error_report("Failed to load XBZRLE page - wrong compression!");
2299         return -1;
2300     }
2301
2302     if (xh_len > TARGET_PAGE_SIZE) {
2303         error_report("Failed to load XBZRLE page - len overflow!");
2304         return -1;
2305     }
2306     loaded_data = XBZRLE.decoded_buf;
2307     /* load data and decode */
2308     /* it can change loaded_data to point to an internal buffer */
2309     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2310
2311     /* decode RLE */
2312     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2313                              TARGET_PAGE_SIZE) == -1) {
2314         error_report("Failed to load XBZRLE page - decode error!");
2315         return -1;
2316     }
2317
2318     return 0;
2319 }
2320
2321 /**
2322  * ram_block_from_stream: read a RAMBlock id from the migration stream
2323  *
2324  * Must be called from within a rcu critical section.
2325  *
2326  * Returns a pointer from within the RCU-protected ram_list.
2327  *
2328  * @f: QEMUFile where to read the data from
2329  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2330  */
2331 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2332 {
2333     static RAMBlock *block = NULL;
2334     char id[256];
2335     uint8_t len;
2336
2337     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2338         if (!block) {
2339             error_report("Ack, bad migration stream!");
2340             return NULL;
2341         }
2342         return block;
2343     }
2344
2345     len = qemu_get_byte(f);
2346     qemu_get_buffer(f, (uint8_t *)id, len);
2347     id[len] = 0;
2348
2349     block = qemu_ram_block_by_name(id);
2350     if (!block) {
2351         error_report("Can't find block %s", id);
2352         return NULL;
2353     }
2354
2355     return block;
2356 }
2357
2358 static inline void *host_from_ram_block_offset(RAMBlock *block,
2359                                                ram_addr_t offset)
2360 {
2361     if (!offset_in_ramblock(block, offset)) {
2362         return NULL;
2363     }
2364
2365     return block->host + offset;
2366 }
2367
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * The memory is left untouched when @ch is zero and the range already
 * reads as all zeroes; in every other case it is filled with @ch.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Already zero and asked for zero: skip the write */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
2384
2385 static void *do_data_decompress(void *opaque)
2386 {
2387     DecompressParam *param = opaque;
2388     unsigned long pagesize;
2389     uint8_t *des;
2390     int len;
2391
2392     qemu_mutex_lock(&param->mutex);
2393     while (!param->quit) {
2394         if (param->des) {
2395             des = param->des;
2396             len = param->len;
2397             param->des = 0;
2398             qemu_mutex_unlock(&param->mutex);
2399
2400             pagesize = TARGET_PAGE_SIZE;
2401             /* uncompress() will return failed in some case, especially
2402              * when the page is dirted when doing the compression, it's
2403              * not a problem because the dirty page will be retransferred
2404              * and uncompress() won't break the data in other pages.
2405              */
2406             uncompress((Bytef *)des, &pagesize,
2407                        (const Bytef *)param->compbuf, len);
2408
2409             qemu_mutex_lock(&decomp_done_lock);
2410             param->done = true;
2411             qemu_cond_signal(&decomp_done_cond);
2412             qemu_mutex_unlock(&decomp_done_lock);
2413
2414             qemu_mutex_lock(&param->mutex);
2415         } else {
2416             qemu_cond_wait(&param->cond, &param->mutex);
2417         }
2418     }
2419     qemu_mutex_unlock(&param->mutex);
2420
2421     return NULL;
2422 }
2423
2424 static void wait_for_decompress_done(void)
2425 {
2426     int idx, thread_count;
2427
2428     if (!migrate_use_compression()) {
2429         return;
2430     }
2431
2432     thread_count = migrate_decompress_threads();
2433     qemu_mutex_lock(&decomp_done_lock);
2434     for (idx = 0; idx < thread_count; idx++) {
2435         while (!decomp_param[idx].done) {
2436             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2437         }
2438     }
2439     qemu_mutex_unlock(&decomp_done_lock);
2440 }
2441
2442 static void compress_threads_load_setup(void)
2443 {
2444     int i, thread_count;
2445
2446     if (!migrate_use_compression()) {
2447         return;
2448     }
2449     thread_count = migrate_decompress_threads();
2450     decompress_threads = g_new0(QemuThread, thread_count);
2451     decomp_param = g_new0(DecompressParam, thread_count);
2452     qemu_mutex_init(&decomp_done_lock);
2453     qemu_cond_init(&decomp_done_cond);
2454     for (i = 0; i < thread_count; i++) {
2455         qemu_mutex_init(&decomp_param[i].mutex);
2456         qemu_cond_init(&decomp_param[i].cond);
2457         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2458         decomp_param[i].done = true;
2459         decomp_param[i].quit = false;
2460         qemu_thread_create(decompress_threads + i, "decompress",
2461                            do_data_decompress, decomp_param + i,
2462                            QEMU_THREAD_JOINABLE);
2463     }
2464 }
2465
2466 static void compress_threads_load_cleanup(void)
2467 {
2468     int i, thread_count;
2469
2470     if (!migrate_use_compression()) {
2471         return;
2472     }
2473     thread_count = migrate_decompress_threads();
2474     for (i = 0; i < thread_count; i++) {
2475         qemu_mutex_lock(&decomp_param[i].mutex);
2476         decomp_param[i].quit = true;
2477         qemu_cond_signal(&decomp_param[i].cond);
2478         qemu_mutex_unlock(&decomp_param[i].mutex);
2479     }
2480     for (i = 0; i < thread_count; i++) {
2481         qemu_thread_join(decompress_threads + i);
2482         qemu_mutex_destroy(&decomp_param[i].mutex);
2483         qemu_cond_destroy(&decomp_param[i].cond);
2484         g_free(decomp_param[i].compbuf);
2485     }
2486     g_free(decompress_threads);
2487     g_free(decomp_param);
2488     decompress_threads = NULL;
2489     decomp_param = NULL;
2490 }
2491
2492 static void decompress_data_with_multi_threads(QEMUFile *f,
2493                                                void *host, int len)
2494 {
2495     int idx, thread_count;
2496
2497     thread_count = migrate_decompress_threads();
2498     qemu_mutex_lock(&decomp_done_lock);
2499     while (true) {
2500         for (idx = 0; idx < thread_count; idx++) {
2501             if (decomp_param[idx].done) {
2502                 decomp_param[idx].done = false;
2503                 qemu_mutex_lock(&decomp_param[idx].mutex);
2504                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2505                 decomp_param[idx].des = host;
2506                 decomp_param[idx].len = len;
2507                 qemu_cond_signal(&decomp_param[idx].cond);
2508                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2509                 break;
2510             }
2511         }
2512         if (idx < thread_count) {
2513             break;
2514         } else {
2515             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2516         }
2517     }
2518     qemu_mutex_unlock(&decomp_done_lock);
2519 }
2520
2521 /**
2522  * ram_load_setup: Setup RAM for migration incoming side
2523  *
2524  * Returns zero to indicate success and negative for error
2525  *
2526  * @f: QEMUFile where to receive the data
2527  * @opaque: RAMState pointer
2528  */
2529 static int ram_load_setup(QEMUFile *f, void *opaque)
2530 {
2531     xbzrle_load_setup();
2532     compress_threads_load_setup();
2533     return 0;
2534 }
2535
/**
 * ram_load_cleanup: undo ram_load_setup() on the incoming side
 *
 * Releases the XBZRLE load state and stops the decompression threads.
 *
 * Returns zero (always)
 *
 * @opaque: opaque pointer supplied by the caller (unused here)
 */
static int ram_load_cleanup(void *opaque)
{
    /* Same order as the setup: XBZRLE, then decompression workers */
    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    return 0;
}
2542
2543 /**
2544  * ram_postcopy_incoming_init: allocate postcopy data structures
2545  *
2546  * Returns 0 for success and negative if there was one error
2547  *
2548  * @mis: current migration incoming state
2549  *
2550  * Allocate data structures etc needed by incoming migration with
2551  * postcopy-ram. postcopy-ram's similarly names
2552  * postcopy_ram_incoming_init does the work.
2553  */
2554 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2555 {
2556     unsigned long ram_pages = last_ram_page();
2557
2558     return postcopy_ram_incoming_init(mis, ram_pages);
2559 }
2560
2561 /**
2562  * ram_load_postcopy: load a page in postcopy case
2563  *
2564  * Returns 0 for success or -errno in case of error
2565  *
2566  * Called in postcopy mode by ram_load().
2567  * rcu_read_lock is taken prior to this being called.
2568  *
2569  * @f: QEMUFile where to send the data
2570  */
2571 static int ram_load_postcopy(QEMUFile *f)
2572 {
2573     int flags = 0, ret = 0;
2574     bool place_needed = false;
2575     bool matching_page_sizes = false;
2576     MigrationIncomingState *mis = migration_incoming_get_current();
2577     /* Temporary page that is later 'placed' */
2578     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2579     void *last_host = NULL;
2580     bool all_zero = false;
2581
2582     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2583         ram_addr_t addr;
2584         void *host = NULL;
2585         void *page_buffer = NULL;
2586         void *place_source = NULL;
2587         RAMBlock *block = NULL;
2588         uint8_t ch;
2589
2590         addr = qemu_get_be64(f);
2591         flags = addr & ~TARGET_PAGE_MASK;
2592         addr &= TARGET_PAGE_MASK;
2593
2594         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2595         place_needed = false;
2596         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2597             block = ram_block_from_stream(f, flags);
2598
2599             host = host_from_ram_block_offset(block, addr);
2600             if (!host) {
2601                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2602                 ret = -EINVAL;
2603                 break;
2604             }
2605             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2606             /*
2607              * Postcopy requires that we place whole host pages atomically;
2608              * these may be huge pages for RAMBlocks that are backed by
2609              * hugetlbfs.
2610              * To make it atomic, the data is read into a temporary page
2611              * that's moved into place later.
2612              * The migration protocol uses,  possibly smaller, target-pages
2613              * however the source ensures it always sends all the components
2614              * of a host page in order.
2615              */
2616             page_buffer = postcopy_host_page +
2617                           ((uintptr_t)host & (block->page_size - 1));
2618             /* If all TP are zero then we can optimise the place */
2619             if (!((uintptr_t)host & (block->page_size - 1))) {
2620                 all_zero = true;
2621             } else {
2622                 /* not the 1st TP within the HP */
2623                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2624                     error_report("Non-sequential target page %p/%p",
2625                                   host, last_host);
2626                     ret = -EINVAL;
2627                     break;
2628                 }
2629             }
2630
2631
2632             /*
2633              * If it's the last part of a host page then we place the host
2634              * page
2635              */
2636             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2637                                      (block->page_size - 1)) == 0;
2638             place_source = postcopy_host_page;
2639         }
2640         last_host = host;
2641
2642         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2643         case RAM_SAVE_FLAG_ZERO:
2644             ch = qemu_get_byte(f);
2645             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2646             if (ch) {
2647                 all_zero = false;
2648             }
2649             break;
2650
2651         case RAM_SAVE_FLAG_PAGE:
2652             all_zero = false;
2653             if (!place_needed || !matching_page_sizes) {
2654                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2655             } else {
2656                 /* Avoids the qemu_file copy during postcopy, which is
2657                  * going to do a copy later; can only do it when we
2658                  * do this read in one go (matching page sizes)
2659                  */
2660                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2661                                          TARGET_PAGE_SIZE);
2662             }
2663             break;
2664         case RAM_SAVE_FLAG_EOS:
2665             /* normal exit */
2666             break;
2667         default:
2668             error_report("Unknown combination of migration flags: %#x"
2669                          " (postcopy mode)", flags);
2670             ret = -EINVAL;
2671         }
2672
2673         if (place_needed) {
2674             /* This gets called at the last target page in the host page */
2675             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2676
2677             if (all_zero) {
2678                 ret = postcopy_place_page_zero(mis, place_dest,
2679                                                block->page_size);
2680             } else {
2681                 ret = postcopy_place_page(mis, place_dest,
2682                                           place_source, block->page_size);
2683             }
2684         }
2685         if (!ret) {
2686             ret = qemu_file_get_error(f);
2687         }
2688     }
2689
2690     return ret;
2691 }
2692
2693 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2694 {
2695     int flags = 0, ret = 0, invalid_flags = 0;
2696     static uint64_t seq_iter;
2697     int len = 0;
2698     /*
2699      * If system is running in postcopy mode, page inserts to host memory must
2700      * be atomic
2701      */
2702     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2703     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2704     bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2705
2706     seq_iter++;
2707
2708     if (version_id != 4) {
2709         ret = -EINVAL;
2710     }
2711
2712     if (!migrate_use_compression()) {
2713         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2714     }
2715     /* This RCU critical section can be very long running.
2716      * When RCU reclaims in the code start to become numerous,
2717      * it will be necessary to reduce the granularity of this
2718      * critical section.
2719      */
2720     rcu_read_lock();
2721
2722     if (postcopy_running) {
2723         ret = ram_load_postcopy(f);
2724     }
2725
2726     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2727         ram_addr_t addr, total_ram_bytes;
2728         void *host = NULL;
2729         uint8_t ch;
2730
2731         addr = qemu_get_be64(f);
2732         flags = addr & ~TARGET_PAGE_MASK;
2733         addr &= TARGET_PAGE_MASK;
2734
2735         if (flags & invalid_flags) {
2736             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2737                 error_report("Received an unexpected compressed page");
2738             }
2739
2740             ret = -EINVAL;
2741             break;
2742         }
2743
2744         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2745                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2746             RAMBlock *block = ram_block_from_stream(f, flags);
2747
2748             host = host_from_ram_block_offset(block, addr);
2749             if (!host) {
2750                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2751                 ret = -EINVAL;
2752                 break;
2753             }
2754             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2755         }
2756
2757         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2758         case RAM_SAVE_FLAG_MEM_SIZE:
2759             /* Synchronize RAM block list */
2760             total_ram_bytes = addr;
2761             while (!ret && total_ram_bytes) {
2762                 RAMBlock *block;
2763                 char id[256];
2764                 ram_addr_t length;
2765
2766                 len = qemu_get_byte(f);
2767                 qemu_get_buffer(f, (uint8_t *)id, len);
2768                 id[len] = 0;
2769                 length = qemu_get_be64(f);
2770
2771                 block = qemu_ram_block_by_name(id);
2772                 if (block) {
2773                     if (length != block->used_length) {
2774                         Error *local_err = NULL;
2775
2776                         ret = qemu_ram_resize(block, length,
2777                                               &local_err);
2778                         if (local_err) {
2779                             error_report_err(local_err);
2780                         }
2781                     }
2782                     /* For postcopy we need to check hugepage sizes match */
2783                     if (postcopy_advised &&
2784                         block->page_size != qemu_host_page_size) {
2785                         uint64_t remote_page_size = qemu_get_be64(f);
2786                         if (remote_page_size != block->page_size) {
2787                             error_report("Mismatched RAM page size %s "
2788                                          "(local) %zd != %" PRId64,
2789                                          id, block->page_size,
2790                                          remote_page_size);
2791                             ret = -EINVAL;
2792                         }
2793                     }
2794                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2795                                           block->idstr);
2796                 } else {
2797                     error_report("Unknown ramblock \"%s\", cannot "
2798                                  "accept migration", id);
2799                     ret = -EINVAL;
2800                 }
2801
2802                 total_ram_bytes -= length;
2803             }
2804             break;
2805
2806         case RAM_SAVE_FLAG_ZERO:
2807             ch = qemu_get_byte(f);
2808             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2809             break;
2810
2811         case RAM_SAVE_FLAG_PAGE:
2812             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2813             break;
2814
2815         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2816             len = qemu_get_be32(f);
2817             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2818                 error_report("Invalid compressed data length: %d", len);
2819                 ret = -EINVAL;
2820                 break;
2821             }
2822             decompress_data_with_multi_threads(f, host, len);
2823             break;
2824
2825         case RAM_SAVE_FLAG_XBZRLE:
2826             if (load_xbzrle(f, addr, host) < 0) {
2827                 error_report("Failed to decompress XBZRLE page at "
2828                              RAM_ADDR_FMT, addr);
2829                 ret = -EINVAL;
2830                 break;
2831             }
2832             break;
2833         case RAM_SAVE_FLAG_EOS:
2834             /* normal exit */
2835             break;
2836         default:
2837             if (flags & RAM_SAVE_FLAG_HOOK) {
2838                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2839             } else {
2840                 error_report("Unknown combination of migration flags: %#x",
2841                              flags);
2842                 ret = -EINVAL;
2843             }
2844         }
2845         if (!ret) {
2846             ret = qemu_file_get_error(f);
2847         }
2848     }
2849
2850     wait_for_decompress_done();
2851     rcu_read_unlock();
2852     trace_ram_load_complete(ret, seq_iter);
2853     return ret;
2854 }
2855
/*
 * .has_postcopy callback installed in savevm_ram_handlers.
 *
 * @opaque: pointer supplied at registration time; not consulted here.
 *
 * Returns: the value reported by migrate_postcopy_ram().
 */
static bool ram_has_postcopy(void *opaque)
{
    bool postcopy_ram = migrate_postcopy_ram();

    return postcopy_ram;
}
2860
2861 static SaveVMHandlers savevm_ram_handlers = {
2862     .save_setup = ram_save_setup,
2863     .save_live_iterate = ram_save_iterate,
2864     .save_live_complete_postcopy = ram_save_complete,
2865     .save_live_complete_precopy = ram_save_complete,
2866     .has_postcopy = ram_has_postcopy,
2867     .save_live_pending = ram_save_pending,
2868     .load_state = ram_load,
2869     .save_cleanup = ram_save_cleanup,
2870     .load_setup = ram_load_setup,
2871     .load_cleanup = ram_load_cleanup,
2872 };
2873
2874 void ram_mig_init(void)
2875 {
2876     qemu_mutex_init(&XBZRLE.lock);
2877     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2878 }