migration/ram.c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
28 #include "qemu/osdep.h"
29 #include "cpu.h"
30 #include <zlib.h>
31 #include "qapi-event.h"
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "migration/page_cache.h"
44 #include "qemu/error-report.h"
45 #include "trace.h"
46 #include "exec/ram_addr.h"
47 #include "qemu/rcu_queue.h"
48 #include "migration/colo.h"
49 #include "migration/block.h"
51 /***********************************************************/
52 /* ram save/restore */
54 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
55 * worked for pages that were filled with the same char. We switched
56 * it to only search for the zero value. And to avoid confusion with
57 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
60 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
61 #define RAM_SAVE_FLAG_ZERO 0x02
62 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
63 #define RAM_SAVE_FLAG_PAGE 0x08
64 #define RAM_SAVE_FLAG_EOS 0x10
65 #define RAM_SAVE_FLAG_CONTINUE 0x20
66 #define RAM_SAVE_FLAG_XBZRLE 0x40
67 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
68 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
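/* Editor's note: these flags are OR'ed into the low bits of the 64-bit
 * page offset that save_page_header() puts on the wire; offsets are
 * target-page aligned, so the low bits are free to carry the flags. */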
70 static inline bool is_zero_range(uint8_t *p, uint64_t size)
72 return buffer_is_zero(p, size);
75 XBZRLECacheStats xbzrle_counters;
77 /* This struct contains the XBZRLE cache and a static page
78    used for compression */
79 static struct {
80 /* buffer used for XBZRLE encoding */
81 uint8_t *encoded_buf;
82 /* buffer for storing page content */
83 uint8_t *current_buf;
84 /* Cache for XBZRLE, Protected by lock. */
85 PageCache *cache;
86 QemuMutex lock;
87 /* it will store a page full of zeros */
88 uint8_t *zero_target_page;
89 /* buffer used for XBZRLE decoding */
90 uint8_t *decoded_buf;
91 } XBZRLE;
93 static void XBZRLE_cache_lock(void)
95 if (migrate_use_xbzrle())
96 qemu_mutex_lock(&XBZRLE.lock);
99 static void XBZRLE_cache_unlock(void)
101 if (migrate_use_xbzrle())
102 qemu_mutex_unlock(&XBZRLE.lock);
106 * xbzrle_cache_resize: resize the xbzrle cache
108 * This function is called from qmp_migrate_set_cache_size in main
109 * thread, possibly while a migration is in progress. A running
110 * migration may be using the cache and might finish during this call,
111 * hence changes to the cache are protected by the XBZRLE.lock mutex.
113 * Returns the new_size or negative in case of error.
115 * @new_size: new cache size
117 int64_t xbzrle_cache_resize(int64_t new_size)
119 PageCache *new_cache;
120 int64_t ret;
122 if (new_size < TARGET_PAGE_SIZE) {
123 return -1;
126 XBZRLE_cache_lock();
128 if (XBZRLE.cache != NULL) {
129 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
130 goto out_new_size;
132 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
133 TARGET_PAGE_SIZE);
134 if (!new_cache) {
135 error_report("Error creating cache");
136 ret = -1;
137 goto out;
140 cache_fini(XBZRLE.cache);
141 XBZRLE.cache = new_cache;
144 out_new_size:
145 ret = pow2floor(new_size);
146 out:
147 XBZRLE_cache_unlock();
148 return ret;
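/* Illustrative note (not in the original source): because the result is
 * rounded down with pow2floor(), requesting e.g. a 600 MiB cache yields
 * a 512 MiB cache; callers should use the returned size rather than
 * assume the requested one was applied. */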
152 * An outstanding page request, on the source, having been received
153 * and queued
155 struct RAMSrcPageRequest {
156 RAMBlock *rb;
157 hwaddr offset;
158 hwaddr len;
160 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
163 /* State of RAM for migration */
164 struct RAMState {
165 /* QEMUFile used for this migration */
166 QEMUFile *f;
167 /* Last block that we have visited searching for dirty pages */
168 RAMBlock *last_seen_block;
169 /* Last block from where we have sent data */
170 RAMBlock *last_sent_block;
171 /* Last dirty target page we have sent */
172 ram_addr_t last_page;
173 /* last ram version we have seen */
174 uint32_t last_version;
175 /* We are in the first round */
176 bool ram_bulk_stage;
177 /* How many times we have dirty too many pages */
178 int dirty_rate_high_cnt;
179 /* these variables are used for bitmap sync */
180 /* last time we did a full bitmap_sync */
181 int64_t time_last_bitmap_sync;
182 /* bytes transferred at start_time */
183 uint64_t bytes_xfer_prev;
184 /* number of dirty pages since start_time */
185 uint64_t num_dirty_pages_period;
186 /* xbzrle misses since the beginning of the period */
187 uint64_t xbzrle_cache_miss_prev;
188 /* number of iterations at the beginning of period */
189 uint64_t iterations_prev;
190 /* Iterations since start */
191 uint64_t iterations;
192 /* number of dirty bits in the bitmap */
193 uint64_t migration_dirty_pages;
194 /* protects modification of the bitmap */
195 QemuMutex bitmap_mutex;
196 /* The RAMBlock used in the last src_page_requests */
197 RAMBlock *last_req_rb;
198 /* Queue of outstanding page requests from the destination */
199 QemuMutex src_page_req_mutex;
200 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
202 typedef struct RAMState RAMState;
204 static RAMState *ram_state;
206 uint64_t ram_bytes_remaining(void)
208 return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
211 MigrationStats ram_counters;
213 /* used by the search for pages to send */
214 struct PageSearchStatus {
215 /* Current block being searched */
216 RAMBlock *block;
217 /* Current page to search from */
218 unsigned long page;
219 /* Set once we wrap around */
220 bool complete_round;
222 typedef struct PageSearchStatus PageSearchStatus;
224 struct CompressParam {
225 bool done;
226 bool quit;
227 QEMUFile *file;
228 QemuMutex mutex;
229 QemuCond cond;
230 RAMBlock *block;
231 ram_addr_t offset;
233 typedef struct CompressParam CompressParam;
235 struct DecompressParam {
236 bool done;
237 bool quit;
238 QemuMutex mutex;
239 QemuCond cond;
240 void *des;
241 uint8_t *compbuf;
242 int len;
244 typedef struct DecompressParam DecompressParam;
246 static CompressParam *comp_param;
247 static QemuThread *compress_threads;
248 /* comp_done_cond is used to wake up the migration thread when
249 * one of the compression threads has finished the compression.
250 * comp_done_lock is used together with comp_done_cond.
252 static QemuMutex comp_done_lock;
253 static QemuCond comp_done_cond;
254 /* The empty QEMUFileOps will be used by file in CompressParam */
255 static const QEMUFileOps empty_ops = { };
257 static DecompressParam *decomp_param;
258 static QemuThread *decompress_threads;
259 static QemuMutex decomp_done_lock;
260 static QemuCond decomp_done_cond;
262 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
263 ram_addr_t offset);
265 static void *do_data_compress(void *opaque)
267 CompressParam *param = opaque;
268 RAMBlock *block;
269 ram_addr_t offset;
271 qemu_mutex_lock(&param->mutex);
272 while (!param->quit) {
273 if (param->block) {
274 block = param->block;
275 offset = param->offset;
276 param->block = NULL;
277 qemu_mutex_unlock(&param->mutex);
279 do_compress_ram_page(param->file, block, offset);
281 qemu_mutex_lock(&comp_done_lock);
282 param->done = true;
283 qemu_cond_signal(&comp_done_cond);
284 qemu_mutex_unlock(&comp_done_lock);
286 qemu_mutex_lock(&param->mutex);
287 } else {
288 qemu_cond_wait(&param->cond, &param->mutex);
291 qemu_mutex_unlock(&param->mutex);
293 return NULL;
296 static inline void terminate_compression_threads(void)
298 int idx, thread_count;
300 thread_count = migrate_compress_threads();
302 for (idx = 0; idx < thread_count; idx++) {
303 qemu_mutex_lock(&comp_param[idx].mutex);
304 comp_param[idx].quit = true;
305 qemu_cond_signal(&comp_param[idx].cond);
306 qemu_mutex_unlock(&comp_param[idx].mutex);
310 static void compress_threads_save_cleanup(void)
312 int i, thread_count;
314 if (!migrate_use_compression()) {
315 return;
317 terminate_compression_threads();
318 thread_count = migrate_compress_threads();
319 for (i = 0; i < thread_count; i++) {
320 qemu_thread_join(compress_threads + i);
321 qemu_fclose(comp_param[i].file);
322 qemu_mutex_destroy(&comp_param[i].mutex);
323 qemu_cond_destroy(&comp_param[i].cond);
325 qemu_mutex_destroy(&comp_done_lock);
326 qemu_cond_destroy(&comp_done_cond);
327 g_free(compress_threads);
328 g_free(comp_param);
329 compress_threads = NULL;
330 comp_param = NULL;
333 static void compress_threads_save_setup(void)
335 int i, thread_count;
337 if (!migrate_use_compression()) {
338 return;
340 thread_count = migrate_compress_threads();
341 compress_threads = g_new0(QemuThread, thread_count);
342 comp_param = g_new0(CompressParam, thread_count);
343 qemu_cond_init(&comp_done_cond);
344 qemu_mutex_init(&comp_done_lock);
345 for (i = 0; i < thread_count; i++) {
346 /* comp_param[i].file is just used as a dummy buffer to save data,
347 * set its ops to empty.
349 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
350 comp_param[i].done = true;
351 comp_param[i].quit = false;
352 qemu_mutex_init(&comp_param[i].mutex);
353 qemu_cond_init(&comp_param[i].cond);
354 qemu_thread_create(compress_threads + i, "compress",
355 do_data_compress, comp_param + i,
356 QEMU_THREAD_JOINABLE);
360 /* Multiple fd's */
362 struct MultiFDSendParams {
363 uint8_t id;
364 char *name;
365 QemuThread thread;
366 QemuSemaphore sem;
367 QemuMutex mutex;
368 bool quit;
370 typedef struct MultiFDSendParams MultiFDSendParams;
372 struct {
373 MultiFDSendParams *params;
374 /* number of created threads */
375 int count;
376 } *multifd_send_state;
378 static void terminate_multifd_send_threads(Error *errp)
380 int i;
382 for (i = 0; i < multifd_send_state->count; i++) {
383 MultiFDSendParams *p = &multifd_send_state->params[i];
385 qemu_mutex_lock(&p->mutex);
386 p->quit = true;
387 qemu_sem_post(&p->sem);
388 qemu_mutex_unlock(&p->mutex);
392 int multifd_save_cleanup(Error **errp)
394 int i;
395 int ret = 0;
397 if (!migrate_use_multifd()) {
398 return 0;
400 terminate_multifd_send_threads(NULL);
401 for (i = 0; i < multifd_send_state->count; i++) {
402 MultiFDSendParams *p = &multifd_send_state->params[i];
404 qemu_thread_join(&p->thread);
405 qemu_mutex_destroy(&p->mutex);
406 qemu_sem_destroy(&p->sem);
407 g_free(p->name);
408 p->name = NULL;
410 g_free(multifd_send_state->params);
411 multifd_send_state->params = NULL;
412 g_free(multifd_send_state);
413 multifd_send_state = NULL;
414 return ret;
417 static void *multifd_send_thread(void *opaque)
419 MultiFDSendParams *p = opaque;
421 while (true) {
422 qemu_mutex_lock(&p->mutex);
423 if (p->quit) {
424 qemu_mutex_unlock(&p->mutex);
425 break;
427 qemu_mutex_unlock(&p->mutex);
428 qemu_sem_wait(&p->sem);
431 return NULL;
434 int multifd_save_setup(void)
436 int thread_count;
437 uint8_t i;
439 if (!migrate_use_multifd()) {
440 return 0;
442 thread_count = migrate_multifd_channels();
443 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
444 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
445 multifd_send_state->count = 0;
446 for (i = 0; i < thread_count; i++) {
447 MultiFDSendParams *p = &multifd_send_state->params[i];
449 qemu_mutex_init(&p->mutex);
450 qemu_sem_init(&p->sem, 0);
451 p->quit = false;
452 p->id = i;
453 p->name = g_strdup_printf("multifdsend_%d", i);
454 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
455 QEMU_THREAD_JOINABLE);
457 multifd_send_state->count++;
459 return 0;
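/* Editor's note: at this stage the multifd send threads are skeletons;
 * the loop in multifd_send_thread() only waits on its semaphore and
 * exits when quit is set, no page data is transmitted through them
 * yet. */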
462 struct MultiFDRecvParams {
463 uint8_t id;
464 char *name;
465 QemuThread thread;
466 QemuSemaphore sem;
467 QemuMutex mutex;
468 bool quit;
470 typedef struct MultiFDRecvParams MultiFDRecvParams;
472 struct {
473 MultiFDRecvParams *params;
474 /* number of created threads */
475 int count;
476 } *multifd_recv_state;
478 static void terminate_multifd_recv_threads(Error *errp)
480 int i;
482 for (i = 0; i < multifd_recv_state->count; i++) {
483 MultiFDRecvParams *p = &multifd_recv_state->params[i];
485 qemu_mutex_lock(&p->mutex);
486 p->quit = true;
487 qemu_sem_post(&p->sem);
488 qemu_mutex_unlock(&p->mutex);
492 int multifd_load_cleanup(Error **errp)
494 int i;
495 int ret = 0;
497 if (!migrate_use_multifd()) {
498 return 0;
500 terminate_multifd_recv_threads(NULL);
501 for (i = 0; i < multifd_recv_state->count; i++) {
502 MultiFDRecvParams *p = &multifd_recv_state->params[i];
504 qemu_thread_join(&p->thread);
505 qemu_mutex_destroy(&p->mutex);
506 qemu_sem_destroy(&p->sem);
507 g_free(p->name);
508 p->name = NULL;
510 g_free(multifd_recv_state->params);
511 multifd_recv_state->params = NULL;
512 g_free(multifd_recv_state);
513 multifd_recv_state = NULL;
515 return ret;
518 static void *multifd_recv_thread(void *opaque)
520 MultiFDRecvParams *p = opaque;
522 while (true) {
523 qemu_mutex_lock(&p->mutex);
524 if (p->quit) {
525 qemu_mutex_unlock(&p->mutex);
526 break;
528 qemu_mutex_unlock(&p->mutex);
529 qemu_sem_wait(&p->sem);
532 return NULL;
535 int multifd_load_setup(void)
537 int thread_count;
538 uint8_t i;
540 if (!migrate_use_multifd()) {
541 return 0;
543 thread_count = migrate_multifd_channels();
544 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
545 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
546 multifd_recv_state->count = 0;
547 for (i = 0; i < thread_count; i++) {
548 MultiFDRecvParams *p = &multifd_recv_state->params[i];
550 qemu_mutex_init(&p->mutex);
551 qemu_sem_init(&p->sem, 0);
552 p->quit = false;
553 p->id = i;
554 p->name = g_strdup_printf("multifdrecv_%d", i);
555 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
556 QEMU_THREAD_JOINABLE);
557 multifd_recv_state->count++;
559 return 0;
563 * save_page_header: write page header to wire
565 * If this is the 1st block, it also writes the block identification
567 * Returns the number of bytes written
569 * @f: QEMUFile where to send the data
570 * @block: block that contains the page we want to send
571 * @offset: offset inside the block for the page
572 * in the lower bits, it contains flags
574 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
575 ram_addr_t offset)
577 size_t size, len;
579 if (block == rs->last_sent_block) {
580 offset |= RAM_SAVE_FLAG_CONTINUE;
582 qemu_put_be64(f, offset);
583 size = 8;
585 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
586 len = strlen(block->idstr);
587 qemu_put_byte(f, len);
588 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
589 size += 1 + len;
590 rs->last_sent_block = block;
592 return size;
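/* Wire layout produced by save_page_header() (editor's sketch derived
 * from the code above):
 *   8 bytes : page offset with RAM_SAVE_FLAG_* OR'ed into the low bits
 *   1 byte  : length of the block idstr  \ only when RAM_SAVE_FLAG_CONTINUE
 *   N bytes : block idstr                / is not set (a new block starts)
 */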
596 * mig_throttle_guest_down: throttle down the guest
598 * Reduce amount of guest cpu execution to hopefully slow down memory
599 * writes. If guest dirty memory rate is reduced below the rate at
600 * which we can transfer pages to the destination then we should be
601 * able to complete migration. Some workloads dirty memory way too
602 * fast and will not effectively converge, even with auto-converge.
604 static void mig_throttle_guest_down(void)
606 MigrationState *s = migrate_get_current();
607 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
608 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
610 /* We have not started throttling yet. Let's start it. */
611 if (!cpu_throttle_active()) {
612 cpu_throttle_set(pct_initial);
613 } else {
614 /* Throttling already on, just increase the rate */
615 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
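/* Worked example (editor's note; 20 and 10 are the usual defaults and
 * may differ per configuration): with cpu_throttle_initial=20 and
 * cpu_throttle_increment=10, successive calls set the CPU throttle
 * percentage to 20, then 30, 40, ... until the guest dirties memory
 * slowly enough for migration to converge. */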
620 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
622 * @rs: current RAM state
623 * @current_addr: address for the zero page
625 * Update the xbzrle cache to reflect a page that's been sent as all 0.
626 * The important thing is that a stale (not-yet-0'd) page be replaced
627 * by the new data.
628 * As a bonus, if the page wasn't in the cache it gets added so that
629 * when a small write is made into the 0'd page it gets XBZRLE sent.
631 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
633 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
634 return;
637 /* We don't care if this fails to allocate a new cache page
638 * as long as it updates an old one */
639 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
640 ram_counters.dirty_sync_count);
643 #define ENCODING_FLAG_XBZRLE 0x1
646 * save_xbzrle_page: compress and send current page
648 * Returns: 1 means that we wrote the page
649 * 0 means that page is identical to the one already sent
650 * -1 means that xbzrle would be longer than normal
652 * @rs: current RAM state
653 * @current_data: pointer to the address of the page contents
654 * @current_addr: addr of the page
655 * @block: block that contains the page we want to send
656 * @offset: offset inside the block for the page
657 * @last_stage: if we are at the completion stage
659 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
660 ram_addr_t current_addr, RAMBlock *block,
661 ram_addr_t offset, bool last_stage)
663 int encoded_len = 0, bytes_xbzrle;
664 uint8_t *prev_cached_page;
666 if (!cache_is_cached(XBZRLE.cache, current_addr,
667 ram_counters.dirty_sync_count)) {
668 xbzrle_counters.cache_miss++;
669 if (!last_stage) {
670 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
671 ram_counters.dirty_sync_count) == -1) {
672 return -1;
673 } else {
674 /* update *current_data when the page has been
675 inserted into cache */
676 *current_data = get_cached_data(XBZRLE.cache, current_addr);
679 return -1;
682 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
684 /* save current buffer into memory */
685 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
687 /* XBZRLE encoding (if there is no overflow) */
688 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
689 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
690 TARGET_PAGE_SIZE);
691 if (encoded_len == 0) {
692 trace_save_xbzrle_page_skipping();
693 return 0;
694 } else if (encoded_len == -1) {
695 trace_save_xbzrle_page_overflow();
696 xbzrle_counters.overflow++;
697 /* update data in the cache */
698 if (!last_stage) {
699 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
700 *current_data = prev_cached_page;
702 return -1;
705 /* we need to update the data in the cache, in order to get the same data */
706 if (!last_stage) {
707 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
710 /* Send XBZRLE based compressed page */
711 bytes_xbzrle = save_page_header(rs, rs->f, block,
712 offset | RAM_SAVE_FLAG_XBZRLE);
713 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
714 qemu_put_be16(rs->f, encoded_len);
715 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
716 bytes_xbzrle += encoded_len + 1 + 2;
717 xbzrle_counters.pages++;
718 xbzrle_counters.bytes += bytes_xbzrle;
719 ram_counters.transferred += bytes_xbzrle;
721 return 1;
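/* Accounting note (derived from the code above): bytes_xbzrle covers
 * the page header plus 1 byte for ENCODING_FLAG_XBZRLE, 2 bytes for the
 * encoded length and encoded_len bytes of payload. */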
725 * migration_bitmap_find_dirty: find the next dirty page from start
727 * Called with rcu_read_lock() to protect migration_bitmap
729 * Returns the page number (within the RAMBlock) of the next dirty page
731 * @rs: current RAM state
732 * @rb: RAMBlock where to search for dirty pages
733 * @start: page where we start the search
735 static inline
736 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
737 unsigned long start)
739 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
740 unsigned long *bitmap = rb->bmap;
741 unsigned long next;
743 if (rs->ram_bulk_stage && start > 0) {
744 next = start + 1;
745 } else {
746 next = find_next_bit(bitmap, size, start);
749 return next;
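/* Editor's note: during the bulk stage every page is still assumed
 * dirty, so the "start + 1" shortcut above avoids scanning the bitmap;
 * once the first round completes (ram_bulk_stage == false) the bitmap
 * is searched for real. */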
752 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
753 RAMBlock *rb,
754 unsigned long page)
756 bool ret;
758 ret = test_and_clear_bit(page, rb->bmap);
760 if (ret) {
761 rs->migration_dirty_pages--;
763 return ret;
766 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
767 ram_addr_t start, ram_addr_t length)
769 rs->migration_dirty_pages +=
770 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
771 &rs->num_dirty_pages_period);
775 * ram_pagesize_summary: calculate all the pagesizes of a VM
777 * Returns a summary bitmap of the page sizes of all RAMBlocks
779 * For VMs with just normal pages this is equivalent to the host page
780 * size. If it's got some huge pages then it's the OR of all the
781 * different page sizes.
783 uint64_t ram_pagesize_summary(void)
785 RAMBlock *block;
786 uint64_t summary = 0;
788 RAMBLOCK_FOREACH(block) {
789 summary |= block->page_size;
792 return summary;
795 static void migration_bitmap_sync(RAMState *rs)
797 RAMBlock *block;
798 int64_t end_time;
799 uint64_t bytes_xfer_now;
801 ram_counters.dirty_sync_count++;
803 if (!rs->time_last_bitmap_sync) {
804 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
807 trace_migration_bitmap_sync_start();
808 memory_global_dirty_log_sync();
810 qemu_mutex_lock(&rs->bitmap_mutex);
811 rcu_read_lock();
812 RAMBLOCK_FOREACH(block) {
813 migration_bitmap_sync_range(rs, block, 0, block->used_length);
815 rcu_read_unlock();
816 qemu_mutex_unlock(&rs->bitmap_mutex);
818 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
820 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
822 /* more than 1 second = 1000 milliseconds */
823 if (end_time > rs->time_last_bitmap_sync + 1000) {
824 /* calculate period counters */
825 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
826 / (end_time - rs->time_last_bitmap_sync);
827 bytes_xfer_now = ram_counters.transferred;
829 /* During block migration the auto-converge logic incorrectly detects
830 * that ram migration makes no progress. Avoid this by disabling the
831 * throttling logic during the bulk phase of block migration. */
832 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
833 /* The following detection logic can be refined later. For now:
834    check whether the bytes dirtied in this period exceed half of the
835    bytes that were transferred since the last time we were in this
836    routine. If that happens twice, start or increase
837    throttling */
839 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
840 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
841 (++rs->dirty_rate_high_cnt >= 2)) {
842 trace_migration_throttle();
843 rs->dirty_rate_high_cnt = 0;
844 mig_throttle_guest_down();
848 if (migrate_use_xbzrle()) {
849 if (rs->iterations_prev != rs->iterations) {
850 xbzrle_counters.cache_miss_rate =
851 (double)(xbzrle_counters.cache_miss -
852 rs->xbzrle_cache_miss_prev) /
853 (rs->iterations - rs->iterations_prev);
855 rs->iterations_prev = rs->iterations;
856 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
859 /* reset period counters */
860 rs->time_last_bitmap_sync = end_time;
861 rs->num_dirty_pages_period = 0;
862 rs->bytes_xfer_prev = bytes_xfer_now;
864 if (migrate_use_events()) {
865 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
870 * save_zero_page: send the zero page to the stream
872 * Returns the number of pages written (1 if a zero page was sent, -1 if the page was not all zeros).
874 * @rs: current RAM state
875 * @block: block that contains the page we want to send
876 * @offset: offset inside the block for the page
877 * @p: pointer to the page
879 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
880 uint8_t *p)
882 int pages = -1;
884 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
885 ram_counters.duplicate++;
886 ram_counters.transferred +=
887 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
888 qemu_put_byte(rs->f, 0);
889 ram_counters.transferred += 1;
890 pages = 1;
893 return pages;
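/* Editor's note: a zero page therefore costs only the page header plus
 * a single zero byte on the wire, instead of TARGET_PAGE_SIZE bytes of
 * payload. */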
896 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
898 if (!migrate_release_ram() || !migration_in_postcopy()) {
899 return;
902 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
906 * ram_save_page: send the given page to the stream
908 * Returns the number of pages written.
909 * < 0 - error
910 * >=0 - Number of pages written - this might legally be 0
911 * if xbzrle noticed the page was the same.
913 * @rs: current RAM state
914 * @block: block that contains the page we want to send
915 * @offset: offset inside the block for the page
916 * @last_stage: if we are at the completion stage
918 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
920 int pages = -1;
921 uint64_t bytes_xmit;
922 ram_addr_t current_addr;
923 uint8_t *p;
924 int ret;
925 bool send_async = true;
926 RAMBlock *block = pss->block;
927 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
929 p = block->host + offset;
930 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
932 /* When in doubt, send the page as a normal page */
933 bytes_xmit = 0;
934 ret = ram_control_save_page(rs->f, block->offset,
935 offset, TARGET_PAGE_SIZE, &bytes_xmit);
936 if (bytes_xmit) {
937 ram_counters.transferred += bytes_xmit;
938 pages = 1;
941 XBZRLE_cache_lock();
943 current_addr = block->offset + offset;
945 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
946 if (ret != RAM_SAVE_CONTROL_DELAYED) {
947 if (bytes_xmit > 0) {
948 ram_counters.normal++;
949 } else if (bytes_xmit == 0) {
950 ram_counters.duplicate++;
953 } else {
954 pages = save_zero_page(rs, block, offset, p);
955 if (pages > 0) {
956 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
957 * page would be stale
959 xbzrle_cache_zero_page(rs, current_addr);
960 ram_release_pages(block->idstr, offset, pages);
961 } else if (!rs->ram_bulk_stage &&
962 !migration_in_postcopy() && migrate_use_xbzrle()) {
963 pages = save_xbzrle_page(rs, &p, current_addr, block,
964 offset, last_stage);
965 if (!last_stage) {
966 /* Can't send this cached data async, since the cache page
967 * might get updated before it gets to the wire
969 send_async = false;
974 /* XBZRLE overflow or normal page */
975 if (pages == -1) {
976 ram_counters.transferred +=
977 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
978 if (send_async) {
979 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
980 migrate_release_ram() &
981 migration_in_postcopy());
982 } else {
983 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
985 ram_counters.transferred += TARGET_PAGE_SIZE;
986 pages = 1;
987 ram_counters.normal++;
990 XBZRLE_cache_unlock();
992 return pages;
995 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
996 ram_addr_t offset)
998 RAMState *rs = ram_state;
999 int bytes_sent, blen;
1000 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1002 bytes_sent = save_page_header(rs, f, block, offset |
1003 RAM_SAVE_FLAG_COMPRESS_PAGE);
1004 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1005 migrate_compress_level());
1006 if (blen < 0) {
1007 bytes_sent = 0;
1008 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1009 error_report("compressed data failed!");
1010 } else {
1011 bytes_sent += blen;
1012 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1015 return bytes_sent;
1018 static void flush_compressed_data(RAMState *rs)
1020 int idx, len, thread_count;
1022 if (!migrate_use_compression()) {
1023 return;
1025 thread_count = migrate_compress_threads();
1027 qemu_mutex_lock(&comp_done_lock);
1028 for (idx = 0; idx < thread_count; idx++) {
1029 while (!comp_param[idx].done) {
1030 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1033 qemu_mutex_unlock(&comp_done_lock);
1035 for (idx = 0; idx < thread_count; idx++) {
1036 qemu_mutex_lock(&comp_param[idx].mutex);
1037 if (!comp_param[idx].quit) {
1038 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1039 ram_counters.transferred += len;
1041 qemu_mutex_unlock(&comp_param[idx].mutex);
1045 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1046 ram_addr_t offset)
1048 param->block = block;
1049 param->offset = offset;
1052 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1053 ram_addr_t offset)
1055 int idx, thread_count, bytes_xmit = -1, pages = -1;
1057 thread_count = migrate_compress_threads();
1058 qemu_mutex_lock(&comp_done_lock);
1059 while (true) {
1060 for (idx = 0; idx < thread_count; idx++) {
1061 if (comp_param[idx].done) {
1062 comp_param[idx].done = false;
1063 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1064 qemu_mutex_lock(&comp_param[idx].mutex);
1065 set_compress_params(&comp_param[idx], block, offset);
1066 qemu_cond_signal(&comp_param[idx].cond);
1067 qemu_mutex_unlock(&comp_param[idx].mutex);
1068 pages = 1;
1069 ram_counters.normal++;
1070 ram_counters.transferred += bytes_xmit;
1071 break;
1074 if (pages > 0) {
1075 break;
1076 } else {
1077 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1080 qemu_mutex_unlock(&comp_done_lock);
1082 return pages;
1086 * ram_save_compressed_page: compress the given page and send it to the stream
1088 * Returns the number of pages written.
1090 * @rs: current RAM state
1091 * @block: block that contains the page we want to send
1092 * @offset: offset inside the block for the page
1093 * @last_stage: if we are at the completion stage
1095 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
1096 bool last_stage)
1098 int pages = -1;
1099 uint64_t bytes_xmit = 0;
1100 uint8_t *p;
1101 int ret, blen;
1102 RAMBlock *block = pss->block;
1103 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1105 p = block->host + offset;
1107 ret = ram_control_save_page(rs->f, block->offset,
1108 offset, TARGET_PAGE_SIZE, &bytes_xmit);
1109 if (bytes_xmit) {
1110 ram_counters.transferred += bytes_xmit;
1111 pages = 1;
1113 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1114 if (ret != RAM_SAVE_CONTROL_DELAYED) {
1115 if (bytes_xmit > 0) {
1116 ram_counters.normal++;
1117 } else if (bytes_xmit == 0) {
1118 ram_counters.duplicate++;
1121 } else {
1122 /* When starting to process a new block, the first page of
1123 * the block should be sent out before other pages in the same
1124 * block, and all the pages in the last block should have been sent
1125 * out. Keeping this order is important because the 'cont' flag
1126 * is used to avoid resending the block name.
1128 if (block != rs->last_sent_block) {
1129 flush_compressed_data(rs);
1130 pages = save_zero_page(rs, block, offset, p);
1131 if (pages == -1) {
1132 /* Make sure the first page is sent out before other pages */
1133 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1134 RAM_SAVE_FLAG_COMPRESS_PAGE);
1135 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1136 migrate_compress_level());
1137 if (blen > 0) {
1138 ram_counters.transferred += bytes_xmit + blen;
1139 ram_counters.normal++;
1140 pages = 1;
1141 } else {
1142 qemu_file_set_error(rs->f, blen);
1143 error_report("compressed data failed!");
1146 if (pages > 0) {
1147 ram_release_pages(block->idstr, offset, pages);
1149 } else {
1150 pages = save_zero_page(rs, block, offset, p);
1151 if (pages == -1) {
1152 pages = compress_page_with_multi_thread(rs, block, offset);
1153 } else {
1154 ram_release_pages(block->idstr, offset, pages);
1159 return pages;
1163 * find_dirty_block: find the next dirty page and update any state
1164 * associated with the search process.
1166 * Returns whether a page was found
1168 * @rs: current RAM state
1169 * @pss: data about the state of the current dirty page scan
1170 * @again: set to false if the search has scanned the whole of RAM
1172 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1174 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1175 if (pss->complete_round && pss->block == rs->last_seen_block &&
1176 pss->page >= rs->last_page) {
1178 * We've been once around the RAM and haven't found anything.
1179 * Give up.
1181 *again = false;
1182 return false;
1184 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1185 /* Didn't find anything in this RAM Block */
1186 pss->page = 0;
1187 pss->block = QLIST_NEXT_RCU(pss->block, next);
1188 if (!pss->block) {
1189 /* Hit the end of the list */
1190 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1191 /* Flag that we've looped */
1192 pss->complete_round = true;
1193 rs->ram_bulk_stage = false;
1194 if (migrate_use_xbzrle()) {
1195 /* If xbzrle is on, stop using the data compression at this
1196 * point. In theory, xbzrle can do better than compression.
1198 flush_compressed_data(rs);
1201 /* Didn't find anything this time, but try again on the new block */
1202 *again = true;
1203 return false;
1204 } else {
1205 /* Can go around again, but... */
1206 *again = true;
1207 /* We've found something so probably don't need to */
1208 return true;
1213 * unqueue_page: gets a page off the queue
1215 * Helper for 'get_queued_page' - gets a page off the queue
1217 * Returns the block of the page (or NULL if none available)
1219 * @rs: current RAM state
1220 * @offset: used to return the offset within the RAMBlock
1222 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1224 RAMBlock *block = NULL;
1226 qemu_mutex_lock(&rs->src_page_req_mutex);
1227 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1228 struct RAMSrcPageRequest *entry =
1229 QSIMPLEQ_FIRST(&rs->src_page_requests);
1230 block = entry->rb;
1231 *offset = entry->offset;
1233 if (entry->len > TARGET_PAGE_SIZE) {
1234 entry->len -= TARGET_PAGE_SIZE;
1235 entry->offset += TARGET_PAGE_SIZE;
1236 } else {
1237 memory_region_unref(block->mr);
1238 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1239 g_free(entry);
1242 qemu_mutex_unlock(&rs->src_page_req_mutex);
1244 return block;
1248 * get_queued_page: unqueue a page from the postcopy requests
1250 * Skips pages that are already sent (!dirty)
1252 * Returns whether a queued page was found
1254 * @rs: current RAM state
1255 * @pss: data about the state of the current dirty page scan
1257 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1259 RAMBlock *block;
1260 ram_addr_t offset;
1261 bool dirty;
1263 do {
1264 block = unqueue_page(rs, &offset);
1266 * We're sending this page, and since it's postcopy nothing else
1267 * will dirty it, and we must make sure it doesn't get sent again
1268 * even if this queue request was received after the background
1269 * search already sent it.
1271 if (block) {
1272 unsigned long page;
1274 page = offset >> TARGET_PAGE_BITS;
1275 dirty = test_bit(page, block->bmap);
1276 if (!dirty) {
1277 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1278 page, test_bit(page, block->unsentmap));
1279 } else {
1280 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1284 } while (block && !dirty);
1286 if (block) {
1288 * As soon as we start servicing pages out of order, then we have
1289 * to kill the bulk stage, since the bulk stage assumes
1290 * in (migration_bitmap_find_and_reset_dirty) that every page is
1291 * dirty, that's no longer true.
1293 rs->ram_bulk_stage = false;
1296 * We want the background search to continue from the queued page
1297 * since the guest is likely to want other pages near to the page
1298 * it just requested.
1300 pss->block = block;
1301 pss->page = offset >> TARGET_PAGE_BITS;
1304 return !!block;
1308 * migration_page_queue_free: drop any remaining pages in the ram
1309 * request queue
1311 * It should be empty at the end anyway, but in error cases there may
1312 * be some left. If any page is left, we drop it.
1315 static void migration_page_queue_free(RAMState *rs)
1317 struct RAMSrcPageRequest *mspr, *next_mspr;
1318 /* This queue should generally be empty - but in the case of a failed
1319 * migration it might have some entries left over.
1321 rcu_read_lock();
1322 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1323 memory_region_unref(mspr->rb->mr);
1324 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1325 g_free(mspr);
1327 rcu_read_unlock();
1331 * ram_save_queue_pages: queue the page for transmission
1333 * A request from postcopy destination for example.
1335 * Returns zero on success or negative on error
1337 * @rbname: Name of the RAMBlock of the request. NULL means the
1338 *          same as the last one.
1339 * @start: starting address from the start of the RAMBlock
1340 * @len: length (in bytes) to send
1342 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1344 RAMBlock *ramblock;
1345 RAMState *rs = ram_state;
1347 ram_counters.postcopy_requests++;
1348 rcu_read_lock();
1349 if (!rbname) {
1350 /* Reuse last RAMBlock */
1351 ramblock = rs->last_req_rb;
1353 if (!ramblock) {
1355 * Shouldn't happen, we can't reuse the last RAMBlock if
1356 * it's the 1st request.
1358 error_report("ram_save_queue_pages no previous block");
1359 goto err;
1361 } else {
1362 ramblock = qemu_ram_block_by_name(rbname);
1364 if (!ramblock) {
1365 /* We shouldn't be asked for a non-existent RAMBlock */
1366 error_report("ram_save_queue_pages no block '%s'", rbname);
1367 goto err;
1369 rs->last_req_rb = ramblock;
1371 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1372 if (start+len > ramblock->used_length) {
1373 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1374 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1375 __func__, start, len, ramblock->used_length);
1376 goto err;
1379 struct RAMSrcPageRequest *new_entry =
1380 g_malloc0(sizeof(struct RAMSrcPageRequest));
1381 new_entry->rb = ramblock;
1382 new_entry->offset = start;
1383 new_entry->len = len;
1385 memory_region_ref(ramblock->mr);
1386 qemu_mutex_lock(&rs->src_page_req_mutex);
1387 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1388 qemu_mutex_unlock(&rs->src_page_req_mutex);
1389 rcu_read_unlock();
1391 return 0;
1393 err:
1394 rcu_read_unlock();
1395 return -1;
1399 * ram_save_target_page: save one target page
1401 * Returns the number of pages written
1403 * @rs: current RAM state
1404 * @ms: current migration state
1405 * @pss: data about the page we want to send
1406 * @last_stage: if we are at the completion stage
1408 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1409 bool last_stage)
1411 int res = 0;
1413 /* Check whether the page is dirty and if it is, send it */
1414 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1416 * If xbzrle is on, stop using the data compression after first
1417 * round of migration even if compression is enabled. In theory,
1418 * xbzrle can do better than compression.
1420 if (migrate_use_compression() &&
1421 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1422 res = ram_save_compressed_page(rs, pss, last_stage);
1423 } else {
1424 res = ram_save_page(rs, pss, last_stage);
1427 if (res < 0) {
1428 return res;
1430 if (pss->block->unsentmap) {
1431 clear_bit(pss->page, pss->block->unsentmap);
1435 return res;
1439 * ram_save_host_page: save a whole host page
1441 * Starting at *offset send pages up to the end of the current host
1442 * page. It's valid for the initial offset to point into the middle of
1443 * a host page in which case the remainder of the hostpage is sent.
1444 * Only dirty target pages are sent. Note that the host page size may
1445 * be a huge page for this block.
1446 * The saving stops at the boundary of the used_length of the block
1447 * if the RAMBlock isn't a multiple of the host page size.
1449 * Returns the number of pages written or negative on error
1451 * @rs: current RAM state
1452 * @ms: current migration state
1453 * @pss: data about the page we want to send
1454 * @last_stage: if we are at the completion stage
1456 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1457 bool last_stage)
1459 int tmppages, pages = 0;
1460 size_t pagesize_bits =
1461 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1463 do {
1464 tmppages = ram_save_target_page(rs, pss, last_stage);
1465 if (tmppages < 0) {
1466 return tmppages;
1469 pages += tmppages;
1470 pss->page++;
1471 } while ((pss->page & (pagesize_bits - 1)) &&
1472 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1474 /* The offset we leave with is the last one we looked at */
1475 pss->page--;
1476 return pages;
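/* Example (editor's sketch, sizes are illustrative): for a RAMBlock
 * backed by 2 MiB huge pages with a 4 KiB target page size,
 * pagesize_bits is 512, so a single call can send up to 512 dirty
 * target pages covering one whole host page. */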
1480 * ram_find_and_save_block: finds a dirty page and sends it to f
1482 * Called within an RCU critical section.
1484 * Returns the number of pages written where zero means no dirty pages
1486 * @rs: current RAM state
1487 * @last_stage: if we are at the completion stage
1489 * On systems where host-page-size > target-page-size it will send all the
1490 * pages in a host page that are dirty.
1493 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1495 PageSearchStatus pss;
1496 int pages = 0;
1497 bool again, found;
1499 /* No dirty page as there is zero RAM */
1500 if (!ram_bytes_total()) {
1501 return pages;
1504 pss.block = rs->last_seen_block;
1505 pss.page = rs->last_page;
1506 pss.complete_round = false;
1508 if (!pss.block) {
1509 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1512 do {
1513 again = true;
1514 found = get_queued_page(rs, &pss);
1516 if (!found) {
1517 /* priority queue empty, so just search for something dirty */
1518 found = find_dirty_block(rs, &pss, &again);
1521 if (found) {
1522 pages = ram_save_host_page(rs, &pss, last_stage);
1524 } while (!pages && again);
1526 rs->last_seen_block = pss.block;
1527 rs->last_page = pss.page;
1529 return pages;
1532 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1534 uint64_t pages = size / TARGET_PAGE_SIZE;
1536 if (zero) {
1537 ram_counters.duplicate += pages;
1538 } else {
1539 ram_counters.normal += pages;
1540 ram_counters.transferred += size;
1541 qemu_update_position(f, size);
1545 uint64_t ram_bytes_total(void)
1547 RAMBlock *block;
1548 uint64_t total = 0;
1550 rcu_read_lock();
1551 RAMBLOCK_FOREACH(block) {
1552 total += block->used_length;
1554 rcu_read_unlock();
1555 return total;
1558 static void xbzrle_load_setup(void)
1560 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1563 static void xbzrle_load_cleanup(void)
1565 g_free(XBZRLE.decoded_buf);
1566 XBZRLE.decoded_buf = NULL;
1569 static void ram_save_cleanup(void *opaque)
1571 RAMState **rsp = opaque;
1572 RAMBlock *block;
1574 /* the caller must hold the iothread lock or be in a bottom half, so
1575 * there is no write race against this migration_bitmap
1577 memory_global_dirty_log_stop();
1579 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1580 g_free(block->bmap);
1581 block->bmap = NULL;
1582 g_free(block->unsentmap);
1583 block->unsentmap = NULL;
1586 XBZRLE_cache_lock();
1587 if (XBZRLE.cache) {
1588 cache_fini(XBZRLE.cache);
1589 g_free(XBZRLE.encoded_buf);
1590 g_free(XBZRLE.current_buf);
1591 g_free(XBZRLE.zero_target_page);
1592 XBZRLE.cache = NULL;
1593 XBZRLE.encoded_buf = NULL;
1594 XBZRLE.current_buf = NULL;
1595 XBZRLE.zero_target_page = NULL;
1597 XBZRLE_cache_unlock();
1598 migration_page_queue_free(*rsp);
1599 compress_threads_save_cleanup();
1600 g_free(*rsp);
1601 *rsp = NULL;
1604 static void ram_state_reset(RAMState *rs)
1606 rs->last_seen_block = NULL;
1607 rs->last_sent_block = NULL;
1608 rs->last_page = 0;
1609 rs->last_version = ram_list.version;
1610 rs->ram_bulk_stage = true;
1613 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1616 * 'expected' is the value you expect the bitmap mostly to be full
1617 * of; it won't bother printing lines that are all this value.
1618 * If 'todump' is null the migration bitmap is dumped.
1620 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1621 unsigned long pages)
1623 int64_t cur;
1624 int64_t linelen = 128;
1625 char linebuf[129];
1627 for (cur = 0; cur < pages; cur += linelen) {
1628 int64_t curb;
1629 bool found = false;
1631 * Last line; catch the case where the line length
1632 * is longer than remaining ram
1634 if (cur + linelen > pages) {
1635 linelen = pages - cur;
1637 for (curb = 0; curb < linelen; curb++) {
1638 bool thisbit = test_bit(cur + curb, todump);
1639 linebuf[curb] = thisbit ? '1' : '.';
1640 found = found || (thisbit != expected);
1642 if (found) {
1643 linebuf[curb] = '\0';
1644 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1649 /* **** functions for postcopy ***** */
1651 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1653 struct RAMBlock *block;
1655 RAMBLOCK_FOREACH(block) {
1656 unsigned long *bitmap = block->bmap;
1657 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1658 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1660 while (run_start < range) {
1661 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1662 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1663 (run_end - run_start) << TARGET_PAGE_BITS);
1664 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1670 * postcopy_send_discard_bm_ram: discard a RAMBlock
1672 * Returns zero on success
1674 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1675 * Note: At this point the 'unsentmap' is the processed bitmap combined
1676 * with the dirtymap; so a '1' means it's either dirty or unsent.
1678 * @ms: current migration state
1679 * @pds: state for postcopy
1680 * @start: RAMBlock starting page
1681 * @length: RAMBlock size
1683 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1684 PostcopyDiscardState *pds,
1685 RAMBlock *block)
1687 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1688 unsigned long current;
1689 unsigned long *unsentmap = block->unsentmap;
1691 for (current = 0; current < end; ) {
1692 unsigned long one = find_next_bit(unsentmap, end, current);
1694 if (one <= end) {
1695 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1696 unsigned long discard_length;
1698 if (zero >= end) {
1699 discard_length = end - one;
1700 } else {
1701 discard_length = zero - one;
1703 if (discard_length) {
1704 postcopy_discard_send_range(ms, pds, one, discard_length);
1706 current = one + discard_length;
1707 } else {
1708 current = one;
1712 return 0;
1716 * postcopy_each_ram_send_discard: discard all RAMBlocks
1718 * Returns 0 for success or negative for error
1720 * Utility for the outgoing postcopy code.
1721 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1722 * passing it bitmap indexes and name.
1723 * (qemu_ram_foreach_block ends up passing unscaled lengths
1724 * which would mean postcopy code would have to deal with target page)
1726 * @ms: current migration state
1728 static int postcopy_each_ram_send_discard(MigrationState *ms)
1730 struct RAMBlock *block;
1731 int ret;
1733 RAMBLOCK_FOREACH(block) {
1734 PostcopyDiscardState *pds =
1735 postcopy_discard_send_init(ms, block->idstr);
1738 * Postcopy sends chunks of bitmap over the wire, but it
1739 * just needs indexes at this point, avoids it having
1740 * target page specific code.
1742 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1743 postcopy_discard_send_finish(ms, pds);
1744 if (ret) {
1745 return ret;
1749 return 0;
1753 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1755 * Helper for postcopy_chunk_hostpages; it's called twice to
1756 * canonicalize the two bitmaps, which are similar but one is
1757 * inverted.
1759 * Postcopy requires that all target pages in a hostpage are dirty or
1760 * clean, not a mix. This function canonicalizes the bitmaps.
1762 * @ms: current migration state
1763 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1764 * otherwise we need to canonicalize partially dirty host pages
1765 * @block: block that contains the page we want to canonicalize
1766 * @pds: state for postcopy
1768 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1769 RAMBlock *block,
1770 PostcopyDiscardState *pds)
1772 RAMState *rs = ram_state;
1773 unsigned long *bitmap = block->bmap;
1774 unsigned long *unsentmap = block->unsentmap;
1775 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1776 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1777 unsigned long run_start;
1779 if (block->page_size == TARGET_PAGE_SIZE) {
1780 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1781 return;
1784 if (unsent_pass) {
1785 /* Find a sent page */
1786 run_start = find_next_zero_bit(unsentmap, pages, 0);
1787 } else {
1788 /* Find a dirty page */
1789 run_start = find_next_bit(bitmap, pages, 0);
1792 while (run_start < pages) {
1793 bool do_fixup = false;
1794 unsigned long fixup_start_addr;
1795 unsigned long host_offset;
1798 * If the start of this run of pages is in the middle of a host
1799 * page, then we need to fixup this host page.
1801 host_offset = run_start % host_ratio;
1802 if (host_offset) {
1803 do_fixup = true;
1804 run_start -= host_offset;
1805 fixup_start_addr = run_start;
1806 /* For the next pass */
1807 run_start = run_start + host_ratio;
1808 } else {
1809 /* Find the end of this run */
1810 unsigned long run_end;
1811 if (unsent_pass) {
1812 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1813 } else {
1814 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1817 * If the end isn't at the start of a host page, then the
1818 * run doesn't finish at the end of a host page
1819 * and we need to discard.
1821 host_offset = run_end % host_ratio;
1822 if (host_offset) {
1823 do_fixup = true;
1824 fixup_start_addr = run_end - host_offset;
1826 * This host page has gone, the next loop iteration starts
1827 * from after the fixup
1829 run_start = fixup_start_addr + host_ratio;
1830 } else {
1832 * No discards on this iteration, next loop starts from
1833 * next sent/dirty page
1835 run_start = run_end + 1;
1839 if (do_fixup) {
1840 unsigned long page;
1842 /* Tell the destination to discard this page */
1843 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1844 /* For the unsent_pass we:
1845 * discard partially sent pages
1846 * For the !unsent_pass (dirty) we:
1847 * discard partially dirty pages that were sent
1848 * (any partially sent pages were already discarded
1849 * by the previous unsent_pass)
1851 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1852 host_ratio);
1855 /* Clean up the bitmap */
1856 for (page = fixup_start_addr;
1857 page < fixup_start_addr + host_ratio; page++) {
1858 /* All pages in this host page are now not sent */
1859 set_bit(page, unsentmap);
1862 * Remark them as dirty, updating the count for any pages
1863 * that weren't previously dirty.
1865 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1869 if (unsent_pass) {
1870 /* Find the next sent page for the next iteration */
1871 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1872 } else {
1873 /* Find the next dirty page for the next iteration */
1874 run_start = find_next_bit(bitmap, pages, run_start);
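/* Example (editor's sketch, sizes are illustrative): with 2 MiB host
 * pages and 4 KiB target pages, host_ratio is 512. A run starting at
 * target page 1000 begins in the middle of a host page
 * (1000 % 512 == 488), so it is rounded down to page 512 and the whole
 * host page (target pages 512..1023) is treated as a unit: re-marked
 * unsent and dirty, and discarded on the destination if needed. */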
1880 * postcopy_chunk_hostpages: discard any partially sent host page
1882 * Utility for the outgoing postcopy code.
1884 * Discard any partially sent host-page size chunks, mark any partially
1885 * dirty host-page size chunks as all dirty. In this case the host-page
1886 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1888 * Returns zero on success
1890 * @ms: current migration state
1891 * @block: block we want to work with
1893 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1895 PostcopyDiscardState *pds =
1896 postcopy_discard_send_init(ms, block->idstr);
1898 /* First pass: Discard all partially sent host pages */
1899 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1901 * Second pass: Ensure that all partially dirty host pages are made
1902 * fully dirty.
1904 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1906 postcopy_discard_send_finish(ms, pds);
1907 return 0;
1911 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1913 * Returns zero on success
1915 * Transmit the set of pages to be discarded after precopy to the target;
1916 * these are pages that:
1917 * a) Have been previously transmitted but are now dirty again
1918 * b) Have never been transmitted; this ensures that
1919 *    any pages on the destination that have been mapped by background
1920 *    tasks get discarded (transparent huge pages are the specific concern)
1921 * Hopefully this is pretty sparse
1923 * @ms: current migration state
1925 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1927 RAMState *rs = ram_state;
1928 RAMBlock *block;
1929 int ret;
1931 rcu_read_lock();
1933 /* This should be our last sync, the src is now paused */
1934 migration_bitmap_sync(rs);
1936 /* Easiest way to make sure we don't resume in the middle of a host-page */
1937 rs->last_seen_block = NULL;
1938 rs->last_sent_block = NULL;
1939 rs->last_page = 0;
1941 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1942 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1943 unsigned long *bitmap = block->bmap;
1944 unsigned long *unsentmap = block->unsentmap;
1946 if (!unsentmap) {
1947 /* We don't have a safe way to resize the sentmap, so
1948 * if the bitmap was resized it will be NULL at this
1949 * point.
1951 error_report("migration ram resized during precopy phase");
1952 rcu_read_unlock();
1953 return -EINVAL;
1955 /* Deal with TPS != HPS and huge pages */
1956 ret = postcopy_chunk_hostpages(ms, block);
1957 if (ret) {
1958 rcu_read_unlock();
1959 return ret;
1963 * Update the unsentmap to be unsentmap = unsentmap | dirty
1965 bitmap_or(unsentmap, unsentmap, bitmap, pages);
1966 #ifdef DEBUG_POSTCOPY
1967 ram_debug_dump_bitmap(unsentmap, true, pages);
1968 #endif
1970 trace_ram_postcopy_send_discard_bitmap();
1972 ret = postcopy_each_ram_send_discard(ms);
1973 rcu_read_unlock();
1975 return ret;
1979 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1981 * Returns zero on success
1983 * @rbname: name of the RAMBlock of the request. NULL means the
1984 *          same as the last one.
1985 * @start: RAMBlock starting page
1986 * @length: RAMBlock size
1988 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1990 int ret = -1;
1992 trace_ram_discard_range(rbname, start, length);
1994 rcu_read_lock();
1995 RAMBlock *rb = qemu_ram_block_by_name(rbname);
1997 if (!rb) {
1998 error_report("ram_discard_range: Failed to find block '%s'", rbname);
1999 goto err;
2002 ret = ram_block_discard_range(rb, start, length);
2004 err:
2005 rcu_read_unlock();
2007 return ret;
2010 static int ram_state_init(RAMState **rsp)
2012 *rsp = g_new0(RAMState, 1);
2014 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2015 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2016 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2018 if (migrate_use_xbzrle()) {
2019 XBZRLE_cache_lock();
2020 XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
2021 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
2022 TARGET_PAGE_SIZE,
2023 TARGET_PAGE_SIZE);
2024 if (!XBZRLE.cache) {
2025 XBZRLE_cache_unlock();
2026 error_report("Error creating cache");
2027 g_free(*rsp);
2028 *rsp = NULL;
2029 return -1;
2031 XBZRLE_cache_unlock();
2033 /* We prefer not to abort if there is no memory */
2034 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2035 if (!XBZRLE.encoded_buf) {
2036 error_report("Error allocating encoded_buf");
2037 g_free(*rsp);
2038 *rsp = NULL;
2039 return -1;
2042 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2043 if (!XBZRLE.current_buf) {
2044 error_report("Error allocating current_buf");
2045 g_free(XBZRLE.encoded_buf);
2046 XBZRLE.encoded_buf = NULL;
2047 g_free(*rsp);
2048 *rsp = NULL;
2049 return -1;
2053 /* For memory_global_dirty_log_start below. */
2054 qemu_mutex_lock_iothread();
2056 qemu_mutex_lock_ramlist();
2057 rcu_read_lock();
2058 ram_state_reset(*rsp);
2060 /* Skip setting bitmap if there is no RAM */
2061 if (ram_bytes_total()) {
2062 RAMBlock *block;
2064 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2065 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
2067 block->bmap = bitmap_new(pages);
2068 bitmap_set(block->bmap, 0, pages);
2069 if (migrate_postcopy_ram()) {
2070 block->unsentmap = bitmap_new(pages);
2071 bitmap_set(block->unsentmap, 0, pages);
2077 * Count the total number of pages used by ram blocks not including any
2078 * gaps due to alignment or unplugs.
2080 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2082 memory_global_dirty_log_start();
2083 migration_bitmap_sync(*rsp);
2084 qemu_mutex_unlock_ramlist();
2085 qemu_mutex_unlock_iothread();
2086 rcu_read_unlock();
2088 return 0;
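/*
 * Editor's note (not part of the original source): ram_state_init() above
 * allocates the RAMState, sets up its mutexes and the XBZRLE cache (when
 * enabled), gives every RAMBlock a fully-set dirty bitmap sized
 * max_length >> TARGET_PAGE_BITS (plus an unsentmap when postcopy is on),
 * seeds migration_dirty_pages from ram_bytes_total(), and then starts
 * dirty logging followed by an initial bitmap sync.
 */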
2092 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2093 * long-running RCU critical section. When RCU reclaims in the code
2094 * start to become numerous it will be necessary to reduce the
2095 * granularity of these critical sections.
2099 * ram_save_setup: Setup RAM for migration
2101 * Returns zero to indicate success and negative for error
2103 * @f: QEMUFile where to send the data
2104 * @opaque: RAMState pointer
2106 static int ram_save_setup(QEMUFile *f, void *opaque)
2108 RAMState **rsp = opaque;
2109 RAMBlock *block;
2111 /* migration has already setup the bitmap, reuse it. */
2112 if (!migration_in_colo_state()) {
2113 if (ram_state_init(rsp) != 0) {
2114 return -1;
2117 (*rsp)->f = f;
2119 rcu_read_lock();
2121 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2123 RAMBLOCK_FOREACH(block) {
2124 qemu_put_byte(f, strlen(block->idstr));
2125 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2126 qemu_put_be64(f, block->used_length);
2127 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2128 qemu_put_be64(f, block->page_size);
2132 rcu_read_unlock();
2133 compress_threads_save_setup();
2135 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2136 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2138 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2140 return 0;
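/*
 * Editor's sketch of the setup section emitted above (field order as
 * written by the code; informal, not a normative format description):
 *
 *     be64  ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE
 *     per RAMBlock:
 *         u8    strlen(idstr)
 *         bytes idstr
 *         be64  used_length
 *         be64  page_size   (only if postcopy is enabled and the block's
 *                            page size differs from the host page size)
 *     be64  RAM_SAVE_FLAG_EOS
 */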
2144 * ram_save_iterate: iterative stage for migration
2146 * Returns zero to indicate success and negative for error
2148 * @f: QEMUFile where to send the data
2149 * @opaque: RAMState pointer
2151 static int ram_save_iterate(QEMUFile *f, void *opaque)
2153 RAMState **temp = opaque;
2154 RAMState *rs = *temp;
2155 int ret;
2156 int i;
2157 int64_t t0;
2158 int done = 0;
2160 rcu_read_lock();
2161 if (ram_list.version != rs->last_version) {
2162 ram_state_reset(rs);
2165 /* Read version before ram_list.blocks */
2166 smp_rmb();
2168 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2170 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2171 i = 0;
2172 while ((ret = qemu_file_rate_limit(f)) == 0) {
2173 int pages;
2175 pages = ram_find_and_save_block(rs, false);
2176 /* no more pages to send */
2177 if (pages == 0) {
2178 done = 1;
2179 break;
2181 rs->iterations++;
2183 /* we want to check in the 1st loop, just in case it was the 1st time
2184 and we had to sync the dirty bitmap.
2185 qemu_clock_get_ns() is a bit expensive, so we only check every few
2186 iterations
2188 if ((i & 63) == 0) {
2189 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2190 if (t1 > MAX_WAIT) {
2191 trace_ram_save_iterate_big_wait(t1, i);
2192 break;
2195 i++;
2197 flush_compressed_data(rs);
2198 rcu_read_unlock();
2201 * Must occur before EOS (or any QEMUFile operation)
2202 * because of RDMA protocol.
2204 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2206 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2207 ram_counters.transferred += 8;
2209 ret = qemu_file_get_error(f);
2210 if (ret < 0) {
2211 return ret;
2214 return done;
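/*
 * Editor's note (not part of the original source): the loop above runs
 * until the QEMUFile rate limit kicks in or no dirty page is left
 * (done = 1).  Because qemu_clock_get_ns() is relatively expensive it is
 * sampled only every 64 iterations ((i & 63) == 0), and the round is cut
 * short once more than MAX_WAIT milliseconds have elapsed.
 */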
2218 * ram_save_complete: function called to send the remaining amount of ram
2220 * Returns zero to indicate success
2222 * Called with iothread lock
2224 * @f: QEMUFile where to send the data
2225 * @opaque: RAMState pointer
2227 static int ram_save_complete(QEMUFile *f, void *opaque)
2229 RAMState **temp = opaque;
2230 RAMState *rs = *temp;
2232 rcu_read_lock();
2234 if (!migration_in_postcopy()) {
2235 migration_bitmap_sync(rs);
2238 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2240 /* try transferring iterative blocks of memory */
2242 /* flush all remaining blocks regardless of rate limiting */
2243 while (true) {
2244 int pages;
2246 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2248 /* no more blocks to send */
2248 if (pages == 0) {
2249 break;
2253 flush_compressed_data(rs);
2254 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2256 rcu_read_unlock();
2258 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2260 return 0;
2263 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2264 uint64_t *non_postcopiable_pending,
2265 uint64_t *postcopiable_pending)
2267 RAMState **temp = opaque;
2268 RAMState *rs = *temp;
2269 uint64_t remaining_size;
2271 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2273 if (!migration_in_postcopy() &&
2274 remaining_size < max_size) {
2275 qemu_mutex_lock_iothread();
2276 rcu_read_lock();
2277 migration_bitmap_sync(rs);
2278 rcu_read_unlock();
2279 qemu_mutex_unlock_iothread();
2280 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2283 if (migrate_postcopy_ram()) {
2284 /* We can do postcopy, and all the data is postcopiable */
2285 *postcopiable_pending += remaining_size;
2286 } else {
2287 *non_postcopiable_pending += remaining_size;
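/*
 * Editor's example (hypothetical numbers, not part of the original
 * source): with 25,000 dirty pages and a 4 KiB TARGET_PAGE_SIZE the
 * reported pending size is 25,000 * 4096 = 102,400,000 bytes.  If that is
 * already below max_size and we are not in postcopy, the dirty bitmap is
 * re-synced under the iothread lock to refresh the estimate before
 * deciding whether migration can complete.
 */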
2291 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2293 unsigned int xh_len;
2294 int xh_flags;
2295 uint8_t *loaded_data;
2297 /* extract RLE header */
2298 xh_flags = qemu_get_byte(f);
2299 xh_len = qemu_get_be16(f);
2301 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2302 error_report("Failed to load XBZRLE page - wrong compression!");
2303 return -1;
2306 if (xh_len > TARGET_PAGE_SIZE) {
2307 error_report("Failed to load XBZRLE page - len overflow!");
2308 return -1;
2310 loaded_data = XBZRLE.decoded_buf;
2311 /* load data and decode */
2312 /* it can change loaded_data to point to an internal buffer */
2313 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2315 /* decode RLE */
2316 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2317 TARGET_PAGE_SIZE) == -1) {
2318 error_report("Failed to load XBZRLE page - decode error!");
2319 return -1;
2322 return 0;
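/*
 * Editor's sketch of the XBZRLE page encoding read above (informal):
 *
 *     u8    xh_flags   must equal ENCODING_FLAG_XBZRLE
 *     be16  xh_len     must not exceed TARGET_PAGE_SIZE
 *     bytes xh_len bytes of encoded delta, applied to the existing
 *           contents of the page at @host by xbzrle_decode_buffer()
 */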
2326 * ram_block_from_stream: read a RAMBlock id from the migration stream
2328 * Must be called from within a rcu critical section.
2330 * Returns a pointer from within the RCU-protected ram_list.
2332 * @f: QEMUFile where to read the data from
2333 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2335 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2337 static RAMBlock *block = NULL;
2338 char id[256];
2339 uint8_t len;
2341 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2342 if (!block) {
2343 error_report("Ack, bad migration stream!");
2344 return NULL;
2346 return block;
2349 len = qemu_get_byte(f);
2350 qemu_get_buffer(f, (uint8_t *)id, len);
2351 id[len] = 0;
2353 block = qemu_ram_block_by_name(id);
2354 if (!block) {
2355 error_report("Can't find block %s", id);
2356 return NULL;
2359 return block;
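/*
 * Editor's note (not part of the original source): the static 'block'
 * pointer caches the last RAMBlock seen in the stream, so pages flagged
 * RAM_SAVE_FLAG_CONTINUE can skip re-sending the block id; otherwise a
 * length-prefixed id string is read and resolved with
 * qemu_ram_block_by_name().
 */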
2362 static inline void *host_from_ram_block_offset(RAMBlock *block,
2363 ram_addr_t offset)
2365 if (!offset_in_ramblock(block, offset)) {
2366 return NULL;
2369 return block->host + offset;
2373 * ram_handle_compressed: handle the zero page case
2375 * If a page (or a whole RDMA chunk) has been
2376 * determined to be zero, then zap it.
2378 * @host: host address for the zero page
2379 * @ch: the byte the page is filled with; only zero is supported
2380 * @size: size of the zero page
2382 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2384 if (ch != 0 || !is_zero_range(host, size)) {
2385 memset(host, ch, size);
2389 static void *do_data_decompress(void *opaque)
2391 DecompressParam *param = opaque;
2392 unsigned long pagesize;
2393 uint8_t *des;
2394 int len;
2396 qemu_mutex_lock(&param->mutex);
2397 while (!param->quit) {
2398 if (param->des) {
2399 des = param->des;
2400 len = param->len;
2401 param->des = 0;
2402 qemu_mutex_unlock(&param->mutex);
2404 pagesize = TARGET_PAGE_SIZE;
2405 /* uncompress() can fail in some cases, especially when the
2406 * page was dirtied while being compressed. That is not a
2407 * problem, because the dirty page will be retransferred and
2408 * uncompress() won't corrupt the data in other pages.
2410 uncompress((Bytef *)des, &pagesize,
2411 (const Bytef *)param->compbuf, len);
2413 qemu_mutex_lock(&decomp_done_lock);
2414 param->done = true;
2415 qemu_cond_signal(&decomp_done_cond);
2416 qemu_mutex_unlock(&decomp_done_lock);
2418 qemu_mutex_lock(&param->mutex);
2419 } else {
2420 qemu_cond_wait(&param->cond, &param->mutex);
2423 qemu_mutex_unlock(&param->mutex);
2425 return NULL;
2428 static void wait_for_decompress_done(void)
2430 int idx, thread_count;
2432 if (!migrate_use_compression()) {
2433 return;
2436 thread_count = migrate_decompress_threads();
2437 qemu_mutex_lock(&decomp_done_lock);
2438 for (idx = 0; idx < thread_count; idx++) {
2439 while (!decomp_param[idx].done) {
2440 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2443 qemu_mutex_unlock(&decomp_done_lock);
2446 static void compress_threads_load_setup(void)
2448 int i, thread_count;
2450 if (!migrate_use_compression()) {
2451 return;
2453 thread_count = migrate_decompress_threads();
2454 decompress_threads = g_new0(QemuThread, thread_count);
2455 decomp_param = g_new0(DecompressParam, thread_count);
2456 qemu_mutex_init(&decomp_done_lock);
2457 qemu_cond_init(&decomp_done_cond);
2458 for (i = 0; i < thread_count; i++) {
2459 qemu_mutex_init(&decomp_param[i].mutex);
2460 qemu_cond_init(&decomp_param[i].cond);
2461 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2462 decomp_param[i].done = true;
2463 decomp_param[i].quit = false;
2464 qemu_thread_create(decompress_threads + i, "decompress",
2465 do_data_decompress, decomp_param + i,
2466 QEMU_THREAD_JOINABLE);
2470 static void compress_threads_load_cleanup(void)
2472 int i, thread_count;
2474 if (!migrate_use_compression()) {
2475 return;
2477 thread_count = migrate_decompress_threads();
2478 for (i = 0; i < thread_count; i++) {
2479 qemu_mutex_lock(&decomp_param[i].mutex);
2480 decomp_param[i].quit = true;
2481 qemu_cond_signal(&decomp_param[i].cond);
2482 qemu_mutex_unlock(&decomp_param[i].mutex);
2484 for (i = 0; i < thread_count; i++) {
2485 qemu_thread_join(decompress_threads + i);
2486 qemu_mutex_destroy(&decomp_param[i].mutex);
2487 qemu_cond_destroy(&decomp_param[i].cond);
2488 g_free(decomp_param[i].compbuf);
2490 g_free(decompress_threads);
2491 g_free(decomp_param);
2492 decompress_threads = NULL;
2493 decomp_param = NULL;
2496 static void decompress_data_with_multi_threads(QEMUFile *f,
2497 void *host, int len)
2499 int idx, thread_count;
2501 thread_count = migrate_decompress_threads();
2502 qemu_mutex_lock(&decomp_done_lock);
2503 while (true) {
2504 for (idx = 0; idx < thread_count; idx++) {
2505 if (decomp_param[idx].done) {
2506 decomp_param[idx].done = false;
2507 qemu_mutex_lock(&decomp_param[idx].mutex);
2508 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2509 decomp_param[idx].des = host;
2510 decomp_param[idx].len = len;
2511 qemu_cond_signal(&decomp_param[idx].cond);
2512 qemu_mutex_unlock(&decomp_param[idx].mutex);
2513 break;
2516 if (idx < thread_count) {
2517 break;
2518 } else {
2519 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2522 qemu_mutex_unlock(&decomp_done_lock);
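/*
 * Editor's note (not part of the original source): the handshake with
 * do_data_decompress() works as follows: this function looks for a worker
 * whose 'done' flag is set, clears it, copies the compressed data into its
 * compbuf, fills in des/len and signals the worker's condition variable;
 * if every worker is busy it waits on decomp_done_cond.  The worker
 * decompresses into 'des' and sets 'done' again under decomp_done_lock,
 * waking this function up.
 */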
2526 * ram_load_setup: Setup RAM for migration incoming side
2528 * Returns zero to indicate success and negative for error
2530 * @f: QEMUFile where to receive the data
2531 * @opaque: RAMState pointer
2533 static int ram_load_setup(QEMUFile *f, void *opaque)
2535 xbzrle_load_setup();
2536 compress_threads_load_setup();
2537 return 0;
2540 static int ram_load_cleanup(void *opaque)
2542 xbzrle_load_cleanup();
2543 compress_threads_load_cleanup();
2544 return 0;
2548 * ram_postcopy_incoming_init: allocate postcopy data structures
2550 * Returns 0 for success and negative if there was an error
2552 * @mis: current migration incoming state
2554 * Allocate data structures etc. needed by incoming migration with
2555 * postcopy-ram. postcopy-ram's similarly named
2556 * postcopy_ram_incoming_init does the work.
2558 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2560 unsigned long ram_pages = last_ram_page();
2562 return postcopy_ram_incoming_init(mis, ram_pages);
2566 * ram_load_postcopy: load a page in postcopy case
2568 * Returns 0 for success or -errno in case of error
2570 * Called in postcopy mode by ram_load().
2571 * rcu_read_lock is taken prior to this being called.
2573 * @f: QEMUFile to receive the data from
2575 static int ram_load_postcopy(QEMUFile *f)
2577 int flags = 0, ret = 0;
2578 bool place_needed = false;
2579 bool matching_page_sizes = false;
2580 MigrationIncomingState *mis = migration_incoming_get_current();
2581 /* Temporary page that is later 'placed' */
2582 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2583 void *last_host = NULL;
2584 bool all_zero = false;
2586 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2587 ram_addr_t addr;
2588 void *host = NULL;
2589 void *page_buffer = NULL;
2590 void *place_source = NULL;
2591 RAMBlock *block = NULL;
2592 uint8_t ch;
2594 addr = qemu_get_be64(f);
2595 flags = addr & ~TARGET_PAGE_MASK;
2596 addr &= TARGET_PAGE_MASK;
2598 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2599 place_needed = false;
2600 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2601 block = ram_block_from_stream(f, flags);
2603 host = host_from_ram_block_offset(block, addr);
2604 if (!host) {
2605 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2606 ret = -EINVAL;
2607 break;
2609 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2611 * Postcopy requires that we place whole host pages atomically;
2612 * these may be huge pages for RAMBlocks that are backed by
2613 * hugetlbfs.
2614 * To make it atomic, the data is read into a temporary page
2615 * that's moved into place later.
2616 * The migration protocol uses, possibly smaller, target pages;
2617 * however, the source ensures it always sends all the components
2618 * of a host page in order.
2620 page_buffer = postcopy_host_page +
2621 ((uintptr_t)host & (block->page_size - 1));
2622 /* 1st TP of the HP: if all TPs end up zero we can optimise the place */
2623 if (!((uintptr_t)host & (block->page_size - 1))) {
2624 all_zero = true;
2625 } else {
2626 /* not the 1st TP within the HP */
2627 if (host != (last_host + TARGET_PAGE_SIZE)) {
2628 error_report("Non-sequential target page %p/%p",
2629 host, last_host);
2630 ret = -EINVAL;
2631 break;
2637 * If it's the last part of a host page then we place the host
2638 * page
2640 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2641 (block->page_size - 1)) == 0;
2642 place_source = postcopy_host_page;
2644 last_host = host;
2646 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2647 case RAM_SAVE_FLAG_ZERO:
2648 ch = qemu_get_byte(f);
2649 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2650 if (ch) {
2651 all_zero = false;
2653 break;
2655 case RAM_SAVE_FLAG_PAGE:
2656 all_zero = false;
2657 if (!place_needed || !matching_page_sizes) {
2658 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2659 } else {
2660 /* Avoid the qemu_file copy during postcopy, since the data is
2661 * copied again when the page is placed; only possible when we
2662 * do this read in one go (matching page sizes)
2664 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2665 TARGET_PAGE_SIZE);
2667 break;
2668 case RAM_SAVE_FLAG_EOS:
2669 /* normal exit */
2670 break;
2671 default:
2672 error_report("Unknown combination of migration flags: %#x"
2673 " (postcopy mode)", flags);
2674 ret = -EINVAL;
2677 if (place_needed) {
2678 /* This gets called at the last target page in the host page */
2679 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2681 if (all_zero) {
2682 ret = postcopy_place_page_zero(mis, place_dest,
2683 block->page_size);
2684 } else {
2685 ret = postcopy_place_page(mis, place_dest,
2686 place_source, block->page_size);
2689 if (!ret) {
2690 ret = qemu_file_get_error(f);
2694 return ret;
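/*
 * Editor's example (hypothetical sizes, not part of the original source):
 * for a hugetlbfs-backed block with a 2 MiB page size and a 4 KiB
 * TARGET_PAGE_SIZE, 512 consecutive target pages are accumulated into the
 * temporary postcopy_host_page; only when the last one has been read does
 * place_needed become true, and the whole 2 MiB host page is then placed
 * atomically with postcopy_place_page() (or postcopy_place_page_zero() if
 * every target page was zero).
 */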
2697 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2699 int flags = 0, ret = 0, invalid_flags = 0;
2700 static uint64_t seq_iter;
2701 int len = 0;
2703 * If the system is running in postcopy mode, page inserts into host
2704 * memory must be atomic
2706 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2707 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
2708 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2710 seq_iter++;
2712 if (version_id != 4) {
2713 ret = -EINVAL;
2716 if (!migrate_use_compression()) {
2717 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2719 /* This RCU critical section can be very long running.
2720 * When RCU reclaims in the code start to become numerous,
2721 * it will be necessary to reduce the granularity of this
2722 * critical section.
2724 rcu_read_lock();
2726 if (postcopy_running) {
2727 ret = ram_load_postcopy(f);
2730 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2731 ram_addr_t addr, total_ram_bytes;
2732 void *host = NULL;
2733 uint8_t ch;
2735 addr = qemu_get_be64(f);
2736 flags = addr & ~TARGET_PAGE_MASK;
2737 addr &= TARGET_PAGE_MASK;
2739 if (flags & invalid_flags) {
2740 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2741 error_report("Received an unexpected compressed page");
2744 ret = -EINVAL;
2745 break;
2748 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2749 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2750 RAMBlock *block = ram_block_from_stream(f, flags);
2752 host = host_from_ram_block_offset(block, addr);
2753 if (!host) {
2754 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2755 ret = -EINVAL;
2756 break;
2758 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2761 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2762 case RAM_SAVE_FLAG_MEM_SIZE:
2763 /* Synchronize RAM block list */
2764 total_ram_bytes = addr;
2765 while (!ret && total_ram_bytes) {
2766 RAMBlock *block;
2767 char id[256];
2768 ram_addr_t length;
2770 len = qemu_get_byte(f);
2771 qemu_get_buffer(f, (uint8_t *)id, len);
2772 id[len] = 0;
2773 length = qemu_get_be64(f);
2775 block = qemu_ram_block_by_name(id);
2776 if (block) {
2777 if (length != block->used_length) {
2778 Error *local_err = NULL;
2780 ret = qemu_ram_resize(block, length,
2781 &local_err);
2782 if (local_err) {
2783 error_report_err(local_err);
2786 /* For postcopy we need to check hugepage sizes match */
2787 if (postcopy_advised &&
2788 block->page_size != qemu_host_page_size) {
2789 uint64_t remote_page_size = qemu_get_be64(f);
2790 if (remote_page_size != block->page_size) {
2791 error_report("Mismatched RAM page size %s "
2792 "(local) %zd != %" PRId64,
2793 id, block->page_size,
2794 remote_page_size);
2795 ret = -EINVAL;
2798 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2799 block->idstr);
2800 } else {
2801 error_report("Unknown ramblock \"%s\", cannot "
2802 "accept migration", id);
2803 ret = -EINVAL;
2806 total_ram_bytes -= length;
2808 break;
2810 case RAM_SAVE_FLAG_ZERO:
2811 ch = qemu_get_byte(f);
2812 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2813 break;
2815 case RAM_SAVE_FLAG_PAGE:
2816 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2817 break;
2819 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2820 len = qemu_get_be32(f);
2821 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2822 error_report("Invalid compressed data length: %d", len);
2823 ret = -EINVAL;
2824 break;
2826 decompress_data_with_multi_threads(f, host, len);
2827 break;
2829 case RAM_SAVE_FLAG_XBZRLE:
2830 if (load_xbzrle(f, addr, host) < 0) {
2831 error_report("Failed to decompress XBZRLE page at "
2832 RAM_ADDR_FMT, addr);
2833 ret = -EINVAL;
2834 break;
2836 break;
2837 case RAM_SAVE_FLAG_EOS:
2838 /* normal exit */
2839 break;
2840 default:
2841 if (flags & RAM_SAVE_FLAG_HOOK) {
2842 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2843 } else {
2844 error_report("Unknown combination of migration flags: %#x",
2845 flags);
2846 ret = -EINVAL;
2849 if (!ret) {
2850 ret = qemu_file_get_error(f);
2854 wait_for_decompress_done();
2855 rcu_read_unlock();
2856 trace_ram_load_complete(ret, seq_iter);
2857 return ret;
2860 static bool ram_has_postcopy(void *opaque)
2862 return migrate_postcopy_ram();
2865 static SaveVMHandlers savevm_ram_handlers = {
2866 .save_setup = ram_save_setup,
2867 .save_live_iterate = ram_save_iterate,
2868 .save_live_complete_postcopy = ram_save_complete,
2869 .save_live_complete_precopy = ram_save_complete,
2870 .has_postcopy = ram_has_postcopy,
2871 .save_live_pending = ram_save_pending,
2872 .load_state = ram_load,
2873 .save_cleanup = ram_save_cleanup,
2874 .load_setup = ram_load_setup,
2875 .load_cleanup = ram_load_cleanup,
2878 void ram_mig_init(void)
2880 qemu_mutex_init(&XBZRLE.lock);
2881 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
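/*
 * Editor's note (not part of the original source): the handlers are
 * registered under the "ram" section with version 4, which matches the
 * 'version_id != 4' check in ram_load() above.
 */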