migration: Make cache_init() take an error parameter
[qemu/ar7.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
28 #include "qemu/osdep.h"
29 #include "cpu.h"
30 #include <zlib.h>
31 #include "qapi-event.h"
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "migration/page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/qmp/qerror.h"
46 #include "trace.h"
47 #include "exec/ram_addr.h"
48 #include "qemu/rcu_queue.h"
49 #include "migration/colo.h"
50 #include "migration/block.h"
52 /***********************************************************/
53 /* ram save/restore */
55 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
56 * worked for pages that were filled with the same char. We switched
57 * it to only search for the zero value, and to avoid confusion with
58 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
61 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
62 #define RAM_SAVE_FLAG_ZERO 0x02
63 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
64 #define RAM_SAVE_FLAG_PAGE 0x08
65 #define RAM_SAVE_FLAG_EOS 0x10
66 #define RAM_SAVE_FLAG_CONTINUE 0x20
67 #define RAM_SAVE_FLAG_XBZRLE 0x40
68 /* 0x80 is reserved in migration.h start with 0x100 next */
69 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
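/*
 * Illustrative sketch, not part of ram.c: these flags are OR'ed into the
 * low bits of the page offset written on the wire (see save_page_header()
 * below), which works because page offsets are TARGET_PAGE_SIZE aligned.
 * A receiver can split them apart again along these lines; the helper name
 * is hypothetical, only the masking convention is taken from this file.
 */
#if 0 /* example only */
static void example_split_wire_addr(QEMUFile *f)
{
    uint64_t addr = qemu_get_be64(f);
    uint64_t flags = addr & ~TARGET_PAGE_MASK; /* RAM_SAVE_FLAG_* bits */

    addr &= TARGET_PAGE_MASK;                  /* page-aligned offset   */
    (void)flags;
    (void)addr;
}
#endif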
71 static inline bool is_zero_range(uint8_t *p, uint64_t size)
73 return buffer_is_zero(p, size);
76 XBZRLECacheStats xbzrle_counters;
78 /* struct contains XBZRLE cache and a static page
79 used by the compression */
80 static struct {
81 /* buffer used for XBZRLE encoding */
82 uint8_t *encoded_buf;
83 /* buffer for storing page content */
84 uint8_t *current_buf;
85 /* Cache for XBZRLE, Protected by lock. */
86 PageCache *cache;
87 QemuMutex lock;
88 /* it will store a page full of zeros */
89 uint8_t *zero_target_page;
90 /* buffer used for XBZRLE decoding */
91 uint8_t *decoded_buf;
92 } XBZRLE;
94 static void XBZRLE_cache_lock(void)
96 if (migrate_use_xbzrle())
97 qemu_mutex_lock(&XBZRLE.lock);
100 static void XBZRLE_cache_unlock(void)
102 if (migrate_use_xbzrle())
103 qemu_mutex_unlock(&XBZRLE.lock);
107 * xbzrle_cache_resize: resize the xbzrle cache
109 * This function is called from qmp_migrate_set_cache_size in main
110 * thread, possibly while a migration is in progress. A running
111 * migration may be using the cache and might finish during this call,
112 * hence changes to the cache are protected by XBZRLE.lock().
114 * Returns the new_size or negative in case of error.
116 * @new_size: new cache size
117 * @errp: set *errp if the check failed, with reason
119 int64_t xbzrle_cache_resize(int64_t new_size, Error **errp)
121 PageCache *new_cache;
122 int64_t ret;
124 /* Check for truncation */
125 if (new_size != (size_t)new_size) {
126 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
127 "exceeding address space");
128 return -1;
131 /* Cache should not be larger than guest ram size */
132 if (new_size > ram_bytes_total()) {
133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134 "exceeds guest ram size");
135 return -1;
138 XBZRLE_cache_lock();
140 if (XBZRLE.cache != NULL) {
141 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
142 goto out_new_size;
144 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
145 if (!new_cache) {
146 ret = -1;
147 goto out;
150 cache_fini(XBZRLE.cache);
151 XBZRLE.cache = new_cache;
154 out_new_size:
155 ret = pow2floor(new_size);
156 out:
157 XBZRLE_cache_unlock();
158 return ret;
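/*
 * Illustrative sketch, not part of ram.c: with cache_init() now taking an
 * Error ** (see ram_state_init() below), a caller of xbzrle_cache_resize()
 * can hand its errp through and report the reason for a failed resize.
 * The function and variable names here are hypothetical; error_propagate()
 * is the standard QAPI error helper.
 */
#if 0 /* example only */
static void example_set_xbzrle_cache_size(int64_t value, Error **errp)
{
    Error *local_err = NULL;
    int64_t new_size = xbzrle_cache_resize(value, &local_err);

    if (new_size < 0) {
        /* the size checks or cache_init() filled local_err with the reason */
        error_propagate(errp, local_err);
        return;
    }
    /* on success, new_size is the pow2floor'ed size actually in effect */
}
#endif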
162 * An outstanding page request, on the source, having been received
163 * and queued
165 struct RAMSrcPageRequest {
166 RAMBlock *rb;
167 hwaddr offset;
168 hwaddr len;
170 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
173 /* State of RAM for migration */
174 struct RAMState {
175 /* QEMUFile used for this migration */
176 QEMUFile *f;
177 /* Last block that we have visited searching for dirty pages */
178 RAMBlock *last_seen_block;
179 /* Last block from where we have sent data */
180 RAMBlock *last_sent_block;
181 /* Last dirty target page we have sent */
182 ram_addr_t last_page;
183 /* last ram version we have seen */
184 uint32_t last_version;
185 /* We are in the first round */
186 bool ram_bulk_stage;
187 /* How many times the dirty rate has been too high */
188 int dirty_rate_high_cnt;
189 /* these variables are used for bitmap sync */
190 /* last time we did a full bitmap_sync */
191 int64_t time_last_bitmap_sync;
192 /* bytes transferred at start_time */
193 uint64_t bytes_xfer_prev;
194 /* number of dirty pages since start_time */
195 uint64_t num_dirty_pages_period;
196 /* xbzrle misses since the beginning of the period */
197 uint64_t xbzrle_cache_miss_prev;
198 /* number of iterations at the beginning of period */
199 uint64_t iterations_prev;
200 /* Iterations since start */
201 uint64_t iterations;
202 /* number of dirty bits in the bitmap */
203 uint64_t migration_dirty_pages;
204 /* protects modification of the bitmap */
205 QemuMutex bitmap_mutex;
206 /* The RAMBlock used in the last src_page_requests */
207 RAMBlock *last_req_rb;
208 /* Queue of outstanding page requests from the destination */
209 QemuMutex src_page_req_mutex;
210 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
212 typedef struct RAMState RAMState;
214 static RAMState *ram_state;
216 uint64_t ram_bytes_remaining(void)
218 return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
221 MigrationStats ram_counters;
223 /* used by the search for pages to send */
224 struct PageSearchStatus {
225 /* Current block being searched */
226 RAMBlock *block;
227 /* Current page to search from */
228 unsigned long page;
229 /* Set once we wrap around */
230 bool complete_round;
232 typedef struct PageSearchStatus PageSearchStatus;
234 struct CompressParam {
235 bool done;
236 bool quit;
237 QEMUFile *file;
238 QemuMutex mutex;
239 QemuCond cond;
240 RAMBlock *block;
241 ram_addr_t offset;
243 typedef struct CompressParam CompressParam;
245 struct DecompressParam {
246 bool done;
247 bool quit;
248 QemuMutex mutex;
249 QemuCond cond;
250 void *des;
251 uint8_t *compbuf;
252 int len;
254 typedef struct DecompressParam DecompressParam;
256 static CompressParam *comp_param;
257 static QemuThread *compress_threads;
258 /* comp_done_cond is used to wake up the migration thread when
259 * one of the compression threads has finished the compression.
260 * comp_done_lock is used together with comp_done_cond.
262 static QemuMutex comp_done_lock;
263 static QemuCond comp_done_cond;
264 /* The empty QEMUFileOps will be used by file in CompressParam */
265 static const QEMUFileOps empty_ops = { };
267 static DecompressParam *decomp_param;
268 static QemuThread *decompress_threads;
269 static QemuMutex decomp_done_lock;
270 static QemuCond decomp_done_cond;
272 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
273 ram_addr_t offset);
275 static void *do_data_compress(void *opaque)
277 CompressParam *param = opaque;
278 RAMBlock *block;
279 ram_addr_t offset;
281 qemu_mutex_lock(&param->mutex);
282 while (!param->quit) {
283 if (param->block) {
284 block = param->block;
285 offset = param->offset;
286 param->block = NULL;
287 qemu_mutex_unlock(&param->mutex);
289 do_compress_ram_page(param->file, block, offset);
291 qemu_mutex_lock(&comp_done_lock);
292 param->done = true;
293 qemu_cond_signal(&comp_done_cond);
294 qemu_mutex_unlock(&comp_done_lock);
296 qemu_mutex_lock(&param->mutex);
297 } else {
298 qemu_cond_wait(&param->cond, &param->mutex);
301 qemu_mutex_unlock(&param->mutex);
303 return NULL;
306 static inline void terminate_compression_threads(void)
308 int idx, thread_count;
310 thread_count = migrate_compress_threads();
312 for (idx = 0; idx < thread_count; idx++) {
313 qemu_mutex_lock(&comp_param[idx].mutex);
314 comp_param[idx].quit = true;
315 qemu_cond_signal(&comp_param[idx].cond);
316 qemu_mutex_unlock(&comp_param[idx].mutex);
320 static void compress_threads_save_cleanup(void)
322 int i, thread_count;
324 if (!migrate_use_compression()) {
325 return;
327 terminate_compression_threads();
328 thread_count = migrate_compress_threads();
329 for (i = 0; i < thread_count; i++) {
330 qemu_thread_join(compress_threads + i);
331 qemu_fclose(comp_param[i].file);
332 qemu_mutex_destroy(&comp_param[i].mutex);
333 qemu_cond_destroy(&comp_param[i].cond);
335 qemu_mutex_destroy(&comp_done_lock);
336 qemu_cond_destroy(&comp_done_cond);
337 g_free(compress_threads);
338 g_free(comp_param);
339 compress_threads = NULL;
340 comp_param = NULL;
343 static void compress_threads_save_setup(void)
345 int i, thread_count;
347 if (!migrate_use_compression()) {
348 return;
350 thread_count = migrate_compress_threads();
351 compress_threads = g_new0(QemuThread, thread_count);
352 comp_param = g_new0(CompressParam, thread_count);
353 qemu_cond_init(&comp_done_cond);
354 qemu_mutex_init(&comp_done_lock);
355 for (i = 0; i < thread_count; i++) {
356 /* comp_param[i].file is just used as a dummy buffer to save data,
357 * set its ops to empty.
359 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
360 comp_param[i].done = true;
361 comp_param[i].quit = false;
362 qemu_mutex_init(&comp_param[i].mutex);
363 qemu_cond_init(&comp_param[i].cond);
364 qemu_thread_create(compress_threads + i, "compress",
365 do_data_compress, comp_param + i,
366 QEMU_THREAD_JOINABLE);
370 /* Multiple fd's */
372 struct MultiFDSendParams {
373 uint8_t id;
374 char *name;
375 QemuThread thread;
376 QemuSemaphore sem;
377 QemuMutex mutex;
378 bool quit;
380 typedef struct MultiFDSendParams MultiFDSendParams;
382 struct {
383 MultiFDSendParams *params;
384 /* number of created threads */
385 int count;
386 } *multifd_send_state;
388 static void terminate_multifd_send_threads(Error *errp)
390 int i;
392 for (i = 0; i < multifd_send_state->count; i++) {
393 MultiFDSendParams *p = &multifd_send_state->params[i];
395 qemu_mutex_lock(&p->mutex);
396 p->quit = true;
397 qemu_sem_post(&p->sem);
398 qemu_mutex_unlock(&p->mutex);
402 int multifd_save_cleanup(Error **errp)
404 int i;
405 int ret = 0;
407 if (!migrate_use_multifd()) {
408 return 0;
410 terminate_multifd_send_threads(NULL);
411 for (i = 0; i < multifd_send_state->count; i++) {
412 MultiFDSendParams *p = &multifd_send_state->params[i];
414 qemu_thread_join(&p->thread);
415 qemu_mutex_destroy(&p->mutex);
416 qemu_sem_destroy(&p->sem);
417 g_free(p->name);
418 p->name = NULL;
420 g_free(multifd_send_state->params);
421 multifd_send_state->params = NULL;
422 g_free(multifd_send_state);
423 multifd_send_state = NULL;
424 return ret;
427 static void *multifd_send_thread(void *opaque)
429 MultiFDSendParams *p = opaque;
431 while (true) {
432 qemu_mutex_lock(&p->mutex);
433 if (p->quit) {
434 qemu_mutex_unlock(&p->mutex);
435 break;
437 qemu_mutex_unlock(&p->mutex);
438 qemu_sem_wait(&p->sem);
441 return NULL;
444 int multifd_save_setup(void)
446 int thread_count;
447 uint8_t i;
449 if (!migrate_use_multifd()) {
450 return 0;
452 thread_count = migrate_multifd_channels();
453 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
454 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
455 multifd_send_state->count = 0;
456 for (i = 0; i < thread_count; i++) {
457 MultiFDSendParams *p = &multifd_send_state->params[i];
459 qemu_mutex_init(&p->mutex);
460 qemu_sem_init(&p->sem, 0);
461 p->quit = false;
462 p->id = i;
463 p->name = g_strdup_printf("multifdsend_%d", i);
464 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
465 QEMU_THREAD_JOINABLE);
467 multifd_send_state->count++;
469 return 0;
472 struct MultiFDRecvParams {
473 uint8_t id;
474 char *name;
475 QemuThread thread;
476 QemuSemaphore sem;
477 QemuMutex mutex;
478 bool quit;
480 typedef struct MultiFDRecvParams MultiFDRecvParams;
482 struct {
483 MultiFDRecvParams *params;
484 /* number of created threads */
485 int count;
486 } *multifd_recv_state;
488 static void terminate_multifd_recv_threads(Error *errp)
490 int i;
492 for (i = 0; i < multifd_recv_state->count; i++) {
493 MultiFDRecvParams *p = &multifd_recv_state->params[i];
495 qemu_mutex_lock(&p->mutex);
496 p->quit = true;
497 qemu_sem_post(&p->sem);
498 qemu_mutex_unlock(&p->mutex);
502 int multifd_load_cleanup(Error **errp)
504 int i;
505 int ret = 0;
507 if (!migrate_use_multifd()) {
508 return 0;
510 terminate_multifd_recv_threads(NULL);
511 for (i = 0; i < multifd_recv_state->count; i++) {
512 MultiFDRecvParams *p = &multifd_recv_state->params[i];
514 qemu_thread_join(&p->thread);
515 qemu_mutex_destroy(&p->mutex);
516 qemu_sem_destroy(&p->sem);
517 g_free(p->name);
518 p->name = NULL;
520 g_free(multifd_recv_state->params);
521 multifd_recv_state->params = NULL;
522 g_free(multifd_recv_state);
523 multifd_recv_state = NULL;
525 return ret;
528 static void *multifd_recv_thread(void *opaque)
530 MultiFDRecvParams *p = opaque;
532 while (true) {
533 qemu_mutex_lock(&p->mutex);
534 if (p->quit) {
535 qemu_mutex_unlock(&p->mutex);
536 break;
538 qemu_mutex_unlock(&p->mutex);
539 qemu_sem_wait(&p->sem);
542 return NULL;
545 int multifd_load_setup(void)
547 int thread_count;
548 uint8_t i;
550 if (!migrate_use_multifd()) {
551 return 0;
553 thread_count = migrate_multifd_channels();
554 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
555 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
556 multifd_recv_state->count = 0;
557 for (i = 0; i < thread_count; i++) {
558 MultiFDRecvParams *p = &multifd_recv_state->params[i];
560 qemu_mutex_init(&p->mutex);
561 qemu_sem_init(&p->sem, 0);
562 p->quit = false;
563 p->id = i;
564 p->name = g_strdup_printf("multifdrecv_%d", i);
565 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
566 QEMU_THREAD_JOINABLE);
567 multifd_recv_state->count++;
569 return 0;
573 * save_page_header: write page header to wire
575 * If this is the 1st block, it also writes the block identification
577 * Returns the number of bytes written
579 * @f: QEMUFile where to send the data
580 * @block: block that contains the page we want to send
581 * @offset: offset inside the block for the page
582 * in the lower bits, it carries the RAM_SAVE_FLAG_* flags
584 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
585 ram_addr_t offset)
587 size_t size, len;
589 if (block == rs->last_sent_block) {
590 offset |= RAM_SAVE_FLAG_CONTINUE;
592 qemu_put_be64(f, offset);
593 size = 8;
595 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
596 len = strlen(block->idstr);
597 qemu_put_byte(f, len);
598 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
599 size += 1 + len;
600 rs->last_sent_block = block;
602 return size;
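/*
 * Worked example (illustrative, block name hypothetical): for the first
 * page sent from a block whose idstr is "pc.ram" (strlen 6), the header
 * above costs 8 (be64 offset|flags) + 1 (length byte) + 6 (idstr) = 15
 * bytes. Every later page of the same block has RAM_SAVE_FLAG_CONTINUE
 * set and costs only the 8-byte be64.
 */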
606 * mig_throttle_guest_down: throttle down the guest
608 * Reduce amount of guest cpu execution to hopefully slow down memory
609 * writes. If guest dirty memory rate is reduced below the rate at
610 * which we can transfer pages to the destination then we should be
611 * able to complete migration. Some workloads dirty memory way too
612 * fast and will not effectively converge, even with auto-converge.
614 static void mig_throttle_guest_down(void)
616 MigrationState *s = migrate_get_current();
617 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
618 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
620 /* We have not started throttling yet. Let's start it. */
621 if (!cpu_throttle_active()) {
622 cpu_throttle_set(pct_initial);
623 } else {
624 /* Throttling already on, just increase the rate */
625 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
630 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
632 * @rs: current RAM state
633 * @current_addr: address for the zero page
635 * Update the xbzrle cache to reflect a page that's been sent as all 0.
636 * The important thing is that a stale (not-yet-0'd) page be replaced
637 * by the new data.
638 * As a bonus, if the page wasn't in the cache it gets added so that
639 * when a small write is made into the 0'd page it gets XBZRLE sent.
641 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
643 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
644 return;
647 /* We don't care if this fails to allocate a new cache page
648 * as long as it updated an old one */
649 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
650 ram_counters.dirty_sync_count);
653 #define ENCODING_FLAG_XBZRLE 0x1
656 * save_xbzrle_page: compress and send current page
658 * Returns: 1 means that we wrote the page
659 * 0 means that page is identical to the one already sent
660 * -1 means that xbzrle would be longer than normal
662 * @rs: current RAM state
663 * @current_data: pointer to the address of the page contents
664 * @current_addr: addr of the page
665 * @block: block that contains the page we want to send
666 * @offset: offset inside the block for the page
667 * @last_stage: if we are at the completion stage
669 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
670 ram_addr_t current_addr, RAMBlock *block,
671 ram_addr_t offset, bool last_stage)
673 int encoded_len = 0, bytes_xbzrle;
674 uint8_t *prev_cached_page;
676 if (!cache_is_cached(XBZRLE.cache, current_addr,
677 ram_counters.dirty_sync_count)) {
678 xbzrle_counters.cache_miss++;
679 if (!last_stage) {
680 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
681 ram_counters.dirty_sync_count) == -1) {
682 return -1;
683 } else {
684 /* update *current_data when the page has been
685 inserted into cache */
686 *current_data = get_cached_data(XBZRLE.cache, current_addr);
689 return -1;
692 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
694 /* save current buffer into memory */
695 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
697 /* XBZRLE encoding (if there is no overflow) */
698 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
699 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
700 TARGET_PAGE_SIZE);
701 if (encoded_len == 0) {
702 trace_save_xbzrle_page_skipping();
703 return 0;
704 } else if (encoded_len == -1) {
705 trace_save_xbzrle_page_overflow();
706 xbzrle_counters.overflow++;
707 /* update data in the cache */
708 if (!last_stage) {
709 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
710 *current_data = prev_cached_page;
712 return -1;
715 /* we need to update the data in the cache, in order to get the same data */
716 if (!last_stage) {
717 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
720 /* Send XBZRLE based compressed page */
721 bytes_xbzrle = save_page_header(rs, rs->f, block,
722 offset | RAM_SAVE_FLAG_XBZRLE);
723 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
724 qemu_put_be16(rs->f, encoded_len);
725 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
726 bytes_xbzrle += encoded_len + 1 + 2;
727 xbzrle_counters.pages++;
728 xbzrle_counters.bytes += bytes_xbzrle;
729 ram_counters.transferred += bytes_xbzrle;
731 return 1;
735 * migration_bitmap_find_dirty: find the next dirty page from start
737 * Called with rcu_read_lock() to protect migration_bitmap
739 * Returns the page index within the RAMBlock of the start of the next dirty page
741 * @rs: current RAM state
742 * @rb: RAMBlock where to search for dirty pages
743 * @start: page where we start the search
745 static inline
746 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
747 unsigned long start)
749 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
750 unsigned long *bitmap = rb->bmap;
751 unsigned long next;
753 if (rs->ram_bulk_stage && start > 0) {
754 next = start + 1;
755 } else {
756 next = find_next_bit(bitmap, size, start);
759 return next;
762 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
763 RAMBlock *rb,
764 unsigned long page)
766 bool ret;
768 ret = test_and_clear_bit(page, rb->bmap);
770 if (ret) {
771 rs->migration_dirty_pages--;
773 return ret;
776 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
777 ram_addr_t start, ram_addr_t length)
779 rs->migration_dirty_pages +=
780 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
781 &rs->num_dirty_pages_period);
785 * ram_pagesize_summary: calculate all the pagesizes of a VM
787 * Returns a summary bitmap of the page sizes of all RAMBlocks
789 * For VMs with just normal pages this is equivalent to the host page
790 * size. If it's got some huge pages then it's the OR of all the
791 * different page sizes.
793 uint64_t ram_pagesize_summary(void)
795 RAMBlock *block;
796 uint64_t summary = 0;
798 RAMBLOCK_FOREACH(block) {
799 summary |= block->page_size;
802 return summary;
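/*
 * Worked example: a guest whose RAM blocks all use 4 KiB pages yields a
 * summary of 0x1000; if one block is additionally backed by 2 MiB huge
 * pages the result is 0x1000 | 0x200000 = 0x201000.
 */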
805 static void migration_bitmap_sync(RAMState *rs)
807 RAMBlock *block;
808 int64_t end_time;
809 uint64_t bytes_xfer_now;
811 ram_counters.dirty_sync_count++;
813 if (!rs->time_last_bitmap_sync) {
814 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
817 trace_migration_bitmap_sync_start();
818 memory_global_dirty_log_sync();
820 qemu_mutex_lock(&rs->bitmap_mutex);
821 rcu_read_lock();
822 RAMBLOCK_FOREACH(block) {
823 migration_bitmap_sync_range(rs, block, 0, block->used_length);
825 rcu_read_unlock();
826 qemu_mutex_unlock(&rs->bitmap_mutex);
828 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
830 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
832 /* more than 1 second = 1000 milliseconds */
833 if (end_time > rs->time_last_bitmap_sync + 1000) {
834 /* calculate period counters */
835 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
836 / (end_time - rs->time_last_bitmap_sync);
837 bytes_xfer_now = ram_counters.transferred;
839 /* During block migration the auto-converge logic incorrectly detects
840 * that ram migration makes no progress. Avoid this by disabling the
841 * throttling logic during the bulk phase of block migration. */
842 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
843 /* The following detection logic can be refined later. For now:
844 Check to see if the dirtied bytes exceed 50% of the approx.
845 amount of bytes that just got transferred since the last time we
846 were in this routine. If that happens twice, start or increase
847 throttling */
849 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
850 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
851 (++rs->dirty_rate_high_cnt >= 2)) {
852 trace_migration_throttle();
853 rs->dirty_rate_high_cnt = 0;
854 mig_throttle_guest_down();
858 if (migrate_use_xbzrle()) {
859 if (rs->iterations_prev != rs->iterations) {
860 xbzrle_counters.cache_miss_rate =
861 (double)(xbzrle_counters.cache_miss -
862 rs->xbzrle_cache_miss_prev) /
863 (rs->iterations - rs->iterations_prev);
865 rs->iterations_prev = rs->iterations;
866 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
869 /* reset period counters */
870 rs->time_last_bitmap_sync = end_time;
871 rs->num_dirty_pages_period = 0;
872 rs->bytes_xfer_prev = bytes_xfer_now;
874 if (migrate_use_events()) {
875 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
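/*
 * Worked example for the auto-converge check above (numbers hypothetical):
 * if roughly 400 MB were transferred during the last ~1 s period and pages
 * worth 250 MB were dirtied in the same period, then 250 MB > 400 MB / 2,
 * so dirty_rate_high_cnt is bumped; if the next period looks the same,
 * mig_throttle_guest_down() is called, which starts throttling at
 * cpu_throttle_initial or raises it by cpu_throttle_increment.
 */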
880 * save_zero_page: send the zero page to the stream
882 * Returns the number of pages written.
884 * @rs: current RAM state
885 * @block: block that contains the page we want to send
886 * @offset: offset inside the block for the page
887 * @p: pointer to the page
889 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
890 uint8_t *p)
892 int pages = -1;
894 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
895 ram_counters.duplicate++;
896 ram_counters.transferred +=
897 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
898 qemu_put_byte(rs->f, 0);
899 ram_counters.transferred += 1;
900 pages = 1;
903 return pages;
906 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
908 if (!migrate_release_ram() || !migration_in_postcopy()) {
909 return;
912 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
916 * ram_save_page: send the given page to the stream
918 * Returns the number of pages written.
919 * < 0 - error
920 * >=0 - Number of pages written - this might legally be 0
921 * if xbzrle noticed the page was the same.
923 * @rs: current RAM state
924 * @block: block that contains the page we want to send
925 * @offset: offset inside the block for the page
926 * @last_stage: if we are at the completion stage
928 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
930 int pages = -1;
931 uint64_t bytes_xmit;
932 ram_addr_t current_addr;
933 uint8_t *p;
934 int ret;
935 bool send_async = true;
936 RAMBlock *block = pss->block;
937 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
939 p = block->host + offset;
940 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
942 /* If in doubt, send the page as normal */
943 bytes_xmit = 0;
944 ret = ram_control_save_page(rs->f, block->offset,
945 offset, TARGET_PAGE_SIZE, &bytes_xmit);
946 if (bytes_xmit) {
947 ram_counters.transferred += bytes_xmit;
948 pages = 1;
951 XBZRLE_cache_lock();
953 current_addr = block->offset + offset;
955 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
956 if (ret != RAM_SAVE_CONTROL_DELAYED) {
957 if (bytes_xmit > 0) {
958 ram_counters.normal++;
959 } else if (bytes_xmit == 0) {
960 ram_counters.duplicate++;
963 } else {
964 pages = save_zero_page(rs, block, offset, p);
965 if (pages > 0) {
966 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
967 * page would be stale
969 xbzrle_cache_zero_page(rs, current_addr);
970 ram_release_pages(block->idstr, offset, pages);
971 } else if (!rs->ram_bulk_stage &&
972 !migration_in_postcopy() && migrate_use_xbzrle()) {
973 pages = save_xbzrle_page(rs, &p, current_addr, block,
974 offset, last_stage);
975 if (!last_stage) {
976 /* Can't send this cached data async, since the cache page
977 * might get updated before it gets to the wire
979 send_async = false;
984 /* XBZRLE overflow or normal page */
985 if (pages == -1) {
986 ram_counters.transferred +=
987 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
988 if (send_async) {
989 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
990 migrate_release_ram() &
991 migration_in_postcopy());
992 } else {
993 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
995 ram_counters.transferred += TARGET_PAGE_SIZE;
996 pages = 1;
997 ram_counters.normal++;
1000 XBZRLE_cache_unlock();
1002 return pages;
1005 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
1006 ram_addr_t offset)
1008 RAMState *rs = ram_state;
1009 int bytes_sent, blen;
1010 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1012 bytes_sent = save_page_header(rs, f, block, offset |
1013 RAM_SAVE_FLAG_COMPRESS_PAGE);
1014 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1015 migrate_compress_level());
1016 if (blen < 0) {
1017 bytes_sent = 0;
1018 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1019 error_report("compressed data failed!");
1020 } else {
1021 bytes_sent += blen;
1022 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1025 return bytes_sent;
1028 static void flush_compressed_data(RAMState *rs)
1030 int idx, len, thread_count;
1032 if (!migrate_use_compression()) {
1033 return;
1035 thread_count = migrate_compress_threads();
1037 qemu_mutex_lock(&comp_done_lock);
1038 for (idx = 0; idx < thread_count; idx++) {
1039 while (!comp_param[idx].done) {
1040 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1043 qemu_mutex_unlock(&comp_done_lock);
1045 for (idx = 0; idx < thread_count; idx++) {
1046 qemu_mutex_lock(&comp_param[idx].mutex);
1047 if (!comp_param[idx].quit) {
1048 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1049 ram_counters.transferred += len;
1051 qemu_mutex_unlock(&comp_param[idx].mutex);
1055 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1056 ram_addr_t offset)
1058 param->block = block;
1059 param->offset = offset;
1062 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1063 ram_addr_t offset)
1065 int idx, thread_count, bytes_xmit = -1, pages = -1;
1067 thread_count = migrate_compress_threads();
1068 qemu_mutex_lock(&comp_done_lock);
1069 while (true) {
1070 for (idx = 0; idx < thread_count; idx++) {
1071 if (comp_param[idx].done) {
1072 comp_param[idx].done = false;
1073 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1074 qemu_mutex_lock(&comp_param[idx].mutex);
1075 set_compress_params(&comp_param[idx], block, offset);
1076 qemu_cond_signal(&comp_param[idx].cond);
1077 qemu_mutex_unlock(&comp_param[idx].mutex);
1078 pages = 1;
1079 ram_counters.normal++;
1080 ram_counters.transferred += bytes_xmit;
1081 break;
1084 if (pages > 0) {
1085 break;
1086 } else {
1087 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1090 qemu_mutex_unlock(&comp_done_lock);
1092 return pages;
1096 * ram_save_compressed_page: compress the given page and send it to the stream
1098 * Returns the number of pages written.
1100 * @rs: current RAM state
1101 * @block: block that contains the page we want to send
1102 * @offset: offset inside the block for the page
1103 * @last_stage: if we are at the completion stage
1105 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
1106 bool last_stage)
1108 int pages = -1;
1109 uint64_t bytes_xmit = 0;
1110 uint8_t *p;
1111 int ret, blen;
1112 RAMBlock *block = pss->block;
1113 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1115 p = block->host + offset;
1117 ret = ram_control_save_page(rs->f, block->offset,
1118 offset, TARGET_PAGE_SIZE, &bytes_xmit);
1119 if (bytes_xmit) {
1120 ram_counters.transferred += bytes_xmit;
1121 pages = 1;
1123 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1124 if (ret != RAM_SAVE_CONTROL_DELAYED) {
1125 if (bytes_xmit > 0) {
1126 ram_counters.normal++;
1127 } else if (bytes_xmit == 0) {
1128 ram_counters.duplicate++;
1131 } else {
1132 /* When starting the process of a new block, the first page of
1133 * the block should be sent out before other pages in the same
1134 * block, and all the pages in the last block should have been sent
1135 * out. Keeping this order is important, because the 'cont' flag
1136 * is used to avoid resending the block name.
1138 if (block != rs->last_sent_block) {
1139 flush_compressed_data(rs);
1140 pages = save_zero_page(rs, block, offset, p);
1141 if (pages == -1) {
1142 /* Make sure the first page is sent out before other pages */
1143 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1144 RAM_SAVE_FLAG_COMPRESS_PAGE);
1145 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1146 migrate_compress_level());
1147 if (blen > 0) {
1148 ram_counters.transferred += bytes_xmit + blen;
1149 ram_counters.normal++;
1150 pages = 1;
1151 } else {
1152 qemu_file_set_error(rs->f, blen);
1153 error_report("compressed data failed!");
1156 if (pages > 0) {
1157 ram_release_pages(block->idstr, offset, pages);
1159 } else {
1160 pages = save_zero_page(rs, block, offset, p);
1161 if (pages == -1) {
1162 pages = compress_page_with_multi_thread(rs, block, offset);
1163 } else {
1164 ram_release_pages(block->idstr, offset, pages);
1169 return pages;
1173 * find_dirty_block: find the next dirty page and update any state
1174 * associated with the search process.
1176 * Returns if a page is found
1178 * @rs: current RAM state
1179 * @pss: data about the state of the current dirty page scan
1180 * @again: set to false if the search has scanned the whole of RAM
1182 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1184 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1185 if (pss->complete_round && pss->block == rs->last_seen_block &&
1186 pss->page >= rs->last_page) {
1188 * We've been once around the RAM and haven't found anything.
1189 * Give up.
1191 *again = false;
1192 return false;
1194 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1195 /* Didn't find anything in this RAM Block */
1196 pss->page = 0;
1197 pss->block = QLIST_NEXT_RCU(pss->block, next);
1198 if (!pss->block) {
1199 /* Hit the end of the list */
1200 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1201 /* Flag that we've looped */
1202 pss->complete_round = true;
1203 rs->ram_bulk_stage = false;
1204 if (migrate_use_xbzrle()) {
1205 /* If xbzrle is on, stop using the data compression at this
1206 * point. In theory, xbzrle can do better than compression.
1208 flush_compressed_data(rs);
1211 /* Didn't find anything this time, but try again on the new block */
1212 *again = true;
1213 return false;
1214 } else {
1215 /* Can go around again, but... */
1216 *again = true;
1217 /* We've found something so probably don't need to */
1218 return true;
1223 * unqueue_page: gets a page off the queue
1225 * Helper for 'get_queued_page' - gets a page off the queue
1227 * Returns the block of the page (or NULL if none available)
1229 * @rs: current RAM state
1230 * @offset: used to return the offset within the RAMBlock
1232 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1234 RAMBlock *block = NULL;
1236 qemu_mutex_lock(&rs->src_page_req_mutex);
1237 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1238 struct RAMSrcPageRequest *entry =
1239 QSIMPLEQ_FIRST(&rs->src_page_requests);
1240 block = entry->rb;
1241 *offset = entry->offset;
1243 if (entry->len > TARGET_PAGE_SIZE) {
1244 entry->len -= TARGET_PAGE_SIZE;
1245 entry->offset += TARGET_PAGE_SIZE;
1246 } else {
1247 memory_region_unref(block->mr);
1248 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1249 g_free(entry);
1252 qemu_mutex_unlock(&rs->src_page_req_mutex);
1254 return block;
1258 * get_queued_page: unqueue a page from the postcopy requests
1260 * Skips pages that are already sent (!dirty)
1262 * Returns if a queued page is found
1264 * @rs: current RAM state
1265 * @pss: data about the state of the current dirty page scan
1267 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1269 RAMBlock *block;
1270 ram_addr_t offset;
1271 bool dirty;
1273 do {
1274 block = unqueue_page(rs, &offset);
1276 * We're sending this page, and since it's postcopy nothing else
1277 * will dirty it, and we must make sure it doesn't get sent again
1278 * even if this queue request was received after the background
1279 * search already sent it.
1281 if (block) {
1282 unsigned long page;
1284 page = offset >> TARGET_PAGE_BITS;
1285 dirty = test_bit(page, block->bmap);
1286 if (!dirty) {
1287 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1288 page, test_bit(page, block->unsentmap));
1289 } else {
1290 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1294 } while (block && !dirty);
1296 if (block) {
1298 * As soon as we start servicing pages out of order, then we have
1299 * to kill the bulk stage, since the bulk stage assumes
1300 * in (migration_bitmap_find_and_reset_dirty) that every page is
1301 * dirty, that's no longer true.
1303 rs->ram_bulk_stage = false;
1306 * We want the background search to continue from the queued page
1307 * since the guest is likely to want other pages near to the page
1308 * it just requested.
1310 pss->block = block;
1311 pss->page = offset >> TARGET_PAGE_BITS;
1314 return !!block;
1318 * migration_page_queue_free: drop any remaining pages in the ram
1319 * request queue
1321 * It should be empty at the end anyway, but in error cases there may
1322 * be some left. In case any page is left, we drop it.
1325 static void migration_page_queue_free(RAMState *rs)
1327 struct RAMSrcPageRequest *mspr, *next_mspr;
1328 /* This queue generally should be empty - but in the case of a failed
1329 * migration it might have some leftovers in it.
1331 rcu_read_lock();
1332 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1333 memory_region_unref(mspr->rb->mr);
1334 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1335 g_free(mspr);
1337 rcu_read_unlock();
1341 * ram_save_queue_pages: queue the page for transmission
1343 * A request from postcopy destination for example.
1345 * Returns zero on success or negative on error
1347 * @rbname: Name of the RAMBlock of the request. NULL means the
1348 * same as the last one.
1349 * @start: starting address from the start of the RAMBlock
1350 * @len: length (in bytes) to send
1352 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1354 RAMBlock *ramblock;
1355 RAMState *rs = ram_state;
1357 ram_counters.postcopy_requests++;
1358 rcu_read_lock();
1359 if (!rbname) {
1360 /* Reuse last RAMBlock */
1361 ramblock = rs->last_req_rb;
1363 if (!ramblock) {
1365 * Shouldn't happen, we can't reuse the last RAMBlock if
1366 * it's the 1st request.
1368 error_report("ram_save_queue_pages no previous block");
1369 goto err;
1371 } else {
1372 ramblock = qemu_ram_block_by_name(rbname);
1374 if (!ramblock) {
1375 /* We shouldn't be asked for a non-existent RAMBlock */
1376 error_report("ram_save_queue_pages no block '%s'", rbname);
1377 goto err;
1379 rs->last_req_rb = ramblock;
1381 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1382 if (start+len > ramblock->used_length) {
1383 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1384 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1385 __func__, start, len, ramblock->used_length);
1386 goto err;
1389 struct RAMSrcPageRequest *new_entry =
1390 g_malloc0(sizeof(struct RAMSrcPageRequest));
1391 new_entry->rb = ramblock;
1392 new_entry->offset = start;
1393 new_entry->len = len;
1395 memory_region_ref(ramblock->mr);
1396 qemu_mutex_lock(&rs->src_page_req_mutex);
1397 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1398 qemu_mutex_unlock(&rs->src_page_req_mutex);
1399 rcu_read_unlock();
1401 return 0;
1403 err:
1404 rcu_read_unlock();
1405 return -1;
1409 * ram_save_target_page: save one target page
1411 * Returns the number of pages written
1413 * @rs: current RAM state
1414 * @ms: current migration state
1415 * @pss: data about the page we want to send
1416 * @last_stage: if we are at the completion stage
1418 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1419 bool last_stage)
1421 int res = 0;
1423 /* Check if the page is dirty and if it is, send it */
1424 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1426 * If xbzrle is on, stop using the data compression after first
1427 * round of migration even if compression is enabled. In theory,
1428 * xbzrle can do better than compression.
1430 if (migrate_use_compression() &&
1431 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1432 res = ram_save_compressed_page(rs, pss, last_stage);
1433 } else {
1434 res = ram_save_page(rs, pss, last_stage);
1437 if (res < 0) {
1438 return res;
1440 if (pss->block->unsentmap) {
1441 clear_bit(pss->page, pss->block->unsentmap);
1445 return res;
1449 * ram_save_host_page: save a whole host page
1451 * Starting at *offset send pages up to the end of the current host
1452 * page. It's valid for the initial offset to point into the middle of
1453 * a host page in which case the remainder of the hostpage is sent.
1454 * Only dirty target pages are sent. Note that the host page size may
1455 * be a huge page for this block.
1456 * The saving stops at the boundary of the used_length of the block
1457 * if the RAMBlock isn't a multiple of the host page size.
1459 * Returns the number of pages written or negative on error
1461 * @rs: current RAM state
1462 * @ms: current migration state
1463 * @pss: data about the page we want to send
1464 * @last_stage: if we are at the completion stage
1466 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1467 bool last_stage)
1469 int tmppages, pages = 0;
1470 size_t pagesize_bits =
1471 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1473 do {
1474 tmppages = ram_save_target_page(rs, pss, last_stage);
1475 if (tmppages < 0) {
1476 return tmppages;
1479 pages += tmppages;
1480 pss->page++;
1481 } while ((pss->page & (pagesize_bits - 1)) &&
1482 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1484 /* The offset we leave with is the last one we looked at */
1485 pss->page--;
1486 return pages;
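/*
 * Worked example: for a RAMBlock backed by 2 MiB huge pages on a target
 * with 4 KiB pages, pagesize_bits is 2 MiB / 4 KiB = 512, so the loop
 * above walks up to 512 consecutive target pages, i.e. exactly one host
 * page, before returning.
 */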
1490 * ram_find_and_save_block: finds a dirty page and sends it to f
1492 * Called within an RCU critical section.
1494 * Returns the number of pages written where zero means no dirty pages
1496 * @rs: current RAM state
1497 * @last_stage: if we are at the completion stage
1499 * On systems where host-page-size > target-page-size it will send all the
1500 * pages in a host page that are dirty.
1503 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1505 PageSearchStatus pss;
1506 int pages = 0;
1507 bool again, found;
1509 /* No dirty page as there is zero RAM */
1510 if (!ram_bytes_total()) {
1511 return pages;
1514 pss.block = rs->last_seen_block;
1515 pss.page = rs->last_page;
1516 pss.complete_round = false;
1518 if (!pss.block) {
1519 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1522 do {
1523 again = true;
1524 found = get_queued_page(rs, &pss);
1526 if (!found) {
1527 /* priority queue empty, so just search for something dirty */
1528 found = find_dirty_block(rs, &pss, &again);
1531 if (found) {
1532 pages = ram_save_host_page(rs, &pss, last_stage);
1534 } while (!pages && again);
1536 rs->last_seen_block = pss.block;
1537 rs->last_page = pss.page;
1539 return pages;
1542 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1544 uint64_t pages = size / TARGET_PAGE_SIZE;
1546 if (zero) {
1547 ram_counters.duplicate += pages;
1548 } else {
1549 ram_counters.normal += pages;
1550 ram_counters.transferred += size;
1551 qemu_update_position(f, size);
1555 uint64_t ram_bytes_total(void)
1557 RAMBlock *block;
1558 uint64_t total = 0;
1560 rcu_read_lock();
1561 RAMBLOCK_FOREACH(block) {
1562 total += block->used_length;
1564 rcu_read_unlock();
1565 return total;
1568 static void xbzrle_load_setup(void)
1570 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1573 static void xbzrle_load_cleanup(void)
1575 g_free(XBZRLE.decoded_buf);
1576 XBZRLE.decoded_buf = NULL;
1579 static void ram_save_cleanup(void *opaque)
1581 RAMState **rsp = opaque;
1582 RAMBlock *block;
1584 /* the caller holds the iothread lock or is in a bh, so there is
1585 * no writing race against this migration_bitmap
1587 memory_global_dirty_log_stop();
1589 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1590 g_free(block->bmap);
1591 block->bmap = NULL;
1592 g_free(block->unsentmap);
1593 block->unsentmap = NULL;
1596 XBZRLE_cache_lock();
1597 if (XBZRLE.cache) {
1598 cache_fini(XBZRLE.cache);
1599 g_free(XBZRLE.encoded_buf);
1600 g_free(XBZRLE.current_buf);
1601 g_free(XBZRLE.zero_target_page);
1602 XBZRLE.cache = NULL;
1603 XBZRLE.encoded_buf = NULL;
1604 XBZRLE.current_buf = NULL;
1605 XBZRLE.zero_target_page = NULL;
1607 XBZRLE_cache_unlock();
1608 migration_page_queue_free(*rsp);
1609 compress_threads_save_cleanup();
1610 g_free(*rsp);
1611 *rsp = NULL;
1614 static void ram_state_reset(RAMState *rs)
1616 rs->last_seen_block = NULL;
1617 rs->last_sent_block = NULL;
1618 rs->last_page = 0;
1619 rs->last_version = ram_list.version;
1620 rs->ram_bulk_stage = true;
1623 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1626 * 'expected' is the value you expect the bitmap mostly to be full
1627 * of; it won't bother printing lines that are all this value.
1628 * If 'todump' is null the migration bitmap is dumped.
1630 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1631 unsigned long pages)
1633 int64_t cur;
1634 int64_t linelen = 128;
1635 char linebuf[129];
1637 for (cur = 0; cur < pages; cur += linelen) {
1638 int64_t curb;
1639 bool found = false;
1641 * Last line; catch the case where the line length
1642 * is longer than remaining ram
1644 if (cur + linelen > pages) {
1645 linelen = pages - cur;
1647 for (curb = 0; curb < linelen; curb++) {
1648 bool thisbit = test_bit(cur + curb, todump);
1649 linebuf[curb] = thisbit ? '1' : '.';
1650 found = found || (thisbit != expected);
1652 if (found) {
1653 linebuf[curb] = '\0';
1654 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1659 /* **** functions for postcopy ***** */
1661 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1663 struct RAMBlock *block;
1665 RAMBLOCK_FOREACH(block) {
1666 unsigned long *bitmap = block->bmap;
1667 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1668 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1670 while (run_start < range) {
1671 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1672 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1673 (run_end - run_start) << TARGET_PAGE_BITS);
1674 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1680 * postcopy_send_discard_bm_ram: discard a RAMBlock
1682 * Returns zero on success
1684 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1685 * Note: At this point the 'unsentmap' is the processed bitmap combined
1686 * with the dirtymap; so a '1' means it's either dirty or unsent.
1688 * @ms: current migration state
1689 * @pds: state for postcopy
1690 * @block: RAMBlock to discard
1693 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1694 PostcopyDiscardState *pds,
1695 RAMBlock *block)
1697 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1698 unsigned long current;
1699 unsigned long *unsentmap = block->unsentmap;
1701 for (current = 0; current < end; ) {
1702 unsigned long one = find_next_bit(unsentmap, end, current);
1704 if (one <= end) {
1705 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1706 unsigned long discard_length;
1708 if (zero >= end) {
1709 discard_length = end - one;
1710 } else {
1711 discard_length = zero - one;
1713 if (discard_length) {
1714 postcopy_discard_send_range(ms, pds, one, discard_length);
1716 current = one + discard_length;
1717 } else {
1718 current = one;
1722 return 0;
1726 * postcopy_each_ram_send_discard: discard all RAMBlocks
1728 * Returns 0 for success or negative for error
1730 * Utility for the outgoing postcopy code.
1731 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1732 * passing it bitmap indexes and name.
1733 * (qemu_ram_foreach_block ends up passing unscaled lengths
1734 * which would mean postcopy code would have to deal with target page)
1736 * @ms: current migration state
1738 static int postcopy_each_ram_send_discard(MigrationState *ms)
1740 struct RAMBlock *block;
1741 int ret;
1743 RAMBLOCK_FOREACH(block) {
1744 PostcopyDiscardState *pds =
1745 postcopy_discard_send_init(ms, block->idstr);
1748 * Postcopy sends chunks of bitmap over the wire, but it
1749 * just needs indexes at this point, avoids it having
1750 * target page specific code.
1752 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1753 postcopy_discard_send_finish(ms, pds);
1754 if (ret) {
1755 return ret;
1759 return 0;
1763 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1765 * Helper for postcopy_chunk_hostpages; it's called twice to
1766 * canonicalize the two bitmaps, that are similar, but one is
1767 * inverted.
1769 * Postcopy requires that all target pages in a hostpage are dirty or
1770 * clean, not a mix. This function canonicalizes the bitmaps.
1772 * @ms: current migration state
1773 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1774 * otherwise we need to canonicalize partially dirty host pages
1775 * @block: block that contains the page we want to canonicalize
1776 * @pds: state for postcopy
1778 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1779 RAMBlock *block,
1780 PostcopyDiscardState *pds)
1782 RAMState *rs = ram_state;
1783 unsigned long *bitmap = block->bmap;
1784 unsigned long *unsentmap = block->unsentmap;
1785 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1786 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1787 unsigned long run_start;
1789 if (block->page_size == TARGET_PAGE_SIZE) {
1790 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1791 return;
1794 if (unsent_pass) {
1795 /* Find a sent page */
1796 run_start = find_next_zero_bit(unsentmap, pages, 0);
1797 } else {
1798 /* Find a dirty page */
1799 run_start = find_next_bit(bitmap, pages, 0);
1802 while (run_start < pages) {
1803 bool do_fixup = false;
1804 unsigned long fixup_start_addr;
1805 unsigned long host_offset;
1808 * If the start of this run of pages is in the middle of a host
1809 * page, then we need to fixup this host page.
1811 host_offset = run_start % host_ratio;
1812 if (host_offset) {
1813 do_fixup = true;
1814 run_start -= host_offset;
1815 fixup_start_addr = run_start;
1816 /* For the next pass */
1817 run_start = run_start + host_ratio;
1818 } else {
1819 /* Find the end of this run */
1820 unsigned long run_end;
1821 if (unsent_pass) {
1822 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1823 } else {
1824 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1827 * If the end isn't at the start of a host page, then the
1828 * run doesn't finish at the end of a host page
1829 * and we need to discard.
1831 host_offset = run_end % host_ratio;
1832 if (host_offset) {
1833 do_fixup = true;
1834 fixup_start_addr = run_end - host_offset;
1836 * This host page has gone, the next loop iteration starts
1837 * from after the fixup
1839 run_start = fixup_start_addr + host_ratio;
1840 } else {
1842 * No discards on this iteration, next loop starts from
1843 * next sent/dirty page
1845 run_start = run_end + 1;
1849 if (do_fixup) {
1850 unsigned long page;
1852 /* Tell the destination to discard this page */
1853 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1854 /* For the unsent_pass we:
1855 * discard partially sent pages
1856 * For the !unsent_pass (dirty) we:
1857 * discard partially dirty pages that were sent
1858 * (any partially sent pages were already discarded
1859 * by the previous unsent_pass)
1861 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1862 host_ratio);
1865 /* Clean up the bitmap */
1866 for (page = fixup_start_addr;
1867 page < fixup_start_addr + host_ratio; page++) {
1868 /* All pages in this host page are now not sent */
1869 set_bit(page, unsentmap);
1872 * Remark them as dirty, updating the count for any pages
1873 * that weren't previously dirty.
1875 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1879 if (unsent_pass) {
1880 /* Find the next sent page for the next iteration */
1881 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1882 } else {
1883 /* Find the next dirty page for the next iteration */
1884 run_start = find_next_bit(bitmap, pages, run_start);
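/*
 * Worked example (page size assumptions hypothetical): with 2 MiB host
 * pages and 4 KiB target pages, host_ratio is 512. If a dirty run starts
 * at target page 1000, host_offset is 1000 % 512 = 488, so the run is
 * rewound to page 512 and the whole host page covering target pages
 * 512..1023 is discarded and re-marked dirty/unsent; the next iteration
 * resumes the search from page 1024.
 */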
1890 * postcopy_chunk_hostpages: discard any partially sent host page
1892 * Utility for the outgoing postcopy code.
1894 * Discard any partially sent host-page size chunks, mark any partially
1895 * dirty host-page size chunks as all dirty. In this case the host-page
1896 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1898 * Returns zero on success
1900 * @ms: current migration state
1901 * @block: block we want to work with
1903 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1905 PostcopyDiscardState *pds =
1906 postcopy_discard_send_init(ms, block->idstr);
1908 /* First pass: Discard all partially sent host pages */
1909 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1911 * Second pass: Ensure that all partially dirty host pages are made
1912 * fully dirty.
1914 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1916 postcopy_discard_send_finish(ms, pds);
1917 return 0;
1921 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1923 * Returns zero on success
1925 * Transmit the set of pages to be discarded after precopy to the target;
1926 * these are pages that:
1927 * a) Have been previously transmitted but are now dirty again
1928 * b) Pages that have never been transmitted, this ensures that
1929 * any pages on the destination that have been mapped by background
1930 * tasks get discarded (transparent huge pages is the specific concern)
1931 * Hopefully this is pretty sparse
1933 * @ms: current migration state
1935 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1937 RAMState *rs = ram_state;
1938 RAMBlock *block;
1939 int ret;
1941 rcu_read_lock();
1943 /* This should be our last sync, the src is now paused */
1944 migration_bitmap_sync(rs);
1946 /* Easiest way to make sure we don't resume in the middle of a host-page */
1947 rs->last_seen_block = NULL;
1948 rs->last_sent_block = NULL;
1949 rs->last_page = 0;
1951 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1952 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1953 unsigned long *bitmap = block->bmap;
1954 unsigned long *unsentmap = block->unsentmap;
1956 if (!unsentmap) {
1957 /* We don't have a safe way to resize the sentmap, so
1958 * if the bitmap was resized it will be NULL at this
1959 * point.
1961 error_report("migration ram resized during precopy phase");
1962 rcu_read_unlock();
1963 return -EINVAL;
1965 /* Deal with TPS != HPS and huge pages */
1966 ret = postcopy_chunk_hostpages(ms, block);
1967 if (ret) {
1968 rcu_read_unlock();
1969 return ret;
1973 * Update the unsentmap to be unsentmap = unsentmap | dirty
1975 bitmap_or(unsentmap, unsentmap, bitmap, pages);
1976 #ifdef DEBUG_POSTCOPY
1977 ram_debug_dump_bitmap(unsentmap, true, pages);
1978 #endif
1980 trace_ram_postcopy_send_discard_bitmap();
1982 ret = postcopy_each_ram_send_discard(ms);
1983 rcu_read_unlock();
1985 return ret;
1989 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1991 * Returns zero on success
1993 * @rbname: name of the RAMBlock of the request. NULL means the
1994 * same as the last one.
1995 * @start: start offset (in bytes) within the RAMBlock
1996 * @length: length (in bytes) to discard
1998 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2000 int ret = -1;
2002 trace_ram_discard_range(rbname, start, length);
2004 rcu_read_lock();
2005 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2007 if (!rb) {
2008 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2009 goto err;
2012 ret = ram_block_discard_range(rb, start, length);
2014 err:
2015 rcu_read_unlock();
2017 return ret;
2020 static int ram_state_init(RAMState **rsp)
2022 *rsp = g_new0(RAMState, 1);
2023 Error *local_err = NULL;
2025 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2026 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2027 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2029 if (migrate_use_xbzrle()) {
2030 XBZRLE_cache_lock();
2031 XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
2032 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2033 TARGET_PAGE_SIZE, &local_err);
2034 if (!XBZRLE.cache) {
2035 XBZRLE_cache_unlock();
2036 error_report_err(local_err);
2037 g_free(*rsp);
2038 *rsp = NULL;
2039 return -1;
2041 XBZRLE_cache_unlock();
2043 /* We prefer not to abort if there is no memory */
2044 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2045 if (!XBZRLE.encoded_buf) {
2046 error_report("Error allocating encoded_buf");
2047 g_free(*rsp);
2048 *rsp = NULL;
2049 return -1;
2052 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2053 if (!XBZRLE.current_buf) {
2054 error_report("Error allocating current_buf");
2055 g_free(XBZRLE.encoded_buf);
2056 XBZRLE.encoded_buf = NULL;
2057 g_free(*rsp);
2058 *rsp = NULL;
2059 return -1;
2063 /* For memory_global_dirty_log_start below. */
2064 qemu_mutex_lock_iothread();
2066 qemu_mutex_lock_ramlist();
2067 rcu_read_lock();
2068 ram_state_reset(*rsp);
2070 /* Skip setting bitmap if there is no RAM */
2071 if (ram_bytes_total()) {
2072 RAMBlock *block;
2074 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2075 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
2077 block->bmap = bitmap_new(pages);
2078 bitmap_set(block->bmap, 0, pages);
2079 if (migrate_postcopy_ram()) {
2080 block->unsentmap = bitmap_new(pages);
2081 bitmap_set(block->unsentmap, 0, pages);
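/*
 * Both bitmaps start fully set: every page is initially considered
 * dirty (and unsent), so the first pass over the RAM blocks transmits
 * all of guest memory at least once.
 */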
2087 * Count the total number of pages used by ram blocks not including any
2088 * gaps due to alignment or unplugs.
2090 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2092 memory_global_dirty_log_start();
2093 migration_bitmap_sync(*rsp);
2094 qemu_mutex_unlock_ramlist();
2095 qemu_mutex_unlock_iothread();
2096 rcu_read_unlock();
2098 return 0;
2102 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2103 * a long-running RCU critical section. When RCU reclaims in the code
2104 * start to become numerous it will be necessary to reduce the
2105 * granularity of these critical sections.
2109 * ram_save_setup: Setup RAM for migration
2111 * Returns zero to indicate success and negative for error
2113 * @f: QEMUFile where to send the data
2114 * @opaque: RAMState pointer
2116 static int ram_save_setup(QEMUFile *f, void *opaque)
2118 RAMState **rsp = opaque;
2119 RAMBlock *block;
2121 /* migration has already set up the bitmap, reuse it. */
2122 if (!migration_in_colo_state()) {
2123 if (ram_state_init(rsp) != 0) {
2124 return -1;
2127 (*rsp)->f = f;
2129 rcu_read_lock();
2131 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
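/*
 * ram_bytes_total() is expected to be target-page aligned, so the low
 * bits are free to carry RAM_SAVE_FLAG_MEM_SIZE; ram_load() masks the
 * flag off again before using the value as total_ram_bytes.
 */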
2133 RAMBLOCK_FOREACH(block) {
2134 qemu_put_byte(f, strlen(block->idstr));
2135 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2136 qemu_put_be64(f, block->used_length);
2137 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2138 qemu_put_be64(f, block->page_size);
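/*
 * The per-block page size only goes on the wire when postcopy is
 * possible and the block's page size differs from the host page size;
 * ram_load() reads it back under the same condition.
 */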
2142 rcu_read_unlock();
2143 compress_threads_save_setup();
2145 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2146 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2148 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2150 return 0;
2154 * ram_save_iterate: iterative stage for migration
2156 * Returns zero to indicate success and negative for error
2158 * @f: QEMUFile where to send the data
2159 * @opaque: RAMState pointer
2161 static int ram_save_iterate(QEMUFile *f, void *opaque)
2163 RAMState **temp = opaque;
2164 RAMState *rs = *temp;
2165 int ret;
2166 int i;
2167 int64_t t0;
2168 int done = 0;
2170 rcu_read_lock();
2171 if (ram_list.version != rs->last_version) {
2172 ram_state_reset(rs);
2175 /* Read version before ram_list.blocks */
2176 smp_rmb();
2178 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2180 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2181 i = 0;
2182 while ((ret = qemu_file_rate_limit(f)) == 0) {
2183 int pages;
2185 pages = ram_find_and_save_block(rs, false);
2186 /* no more pages to send */
2187 if (pages == 0) {
2188 done = 1;
2189 break;
2191 rs->iterations++;
2193 /* we want to check in the 1st loop, just in case it was the 1st time
2194 and we had to sync the dirty bitmap.
2195 qemu_clock_get_ns() is a bit expensive, so we only check it every
2196 few iterations
2198 if ((i & 63) == 0) {
2199 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2200 if (t1 > MAX_WAIT) {
2201 trace_ram_save_iterate_big_wait(t1, i);
2202 break;
2205 i++;
2207 flush_compressed_data(rs);
2208 rcu_read_unlock();
2211 * Must occur before EOS (or any QEMUFile operation)
2212 * because of the RDMA protocol.
2214 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2216 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2217 ram_counters.transferred += 8;
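/* account for the 8-byte EOS marker written just above */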
2219 ret = qemu_file_get_error(f);
2220 if (ret < 0) {
2221 return ret;
2224 return done;
2228 * ram_save_complete: function called to send the remaining amount of ram
2230 * Returns zero to indicate success
2232 * Called with the iothread lock held
2234 * @f: QEMUFile where to send the data
2235 * @opaque: RAMState pointer
2237 static int ram_save_complete(QEMUFile *f, void *opaque)
2239 RAMState **temp = opaque;
2240 RAMState *rs = *temp;
2242 rcu_read_lock();
2244 if (!migration_in_postcopy()) {
2245 migration_bitmap_sync(rs);
2248 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2250 /* try transferring iterative blocks of memory */
2252 /* flush all remaining blocks regardless of rate limiting */
2253 while (true) {
2254 int pages;
2256 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2258 /* no more blocks to send */
2258 if (pages == 0) {
2259 break;
2263 flush_compressed_data(rs);
2264 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2266 rcu_read_unlock();
2268 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2270 return 0;
2273 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2274 uint64_t *non_postcopiable_pending,
2275 uint64_t *postcopiable_pending)
2277 RAMState **temp = opaque;
2278 RAMState *rs = *temp;
2279 uint64_t remaining_size;
2281 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2283 if (!migration_in_postcopy() &&
2284 remaining_size < max_size) {
2285 qemu_mutex_lock_iothread();
2286 rcu_read_lock();
2287 migration_bitmap_sync(rs);
2288 rcu_read_unlock();
2289 qemu_mutex_unlock_iothread();
2290 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
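/*
 * Only when the stale estimate already looks small enough do we pay for
 * a fresh bitmap sync (under the iothread lock), so the decision to
 * move on to the completion stage is based on up-to-date numbers.
 */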
2293 if (migrate_postcopy_ram()) {
2294 /* We can do postcopy, and all the data is postcopiable */
2295 *postcopiable_pending += remaining_size;
2296 } else {
2297 *non_postcopiable_pending += remaining_size;
2301 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2303 unsigned int xh_len;
2304 int xh_flags;
2305 uint8_t *loaded_data;
2307 /* extract RLE header */
2308 xh_flags = qemu_get_byte(f);
2309 xh_len = qemu_get_be16(f);
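/*
 * Wire format of an XBZRLE page: a 1-byte flags field
 * (ENCODING_FLAG_XBZRLE), a 16-bit big-endian length, and then xh_len
 * bytes of encoded data.
 */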
2311 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2312 error_report("Failed to load XBZRLE page - wrong compression!");
2313 return -1;
2316 if (xh_len > TARGET_PAGE_SIZE) {
2317 error_report("Failed to load XBZRLE page - len overflow!");
2318 return -1;
2320 loaded_data = XBZRLE.decoded_buf;
2321 /* load data and decode */
2322 /* it can change loaded_data to point to an internal buffer */
2323 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2325 /* decode RLE */
2326 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2327 TARGET_PAGE_SIZE) == -1) {
2328 error_report("Failed to load XBZRLE page - decode error!");
2329 return -1;
2332 return 0;
2336 * ram_block_from_stream: read a RAMBlock id from the migration stream
2338 * Must be called from within a rcu critical section.
2340 * Returns a pointer from within the RCU-protected ram_list.
2342 * @f: QEMUFile where to read the data from
2343 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2345 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2347 static RAMBlock *block = NULL;
2348 char id[256];
2349 uint8_t len;
2351 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2352 if (!block) {
2353 error_report("Ack, bad migration stream!");
2354 return NULL;
2356 return block;
2359 len = qemu_get_byte(f);
2360 qemu_get_buffer(f, (uint8_t *)id, len);
2361 id[len] = 0;
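/* block ids arrive as a 1-byte length plus an unterminated name */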
2363 block = qemu_ram_block_by_name(id);
2364 if (!block) {
2365 error_report("Can't find block %s", id);
2366 return NULL;
2369 return block;
2372 static inline void *host_from_ram_block_offset(RAMBlock *block,
2373 ram_addr_t offset)
2375 if (!offset_in_ramblock(block, offset)) {
2376 return NULL;
2379 return block->host + offset;
2383 * ram_handle_compressed: handle the zero page case
2385 * If a page (or a whole RDMA chunk) has been
2386 * determined to be zero, then zap it.
2388 * @host: host address for the zero page
2389 * @ch: the byte the page is filled with; only zero is supported
2390 * @size: size of the zero page
2392 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2394 if (ch != 0 || !is_zero_range(host, size)) {
2395 memset(host, ch, size);
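/*
 * The is_zero_range() check above lets us skip the memset when the page
 * is already zero, so untouched destination pages are not needlessly
 * dirtied.
 */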
2399 static void *do_data_decompress(void *opaque)
2401 DecompressParam *param = opaque;
2402 unsigned long pagesize;
2403 uint8_t *des;
2404 int len;
2406 qemu_mutex_lock(&param->mutex);
2407 while (!param->quit) {
2408 if (param->des) {
2409 des = param->des;
2410 len = param->len;
2411 param->des = 0;
2412 qemu_mutex_unlock(&param->mutex);
2414 pagesize = TARGET_PAGE_SIZE;
2415 /* uncompress() may fail in some cases, especially when the
2416 * page was dirtied while it was being compressed; that's not
2417 * a problem because the dirty page will be retransmitted and
2418 * uncompress() won't corrupt the data in other pages.
2420 uncompress((Bytef *)des, &pagesize,
2421 (const Bytef *)param->compbuf, len);
2423 qemu_mutex_lock(&decomp_done_lock);
2424 param->done = true;
2425 qemu_cond_signal(&decomp_done_cond);
2426 qemu_mutex_unlock(&decomp_done_lock);
2428 qemu_mutex_lock(&param->mutex);
2429 } else {
2430 qemu_cond_wait(&param->cond, &param->mutex);
2433 qemu_mutex_unlock(&param->mutex);
2435 return NULL;
2438 static void wait_for_decompress_done(void)
2440 int idx, thread_count;
2442 if (!migrate_use_compression()) {
2443 return;
2446 thread_count = migrate_decompress_threads();
2447 qemu_mutex_lock(&decomp_done_lock);
2448 for (idx = 0; idx < thread_count; idx++) {
2449 while (!decomp_param[idx].done) {
2450 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2453 qemu_mutex_unlock(&decomp_done_lock);
2456 static void compress_threads_load_setup(void)
2458 int i, thread_count;
2460 if (!migrate_use_compression()) {
2461 return;
2463 thread_count = migrate_decompress_threads();
2464 decompress_threads = g_new0(QemuThread, thread_count);
2465 decomp_param = g_new0(DecompressParam, thread_count);
2466 qemu_mutex_init(&decomp_done_lock);
2467 qemu_cond_init(&decomp_done_cond);
2468 for (i = 0; i < thread_count; i++) {
2469 qemu_mutex_init(&decomp_param[i].mutex);
2470 qemu_cond_init(&decomp_param[i].cond);
2471 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
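/*
 * compbuf must be able to hold the largest possible compressed form of
 * a single target page, hence compressBound(TARGET_PAGE_SIZE).
 */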
2472 decomp_param[i].done = true;
2473 decomp_param[i].quit = false;
2474 qemu_thread_create(decompress_threads + i, "decompress",
2475 do_data_decompress, decomp_param + i,
2476 QEMU_THREAD_JOINABLE);
2480 static void compress_threads_load_cleanup(void)
2482 int i, thread_count;
2484 if (!migrate_use_compression()) {
2485 return;
2487 thread_count = migrate_decompress_threads();
2488 for (i = 0; i < thread_count; i++) {
2489 qemu_mutex_lock(&decomp_param[i].mutex);
2490 decomp_param[i].quit = true;
2491 qemu_cond_signal(&decomp_param[i].cond);
2492 qemu_mutex_unlock(&decomp_param[i].mutex);
2494 for (i = 0; i < thread_count; i++) {
2495 qemu_thread_join(decompress_threads + i);
2496 qemu_mutex_destroy(&decomp_param[i].mutex);
2497 qemu_cond_destroy(&decomp_param[i].cond);
2498 g_free(decomp_param[i].compbuf);
2500 g_free(decompress_threads);
2501 g_free(decomp_param);
2502 decompress_threads = NULL;
2503 decomp_param = NULL;
2506 static void decompress_data_with_multi_threads(QEMUFile *f,
2507 void *host, int len)
2509 int idx, thread_count;
2511 thread_count = migrate_decompress_threads();
2512 qemu_mutex_lock(&decomp_done_lock);
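/*
 * Hand the compressed page to the first idle decompression thread; if
 * none is idle, wait on decomp_done_cond until one finishes.
 */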
2513 while (true) {
2514 for (idx = 0; idx < thread_count; idx++) {
2515 if (decomp_param[idx].done) {
2516 decomp_param[idx].done = false;
2517 qemu_mutex_lock(&decomp_param[idx].mutex);
2518 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2519 decomp_param[idx].des = host;
2520 decomp_param[idx].len = len;
2521 qemu_cond_signal(&decomp_param[idx].cond);
2522 qemu_mutex_unlock(&decomp_param[idx].mutex);
2523 break;
2526 if (idx < thread_count) {
2527 break;
2528 } else {
2529 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2532 qemu_mutex_unlock(&decomp_done_lock);
2536 * ram_load_setup: Setup RAM for migration incoming side
2538 * Returns zero to indicate success and negative for error
2540 * @f: QEMUFile where to receive the data
2541 * @opaque: RAMState pointer
2543 static int ram_load_setup(QEMUFile *f, void *opaque)
2545 xbzrle_load_setup();
2546 compress_threads_load_setup();
2547 return 0;
2550 static int ram_load_cleanup(void *opaque)
2552 xbzrle_load_cleanup();
2553 compress_threads_load_cleanup();
2554 return 0;
2558 * ram_postcopy_incoming_init: allocate postcopy data structures
2560 * Returns 0 for success and negative on error
2562 * @mis: current migration incoming state
2564 * Allocate data structures etc needed by incoming migration with
2565 * postcopy-ram. postcopy-ram's similarly named
2566 * postcopy_ram_incoming_init does the work.
2568 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2570 unsigned long ram_pages = last_ram_page();
2572 return postcopy_ram_incoming_init(mis, ram_pages);
2576 * ram_load_postcopy: load a page in postcopy case
2578 * Returns 0 for success or -errno in case of error
2580 * Called in postcopy mode by ram_load().
2581 * rcu_read_lock is taken prior to this being called.
2583 * @f: QEMUFile to receive the data from
2585 static int ram_load_postcopy(QEMUFile *f)
2587 int flags = 0, ret = 0;
2588 bool place_needed = false;
2589 bool matching_page_sizes = false;
2590 MigrationIncomingState *mis = migration_incoming_get_current();
2591 /* Temporary page that is later 'placed' */
2592 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2593 void *last_host = NULL;
2594 bool all_zero = false;
2596 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2597 ram_addr_t addr;
2598 void *host = NULL;
2599 void *page_buffer = NULL;
2600 void *place_source = NULL;
2601 RAMBlock *block = NULL;
2602 uint8_t ch;
2604 addr = qemu_get_be64(f);
2605 flags = addr & ~TARGET_PAGE_MASK;
2606 addr &= TARGET_PAGE_MASK;
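/*
 * Each record starts with a 64-bit word: the page-aligned part is the
 * offset within the RAMBlock, and the low bits carry the
 * RAM_SAVE_FLAG_* bits for this page.
 */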
2608 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2609 place_needed = false;
2610 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2611 block = ram_block_from_stream(f, flags);
2613 host = host_from_ram_block_offset(block, addr);
2614 if (!host) {
2615 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2616 ret = -EINVAL;
2617 break;
2619 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2621 * Postcopy requires that we place whole host pages atomically;
2622 * these may be huge pages for RAMBlocks that are backed by
2623 * hugetlbfs.
2624 * To make it atomic, the data is read into a temporary page
2625 * that's moved into place later.
2626 * The migration protocol uses, possibly smaller, target pages;
2627 * however, the source ensures it always sends all the components
2628 * of a host page in order.
2630 page_buffer = postcopy_host_page +
2631 ((uintptr_t)host & (block->page_size - 1));
2632 /* If all target pages are zero then we can optimise the place */
2633 if (!((uintptr_t)host & (block->page_size - 1))) {
2634 all_zero = true;
2635 } else {
2636 /* not the 1st target page within the host page */
2637 if (host != (last_host + TARGET_PAGE_SIZE)) {
2638 error_report("Non-sequential target page %p/%p",
2639 host, last_host);
2640 ret = -EINVAL;
2641 break;
2647 * If it's the last part of a host page then we place the host
2648 * page
2650 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2651 (block->page_size - 1)) == 0;
2652 place_source = postcopy_host_page;
2654 last_host = host;
2656 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2657 case RAM_SAVE_FLAG_ZERO:
2658 ch = qemu_get_byte(f);
2659 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2660 if (ch) {
2661 all_zero = false;
2663 break;
2665 case RAM_SAVE_FLAG_PAGE:
2666 all_zero = false;
2667 if (!place_needed || !matching_page_sizes) {
2668 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2669 } else {
2670 /* Avoid copying out of the QEMUFile buffer here, since postcopy
2671 * will copy the data again when placing the page; this only works
2672 * when the whole page is read in one go (matching page sizes)
2674 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2675 TARGET_PAGE_SIZE);
2677 break;
2678 case RAM_SAVE_FLAG_EOS:
2679 /* normal exit */
2680 break;
2681 default:
2682 error_report("Unknown combination of migration flags: %#x"
2683 " (postcopy mode)", flags);
2684 ret = -EINVAL;
2687 if (place_needed) {
2688 /* This gets called at the last target page in the host page */
2689 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
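/*
 * host points at the last target page of the host page here, so step
 * back to the start of the host page before placing it.
 */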
2691 if (all_zero) {
2692 ret = postcopy_place_page_zero(mis, place_dest,
2693 block->page_size);
2694 } else {
2695 ret = postcopy_place_page(mis, place_dest,
2696 place_source, block->page_size);
2699 if (!ret) {
2700 ret = qemu_file_get_error(f);
2704 return ret;
2707 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2709 int flags = 0, ret = 0, invalid_flags = 0;
2710 static uint64_t seq_iter;
2711 int len = 0;
2713 * If system is running in postcopy mode, page inserts to host memory must
2714 * be atomic
2716 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2717 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
2718 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2720 seq_iter++;
2722 if (version_id != 4) {
2723 ret = -EINVAL;
2726 if (!migrate_use_compression()) {
2727 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2729 /* This RCU critical section can be very long running.
2730 * When RCU reclaims in the code start to become numerous,
2731 * it will be necessary to reduce the granularity of this
2732 * critical section.
2734 rcu_read_lock();
2736 if (postcopy_running) {
2737 ret = ram_load_postcopy(f);
2740 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2741 ram_addr_t addr, total_ram_bytes;
2742 void *host = NULL;
2743 uint8_t ch;
2745 addr = qemu_get_be64(f);
2746 flags = addr & ~TARGET_PAGE_MASK;
2747 addr &= TARGET_PAGE_MASK;
2749 if (flags & invalid_flags) {
2750 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2751 error_report("Received an unexpected compressed page");
2754 ret = -EINVAL;
2755 break;
2758 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2759 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2760 RAMBlock *block = ram_block_from_stream(f, flags);
2762 host = host_from_ram_block_offset(block, addr);
2763 if (!host) {
2764 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2765 ret = -EINVAL;
2766 break;
2768 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2771 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2772 case RAM_SAVE_FLAG_MEM_SIZE:
2773 /* Synchronize RAM block list */
2774 total_ram_bytes = addr;
2775 while (!ret && total_ram_bytes) {
2776 RAMBlock *block;
2777 char id[256];
2778 ram_addr_t length;
2780 len = qemu_get_byte(f);
2781 qemu_get_buffer(f, (uint8_t *)id, len);
2782 id[len] = 0;
2783 length = qemu_get_be64(f);
2785 block = qemu_ram_block_by_name(id);
2786 if (block) {
2787 if (length != block->used_length) {
2788 Error *local_err = NULL;
2790 ret = qemu_ram_resize(block, length,
2791 &local_err);
2792 if (local_err) {
2793 error_report_err(local_err);
2796 /* For postcopy we need to check hugepage sizes match */
2797 if (postcopy_advised &&
2798 block->page_size != qemu_host_page_size) {
2799 uint64_t remote_page_size = qemu_get_be64(f);
2800 if (remote_page_size != block->page_size) {
2801 error_report("Mismatched RAM page size %s "
2802 "(local) %zd != %" PRId64,
2803 id, block->page_size,
2804 remote_page_size);
2805 ret = -EINVAL;
2808 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2809 block->idstr);
2810 } else {
2811 error_report("Unknown ramblock \"%s\", cannot "
2812 "accept migration", id);
2813 ret = -EINVAL;
2816 total_ram_bytes -= length;
2818 break;
2820 case RAM_SAVE_FLAG_ZERO:
2821 ch = qemu_get_byte(f);
2822 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2823 break;
2825 case RAM_SAVE_FLAG_PAGE:
2826 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2827 break;
2829 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2830 len = qemu_get_be32(f);
2831 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2832 error_report("Invalid compressed data length: %d", len);
2833 ret = -EINVAL;
2834 break;
2836 decompress_data_with_multi_threads(f, host, len);
2837 break;
2839 case RAM_SAVE_FLAG_XBZRLE:
2840 if (load_xbzrle(f, addr, host) < 0) {
2841 error_report("Failed to decompress XBZRLE page at "
2842 RAM_ADDR_FMT, addr);
2843 ret = -EINVAL;
2844 break;
2846 break;
2847 case RAM_SAVE_FLAG_EOS:
2848 /* normal exit */
2849 break;
2850 default:
2851 if (flags & RAM_SAVE_FLAG_HOOK) {
2852 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2853 } else {
2854 error_report("Unknown combination of migration flags: %#x",
2855 flags);
2856 ret = -EINVAL;
2859 if (!ret) {
2860 ret = qemu_file_get_error(f);
2864 wait_for_decompress_done();
2865 rcu_read_unlock();
2866 trace_ram_load_complete(ret, seq_iter);
2867 return ret;
2870 static bool ram_has_postcopy(void *opaque)
2872 return migrate_postcopy_ram();
2875 static SaveVMHandlers savevm_ram_handlers = {
2876 .save_setup = ram_save_setup,
2877 .save_live_iterate = ram_save_iterate,
2878 .save_live_complete_postcopy = ram_save_complete,
2879 .save_live_complete_precopy = ram_save_complete,
2880 .has_postcopy = ram_has_postcopy,
2881 .save_live_pending = ram_save_pending,
2882 .load_state = ram_load,
2883 .save_cleanup = ram_save_cleanup,
2884 .load_setup = ram_load_setup,
2885 .load_cleanup = ram_load_cleanup,
2888 void ram_mig_init(void)
2890 qemu_mutex_init(&XBZRLE.lock);
2891 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
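/* the section version (4) must match the version_id check in ram_load() */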