migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28 #include "qemu/osdep.h"
  29 #include "cpu.h"
  30 #include <zlib.h>
  31 #include "qapi-event.h"
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "migration/register.h"
  40 #include "migration/misc.h"
  41 #include "qemu-file.h"
  42 #include "postcopy-ram.h"
  43 #include "migration/page_cache.h"
  44 #include "qemu/error-report.h"
  45 #include "qapi/qmp/qerror.h"
  46 #include "trace.h"
  47 #include "exec/ram_addr.h"
  48 #include "exec/target_page.h"
  49 #include "qemu/rcu_queue.h"
  50 #include "migration/colo.h"
  51 #include "migration/block.h"
  52
  53 /***********************************************************/
  54 /* ram save/restore */
  55
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */

/*
 * Flag bits that callers OR into the offset word handed to
 * save_page_header(), which writes it to the stream as a 64-bit value.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
/* save_zero_page(): the header is followed by a single 0 byte */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
/* Same block as the previous header: the block id string is omitted */
#define RAM_SAVE_FLAG_CONTINUE 0x20
/* save_xbzrle_page(): the header is followed by XBZRLE-encoded data */
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  71
  72 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  73 {
  74     return buffer_is_zero(p, size);
  75 }
  76
/* XBZRLE statistics (pages, bytes, cache_miss, overflow, ...) */
XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding (output of xbzrle_encode_buffer()) */
    uint8_t *encoded_buf;
    /* buffer for storing page content: the page is copied here before
     * it is encoded */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* taken/released by XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros; inserted into the cache by
     * xbzrle_cache_zero_page() */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
  94
  95 static void XBZRLE_cache_lock(void)
  96 {
  97     if (migrate_use_xbzrle())
  98         qemu_mutex_lock(&XBZRLE.lock);
  99 }
 100
 101 static void XBZRLE_cache_unlock(void)
 102 {
 103     if (migrate_use_xbzrle())
 104         qemu_mutex_unlock(&XBZRLE.lock);
 105 }
 106
 107 /**
 108  * xbzrle_cache_resize: resize the xbzrle cache
 109  *
 110  * This function is called from qmp_migrate_set_cache_size in main
 111  * thread, possibly while a migration is in progress.  A running
 112  * migration may be using the cache and might finish during this call,
 113  * hence changes to the cache are protected by XBZRLE.lock().
 114  *
 115  * Returns 0 for success or -1 for error
 116  *
 117  * @new_size: new cache size
 118  * @errp: set *errp if the check failed, with reason
 119  */
 120 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 121 {
 122     PageCache *new_cache;
 123     int64_t ret = 0;
 124
 125     /* Check for truncation */
 126     if (new_size != (size_t)new_size) {
 127         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 128                    "exceeding address space");
 129         return -1;
 130     }
 131
 132     if (new_size == migrate_xbzrle_cache_size()) {
 133         /* nothing to do */
 134         return 0;
 135     }
 136
 137     XBZRLE_cache_lock();
 138
 139     if (XBZRLE.cache != NULL) {
 140         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 141         if (!new_cache) {
 142             ret = -1;
 143             goto out;
 144         }
 145
 146         cache_fini(XBZRLE.cache);
 147         XBZRLE.cache = new_cache;
 148     }
 149 out:
 150     XBZRLE_cache_unlock();
 151     return ret;
 152 }
 153
 154 static void ramblock_recv_map_init(void)
 155 {
 156     RAMBlock *rb;
 157
 158     RAMBLOCK_FOREACH(rb) {
 159         assert(!rb->receivedmap);
 160         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 161     }
 162 }
 163
 164 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 165 {
 166     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 167                     rb->receivedmap);
 168 }
 169
 170 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 171 {
 172     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 173 }
 174
 175 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 176                                     size_t nr)
 177 {
 178     bitmap_set_atomic(rb->receivedmap,
 179                       ramblock_recv_bitmap_offset(host_addr, rb),
 180                       nr);
 181 }
 182
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably the start of the requested range within
     * rb and its length - confirm against the code that queues requests */
    hwaddr    offset;
    hwaddr    len;

    /* linkage in RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 194
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data; save_page_header() uses it
     * to decide whether the block id string must be written again */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times the dirty rate was found to be too high; reset to 0
     * each time migration_bitmap_sync() starts or increases throttling */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at the start of the current period */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since the start of the current period */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses at the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* The RAM migration state; may be NULL (see ram_bytes_remaining()) */
static RAMState *ram_state;
 237
 238 uint64_t ram_bytes_remaining(void)
 239 {
 240     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 241                        0;
 242 }
 243
/* RAM migration statistics (transferred, duplicate, normal,
 * dirty_sync_count, dirty_pages_rate, ...) */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from (a page index, not a byte offset:
     * ram_save_page() shifts it by TARGET_PAGE_BITS) */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 256
/* Per-thread state of a compression worker (see do_data_compress()) */
struct CompressParam {
    /* set by the worker, under comp_done_lock, once a page has been
     * compressed; initialised to true at setup */
    bool done;
    /* set under mutex to ask the worker to exit its loop */
    bool quit;
    /* dummy QEMUFile the worker compresses into */
    QEMUFile *file;
    /* mutex/cond guard and signal changes to quit, block and offset */
    QemuMutex mutex;
    QemuCond cond;
    /* page to compress; the worker resets block to NULL when it takes
     * the request */
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;
 267
/*
 * Per-thread state of a decompression worker.
 * NOTE(review): the users of this struct are outside this part of the
 * file; the field notes below are inferred from the parallel
 * CompressParam layout and should be confirmed.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination of the decompressed data - confirm */
    void *des;
    /* presumably the compressed input and its length - confirm */
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;
 278
/* Arrays with one entry per compression thread, allocated by
 * compress_threads_save_setup() and freed by
 * compress_threads_save_cleanup() */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression counterparts of the objects above */
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Defined later in the file; called by do_data_compress() */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
 297
 298 static void *do_data_compress(void *opaque)
 299 {
 300     CompressParam *param = opaque;
 301     RAMBlock *block;
 302     ram_addr_t offset;
 303
 304     qemu_mutex_lock(&param->mutex);
 305     while (!param->quit) {
 306         if (param->block) {
 307             block = param->block;
 308             offset = param->offset;
 309             param->block = NULL;
 310             qemu_mutex_unlock(&param->mutex);
 311
 312             do_compress_ram_page(param->file, block, offset);
 313
 314             qemu_mutex_lock(&comp_done_lock);
 315             param->done = true;
 316             qemu_cond_signal(&comp_done_cond);
 317             qemu_mutex_unlock(&comp_done_lock);
 318
 319             qemu_mutex_lock(&param->mutex);
 320         } else {
 321             qemu_cond_wait(&param->cond, &param->mutex);
 322         }
 323     }
 324     qemu_mutex_unlock(&param->mutex);
 325
 326     return NULL;
 327 }
 328
 329 static inline void terminate_compression_threads(void)
 330 {
 331     int idx, thread_count;
 332
 333     thread_count = migrate_compress_threads();
 334
 335     for (idx = 0; idx < thread_count; idx++) {
 336         qemu_mutex_lock(&comp_param[idx].mutex);
 337         comp_param[idx].quit = true;
 338         qemu_cond_signal(&comp_param[idx].cond);
 339         qemu_mutex_unlock(&comp_param[idx].mutex);
 340     }
 341 }
 342
 343 static void compress_threads_save_cleanup(void)
 344 {
 345     int i, thread_count;
 346
 347     if (!migrate_use_compression()) {
 348         return;
 349     }
 350     terminate_compression_threads();
 351     thread_count = migrate_compress_threads();
 352     for (i = 0; i < thread_count; i++) {
 353         qemu_thread_join(compress_threads + i);
 354         qemu_fclose(comp_param[i].file);
 355         qemu_mutex_destroy(&comp_param[i].mutex);
 356         qemu_cond_destroy(&comp_param[i].cond);
 357     }
 358     qemu_mutex_destroy(&comp_done_lock);
 359     qemu_cond_destroy(&comp_done_cond);
 360     g_free(compress_threads);
 361     g_free(comp_param);
 362     compress_threads = NULL;
 363     comp_param = NULL;
 364 }
 365
 366 static void compress_threads_save_setup(void)
 367 {
 368     int i, thread_count;
 369
 370     if (!migrate_use_compression()) {
 371         return;
 372     }
 373     thread_count = migrate_compress_threads();
 374     compress_threads = g_new0(QemuThread, thread_count);
 375     comp_param = g_new0(CompressParam, thread_count);
 376     qemu_cond_init(&comp_done_cond);
 377     qemu_mutex_init(&comp_done_lock);
 378     for (i = 0; i < thread_count; i++) {
 379         /* comp_param[i].file is just used as a dummy buffer to save data,
 380          * set its ops to empty.
 381          */
 382         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 383         comp_param[i].done = true;
 384         comp_param[i].quit = false;
 385         qemu_mutex_init(&comp_param[i].mutex);
 386         qemu_cond_init(&comp_param[i].cond);
 387         qemu_thread_create(compress_threads + i, "compress",
 388                            do_data_compress, comp_param + i,
 389                            QEMU_THREAD_JOINABLE);
 390     }
 391 }
 392
/* Multiple fd's */

/* State of one multifd send thread */
struct MultiFDSendParams {
    /* index of this entry, assigned in multifd_save_setup() */
    uint8_t id;
    /* thread name ("multifdsend_<id>"); heap allocated, freed at cleanup */
    char *name;
    QemuThread thread;
    /* the thread sleeps on sem; it is posted to wake the thread up */
    QemuSemaphore sem;
    /* protects quit */
    QemuMutex mutex;
    /* set to make the thread leave its loop */
    bool quit;
};
typedef struct MultiFDSendParams MultiFDSendParams;
 404
/* Allocated by multifd_save_setup(), released and reset to NULL by
 * multifd_save_cleanup() */
struct {
    /* array with one entry per send thread */
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
} *multifd_send_state;
 410
 411 static void terminate_multifd_send_threads(Error *errp)
 412 {
 413     int i;
 414
 415     for (i = 0; i < multifd_send_state->count; i++) {
 416         MultiFDSendParams *p = &multifd_send_state->params[i];
 417
 418         qemu_mutex_lock(&p->mutex);
 419         p->quit = true;
 420         qemu_sem_post(&p->sem);
 421         qemu_mutex_unlock(&p->mutex);
 422     }
 423 }
 424
 425 int multifd_save_cleanup(Error **errp)
 426 {
 427     int i;
 428     int ret = 0;
 429
 430     if (!migrate_use_multifd()) {
 431         return 0;
 432     }
 433     terminate_multifd_send_threads(NULL);
 434     for (i = 0; i < multifd_send_state->count; i++) {
 435         MultiFDSendParams *p = &multifd_send_state->params[i];
 436
 437         qemu_thread_join(&p->thread);
 438         qemu_mutex_destroy(&p->mutex);
 439         qemu_sem_destroy(&p->sem);
 440         g_free(p->name);
 441         p->name = NULL;
 442     }
 443     g_free(multifd_send_state->params);
 444     multifd_send_state->params = NULL;
 445     g_free(multifd_send_state);
 446     multifd_send_state = NULL;
 447     return ret;
 448 }
 449
 450 static void *multifd_send_thread(void *opaque)
 451 {
 452     MultiFDSendParams *p = opaque;
 453
 454     while (true) {
 455         qemu_mutex_lock(&p->mutex);
 456         if (p->quit) {
 457             qemu_mutex_unlock(&p->mutex);
 458             break;
 459         }
 460         qemu_mutex_unlock(&p->mutex);
 461         qemu_sem_wait(&p->sem);
 462     }
 463
 464     return NULL;
 465 }
 466
 467 int multifd_save_setup(void)
 468 {
 469     int thread_count;
 470     uint8_t i;
 471
 472     if (!migrate_use_multifd()) {
 473         return 0;
 474     }
 475     thread_count = migrate_multifd_channels();
 476     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
 477     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
 478     multifd_send_state->count = 0;
 479     for (i = 0; i < thread_count; i++) {
 480         MultiFDSendParams *p = &multifd_send_state->params[i];
 481
 482         qemu_mutex_init(&p->mutex);
 483         qemu_sem_init(&p->sem, 0);
 484         p->quit = false;
 485         p->id = i;
 486         p->name = g_strdup_printf("multifdsend_%d", i);
 487         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
 488                            QEMU_THREAD_JOINABLE);
 489
 490         multifd_send_state->count++;
 491     }
 492     return 0;
 493 }
 494
/* State of one multifd receive thread */
struct MultiFDRecvParams {
    /* index of this entry, assigned in multifd_load_setup() */
    uint8_t id;
    /* thread name ("multifdrecv_<id>"); heap allocated, freed at cleanup */
    char *name;
    QemuThread thread;
    /* the thread sleeps on sem; it is posted to wake the thread up */
    QemuSemaphore sem;
    /* protects quit */
    QemuMutex mutex;
    /* set to make the thread leave its loop */
    bool quit;
};
typedef struct MultiFDRecvParams MultiFDRecvParams;
 504
/* Allocated by multifd_load_setup(), released and reset to NULL by
 * multifd_load_cleanup() */
struct {
    /* array with one entry per receive thread */
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
} *multifd_recv_state;
 510
 511 static void terminate_multifd_recv_threads(Error *errp)
 512 {
 513     int i;
 514
 515     for (i = 0; i < multifd_recv_state->count; i++) {
 516         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 517
 518         qemu_mutex_lock(&p->mutex);
 519         p->quit = true;
 520         qemu_sem_post(&p->sem);
 521         qemu_mutex_unlock(&p->mutex);
 522     }
 523 }
 524
 525 int multifd_load_cleanup(Error **errp)
 526 {
 527     int i;
 528     int ret = 0;
 529
 530     if (!migrate_use_multifd()) {
 531         return 0;
 532     }
 533     terminate_multifd_recv_threads(NULL);
 534     for (i = 0; i < multifd_recv_state->count; i++) {
 535         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 536
 537         qemu_thread_join(&p->thread);
 538         qemu_mutex_destroy(&p->mutex);
 539         qemu_sem_destroy(&p->sem);
 540         g_free(p->name);
 541         p->name = NULL;
 542     }
 543     g_free(multifd_recv_state->params);
 544     multifd_recv_state->params = NULL;
 545     g_free(multifd_recv_state);
 546     multifd_recv_state = NULL;
 547
 548     return ret;
 549 }
 550
 551 static void *multifd_recv_thread(void *opaque)
 552 {
 553     MultiFDRecvParams *p = opaque;
 554
 555     while (true) {
 556         qemu_mutex_lock(&p->mutex);
 557         if (p->quit) {
 558             qemu_mutex_unlock(&p->mutex);
 559             break;
 560         }
 561         qemu_mutex_unlock(&p->mutex);
 562         qemu_sem_wait(&p->sem);
 563     }
 564
 565     return NULL;
 566 }
 567
 568 int multifd_load_setup(void)
 569 {
 570     int thread_count;
 571     uint8_t i;
 572
 573     if (!migrate_use_multifd()) {
 574         return 0;
 575     }
 576     thread_count = migrate_multifd_channels();
 577     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
 578     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
 579     multifd_recv_state->count = 0;
 580     for (i = 0; i < thread_count; i++) {
 581         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 582
 583         qemu_mutex_init(&p->mutex);
 584         qemu_sem_init(&p->sem, 0);
 585         p->quit = false;
 586         p->id = i;
 587         p->name = g_strdup_printf("multifdrecv_%d", i);
 588         qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
 589                            QEMU_THREAD_JOINABLE);
 590         multifd_recv_state->count++;
 591     }
 592     return 0;
 593 }
 594
 595 /**
 596  * save_page_header: write page header to wire
 597  *
 598  * If this is the 1st block, it also writes the block identification
 599  *
 600  * Returns the number of bytes written
 601  *
 602  * @f: QEMUFile where to send the data
 603  * @block: block that contains the page we want to send
 604  * @offset: offset inside the block for the page
 605  *          in the lower bits, it contains flags
 606  */
 607 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 608                                ram_addr_t offset)
 609 {
 610     size_t size, len;
 611
 612     if (block == rs->last_sent_block) {
 613         offset |= RAM_SAVE_FLAG_CONTINUE;
 614     }
 615     qemu_put_be64(f, offset);
 616     size = 8;
 617
 618     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 619         len = strlen(block->idstr);
 620         qemu_put_byte(f, len);
 621         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 622         size += 1 + len;
 623         rs->last_sent_block = block;
 624     }
 625     return size;
 626 }
 627
 628 /**
 629  * mig_throttle_guest_down: throotle down the guest
 630  *
 631  * Reduce amount of guest cpu execution to hopefully slow down memory
 632  * writes. If guest dirty memory rate is reduced below the rate at
 633  * which we can transfer pages to the destination then we should be
 634  * able to complete migration. Some workloads dirty memory way too
 635  * fast and will not effectively converge, even with auto-converge.
 636  */
 637 static void mig_throttle_guest_down(void)
 638 {
 639     MigrationState *s = migrate_get_current();
 640     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 641     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 642
 643     /* We have not started throttling yet. Let's start it. */
 644     if (!cpu_throttle_active()) {
 645         cpu_throttle_set(pct_initial);
 646     } else {
 647         /* Throttling already on, just increase the rate */
 648         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 649     }
 650 }
 651
 652 /**
 653  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 654  *
 655  * @rs: current RAM state
 656  * @current_addr: address for the zero page
 657  *
 658  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 659  * The important thing is that a stale (not-yet-0'd) page be replaced
 660  * by the new data.
 661  * As a bonus, if the page wasn't in the cache it gets added so that
 662  * when a small write is made into the 0'd page it gets XBZRLE sent.
 663  */
 664 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 665 {
 666     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 667         return;
 668     }
 669
 670     /* We don't care if this fails to allocate a new cache page
 671      * as long as it updated an old one */
 672     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 673                  ram_counters.dirty_sync_count);
 674 }
 675
 676 #define ENCODING_FLAG_XBZRLE 0x1
 677
 678 /**
 679  * save_xbzrle_page: compress and send current page
 680  *
 681  * Returns: 1 means that we wrote the page
 682  *          0 means that page is identical to the one already sent
 683  *          -1 means that xbzrle would be longer than normal
 684  *
 685  * @rs: current RAM state
 686  * @current_data: pointer to the address of the page contents
 687  * @current_addr: addr of the page
 688  * @block: block that contains the page we want to send
 689  * @offset: offset inside the block for the page
 690  * @last_stage: if we are at the completion stage
 691  */
 692 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 693                             ram_addr_t current_addr, RAMBlock *block,
 694                             ram_addr_t offset, bool last_stage)
 695 {
 696     int encoded_len = 0, bytes_xbzrle;
 697     uint8_t *prev_cached_page;
 698
 699     if (!cache_is_cached(XBZRLE.cache, current_addr,
 700                          ram_counters.dirty_sync_count)) {
 701         xbzrle_counters.cache_miss++;
 702         if (!last_stage) {
 703             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 704                              ram_counters.dirty_sync_count) == -1) {
 705                 return -1;
 706             } else {
 707                 /* update *current_data when the page has been
 708                    inserted into cache */
 709                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 710             }
 711         }
 712         return -1;
 713     }
 714
 715     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 716
 717     /* save current buffer into memory */
 718     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 719
 720     /* XBZRLE encoding (if there is no overflow) */
 721     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 722                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 723                                        TARGET_PAGE_SIZE);
 724     if (encoded_len == 0) {
 725         trace_save_xbzrle_page_skipping();
 726         return 0;
 727     } else if (encoded_len == -1) {
 728         trace_save_xbzrle_page_overflow();
 729         xbzrle_counters.overflow++;
 730         /* update data in the cache */
 731         if (!last_stage) {
 732             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 733             *current_data = prev_cached_page;
 734         }
 735         return -1;
 736     }
 737
 738     /* we need to update the data in the cache, in order to get the same data */
 739     if (!last_stage) {
 740         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 741     }
 742
 743     /* Send XBZRLE based compressed page */
 744     bytes_xbzrle = save_page_header(rs, rs->f, block,
 745                                     offset | RAM_SAVE_FLAG_XBZRLE);
 746     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 747     qemu_put_be16(rs->f, encoded_len);
 748     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 749     bytes_xbzrle += encoded_len + 1 + 2;
 750     xbzrle_counters.pages++;
 751     xbzrle_counters.bytes += bytes_xbzrle;
 752     ram_counters.transferred += bytes_xbzrle;
 753
 754     return 1;
 755 }
 756
 757 /**
 758  * migration_bitmap_find_dirty: find the next dirty page from start
 759  *
 760  * Called with rcu_read_lock() to protect migration_bitmap
 761  *
 762  * Returns the byte offset within memory region of the start of a dirty page
 763  *
 764  * @rs: current RAM state
 765  * @rb: RAMBlock where to search for dirty pages
 766  * @start: page where we start the search
 767  */
 768 static inline
 769 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 770                                           unsigned long start)
 771 {
 772     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 773     unsigned long *bitmap = rb->bmap;
 774     unsigned long next;
 775
 776     if (rs->ram_bulk_stage && start > 0) {
 777         next = start + 1;
 778     } else {
 779         next = find_next_bit(bitmap, size, start);
 780     }
 781
 782     return next;
 783 }
 784
 785 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 786                                                 RAMBlock *rb,
 787                                                 unsigned long page)
 788 {
 789     bool ret;
 790
 791     ret = test_and_clear_bit(page, rb->bmap);
 792
 793     if (ret) {
 794         rs->migration_dirty_pages--;
 795     }
 796     return ret;
 797 }
 798
 799 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 800                                         ram_addr_t start, ram_addr_t length)
 801 {
 802     rs->migration_dirty_pages +=
 803         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 804                                               &rs->num_dirty_pages_period);
 805 }
 806
 807 /**
 808  * ram_pagesize_summary: calculate all the pagesizes of a VM
 809  *
 810  * Returns a summary bitmap of the page sizes of all RAMBlocks
 811  *
 812  * For VMs with just normal pages this is equivalent to the host page
 813  * size. If it's got some huge pages then it's the OR of all the
 814  * different page sizes.
 815  */
 816 uint64_t ram_pagesize_summary(void)
 817 {
 818     RAMBlock *block;
 819     uint64_t summary = 0;
 820
 821     RAMBLOCK_FOREACH(block) {
 822         summary |= block->page_size;
 823     }
 824
 825     return summary;
 826 }
 827
/*
 * migration_bitmap_sync: pull the dirty log into the migration bitmaps
 *
 * Bumps the sync counter, syncs the dirty bitmap of every RAMBlock and,
 * once more than a second has passed since the period started, updates
 * the dirty page rate, drives auto-converge throttling, refreshes the
 * XBZRLE cache miss rate and resets the period counters.
 *
 * @rs: current RAM state
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    /* first call: start the measurement period now */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    /* bitmap_mutex protects modification of the bitmaps */
    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        migration_bitmap_sync_range(rs, block, 0, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        /* calculate period counters (pages per second; the divisor is
         * larger than 1000 here, so never zero) */
        ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
            / (end_time - rs->time_last_bitmap_sync);
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the bytes dirtied in this period exceed half
               of the bytes that got transferred since the last time the
               period counters were reset. If that happens twice, start or
               increase throttling.  Note that the counter is only
               incremented when the first half of the condition holds. */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        if (migrate_use_xbzrle()) {
            /* cache misses per iteration over the period; skipped when no
             * iteration happened, to avoid dividing by zero */
            if (rs->iterations_prev != rs->iterations) {
                xbzrle_counters.cache_miss_rate =
                   (double)(xbzrle_counters.cache_miss -
                            rs->xbzrle_cache_miss_prev) /
                   (rs->iterations - rs->iterations_prev);
            }
            rs->iterations_prev = rs->iterations;
            rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        }

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
    }
}
 901
 902 /**
 903  * save_zero_page: send the zero page to the stream
 904  *
 905  * Returns the number of pages written.
 906  *
 907  * @rs: current RAM state
 908  * @block: block that contains the page we want to send
 909  * @offset: offset inside the block for the page
 910  */
 911 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
 912 {
 913     uint8_t *p = block->host + offset;
 914     int pages = -1;
 915
 916     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 917         ram_counters.duplicate++;
 918         ram_counters.transferred +=
 919             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
 920         qemu_put_byte(rs->f, 0);
 921         ram_counters.transferred += 1;
 922         pages = 1;
 923     }
 924
 925     return pages;
 926 }
 927
 928 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 929 {
 930     if (!migrate_release_ram() || !migration_in_postcopy()) {
 931         return;
 932     }
 933
 934     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 935 }
 936
 937 /**
 938  * ram_save_page: send the given page to the stream
 939  *
 940  * Returns the number of pages written.
 941  *          < 0 - error
 942  *          >=0 - Number of pages written - this might legally be 0
 943  *                if xbzrle noticed the page was the same.
 944  *
 945  * @rs: current RAM state
 946  * @block: block that contains the page we want to send
 947  * @offset: offset inside the block for the page
 948  * @last_stage: if we are at the completion stage
 949  */
 950 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
 951 {
 952     int pages = -1;
 953     uint64_t bytes_xmit;
 954     ram_addr_t current_addr;
 955     uint8_t *p;
 956     int ret;
 957     bool send_async = true;
 958     RAMBlock *block = pss->block;
 959     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 960
 961     p = block->host + offset;
 962     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 963
 964     /* In doubt sent page as normal */
 965     bytes_xmit = 0;
 966     ret = ram_control_save_page(rs->f, block->offset,
 967                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
 968     if (bytes_xmit) {
 969         ram_counters.transferred += bytes_xmit;
 970         pages = 1;
 971     }
 972
 973     XBZRLE_cache_lock();
 974
 975     current_addr = block->offset + offset;
 976
 977     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 978         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 979             if (bytes_xmit > 0) {
 980                 ram_counters.normal++;
 981             } else if (bytes_xmit == 0) {
 982                 ram_counters.duplicate++;
 983             }
 984         }
 985     } else {
 986         pages = save_zero_page(rs, block, offset);
 987         if (pages > 0) {
 988             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 989              * page would be stale
 990              */
 991             xbzrle_cache_zero_page(rs, current_addr);
 992             ram_release_pages(block->idstr, offset, pages);
 993         } else if (!rs->ram_bulk_stage &&
 994                    !migration_in_postcopy() && migrate_use_xbzrle()) {
 995             pages = save_xbzrle_page(rs, &p, current_addr, block,
 996                                      offset, last_stage);
 997             if (!last_stage) {
 998                 /* Can't send this cached data async, since the cache page
 999                  * might get updated before it gets to the wire
1000                  */
1001                 send_async = false;
1002             }
1003         }
1004     }
1005
1006     /* XBZRLE overflow or normal page */
1007     if (pages == -1) {
1008         ram_counters.transferred +=
1009             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
1010         if (send_async) {
1011             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
1012                                   migrate_release_ram() &
1013                                   migration_in_postcopy());
1014         } else {
1015             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
1016         }
1017         ram_counters.transferred += TARGET_PAGE_SIZE;
1018         pages = 1;
1019         ram_counters.normal++;
1020     }
1021
1022     XBZRLE_cache_unlock();
1023
1024     return pages;
1025 }
1026
1027 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
1028                                 ram_addr_t offset)
1029 {
1030     RAMState *rs = ram_state;
1031     int bytes_sent, blen;
1032     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1033
1034     bytes_sent = save_page_header(rs, f, block, offset |
1035                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
1036     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1037                                      migrate_compress_level());
1038     if (blen < 0) {
1039         bytes_sent = 0;
1040         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1041         error_report("compressed data failed!");
1042     } else {
1043         bytes_sent += blen;
1044         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1045     }
1046
1047     return bytes_sent;
1048 }
1049
1050 static void flush_compressed_data(RAMState *rs)
1051 {
1052     int idx, len, thread_count;
1053
1054     if (!migrate_use_compression()) {
1055         return;
1056     }
1057     thread_count = migrate_compress_threads();
1058
1059     qemu_mutex_lock(&comp_done_lock);
1060     for (idx = 0; idx < thread_count; idx++) {
1061         while (!comp_param[idx].done) {
1062             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1063         }
1064     }
1065     qemu_mutex_unlock(&comp_done_lock);
1066
1067     for (idx = 0; idx < thread_count; idx++) {
1068         qemu_mutex_lock(&comp_param[idx].mutex);
1069         if (!comp_param[idx].quit) {
1070             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1071             ram_counters.transferred += len;
1072         }
1073         qemu_mutex_unlock(&comp_param[idx].mutex);
1074     }
1075 }
1076
1077 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1078                                        ram_addr_t offset)
1079 {
1080     param->block = block;
1081     param->offset = offset;
1082 }
1083
/*
 * Hand the page (@block, @offset) to a compression thread slot.
 *
 * With comp_done_lock held, the comp_param[] slots are scanned for one that
 * is marked done.  That slot's buffered output is first moved into rs->f,
 * then the slot is given the new page and its condition variable is
 * signalled.  If no slot is done, the function waits on comp_done_cond and
 * scans again.
 *
 * Returns 1 (the loop only exits once a slot has accepted the page).
 */
static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                /* Claim the slot before handing it new work */
                comp_param[idx].done = false;
                /* Flush what the slot produced for its previous page */
                bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
                /* Publish the new page and signal the slot, under its mutex */
                qemu_mutex_lock(&comp_param[idx].mutex);
                set_compress_params(&comp_param[idx], block, offset);
                qemu_cond_signal(&comp_param[idx].cond);
                qemu_mutex_unlock(&comp_param[idx].mutex);
                pages = 1;
                ram_counters.normal++;
                /* Accounts for the bytes flushed above, not the new page */
                ram_counters.transferred += bytes_xmit;
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
            /* No slot was done: wait on comp_done_cond, then rescan */
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
1116
/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * The page is first offered to ram_control_save_page().  Only when that
 * reports RAM_SAVE_CONTROL_NOT_SUPP is the page handled here: zero pages
 * are sent via save_zero_page(); other pages are compressed either inline
 * (first page sent from a new block) or by a compression thread.
 *
 * Returns the number of pages written, or -1 if nothing was written
 * (for example when inline compression failed).
 *
 * @rs: current RAM state
 * @pss: search status; pss->block and pss->page identify the page to send
 * @last_stage: if we are at the completion stage (not read by this function)
 */
static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
                                    bool last_stage)
{
    int pages = -1;
    uint64_t bytes_xmit = 0;
    uint8_t *p;
    int ret, blen;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;

    p = block->host + offset;

    ret = ram_control_save_page(rs->f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        pages = 1;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        /* The control hook handled the page; only the counters are updated */
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                ram_counters.normal++;
            } else if (bytes_xmit == 0) {
                ram_counters.duplicate++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out, keeping this order is important, because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != rs->last_sent_block) {
            /* Drain the compression threads before switching block */
            flush_compressed_data(rs);
            pages = save_zero_page(rs, block, offset);
            if (pages == -1) {
                /* Make sure the first page is sent out before other pages */
                bytes_xmit = save_page_header(rs, rs->f, block, offset |
                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
                blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
                                                 migrate_compress_level());
                if (blen > 0) {
                    ram_counters.transferred += bytes_xmit + blen;
                    ram_counters.normal++;
                    pages = 1;
                } else {
                    /* pages stays -1, so the caller sees a failure */
                    qemu_file_set_error(rs->f, blen);
                    error_report("compressed data failed!");
                }
            }
            if (pages > 0) {
                ram_release_pages(block->idstr, offset, pages);
            }
        } else {
            /* Same block as the last sent page: use a compression thread */
            pages = save_zero_page(rs, block, offset);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(rs, block, offset);
            } else {
                ram_release_pages(block->idstr, offset, pages);
            }
        }
    }

    return pages;
}
1193
1194 /**
1195  * find_dirty_block: find the next dirty page and update any state
1196  * associated with the search process.
1197  *
1198  * Returns if a page is found
1199  *
1200  * @rs: current RAM state
1201  * @pss: data about the state of the current dirty page scan
1202  * @again: set to false if the search has scanned the whole of RAM
1203  */
1204 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1205 {
1206     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1207     if (pss->complete_round && pss->block == rs->last_seen_block &&
1208         pss->page >= rs->last_page) {
1209         /*
1210          * We've been once around the RAM and haven't found anything.
1211          * Give up.
1212          */
1213         *again = false;
1214         return false;
1215     }
1216     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1217         /* Didn't find anything in this RAM Block */
1218         pss->page = 0;
1219         pss->block = QLIST_NEXT_RCU(pss->block, next);
1220         if (!pss->block) {
1221             /* Hit the end of the list */
1222             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1223             /* Flag that we've looped */
1224             pss->complete_round = true;
1225             rs->ram_bulk_stage = false;
1226             if (migrate_use_xbzrle()) {
1227                 /* If xbzrle is on, stop using the data compression at this
1228                  * point. In theory, xbzrle can do better than compression.
1229                  */
1230                 flush_compressed_data(rs);
1231             }
1232         }
1233         /* Didn't find anything this time, but try again on the new block */
1234         *again = true;
1235         return false;
1236     } else {
1237         /* Can go around again, but... */
1238         *again = true;
1239         /* We've found something so probably don't need to */
1240         return true;
1241     }
1242 }
1243
1244 /**
1245  * unqueue_page: gets a page of the queue
1246  *
1247  * Helper for 'get_queued_page' - gets a page off the queue
1248  *
1249  * Returns the block of the page (or NULL if none available)
1250  *
1251  * @rs: current RAM state
1252  * @offset: used to return the offset within the RAMBlock
1253  */
1254 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1255 {
1256     RAMBlock *block = NULL;
1257
1258     qemu_mutex_lock(&rs->src_page_req_mutex);
1259     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1260         struct RAMSrcPageRequest *entry =
1261                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1262         block = entry->rb;
1263         *offset = entry->offset;
1264
1265         if (entry->len > TARGET_PAGE_SIZE) {
1266             entry->len -= TARGET_PAGE_SIZE;
1267             entry->offset += TARGET_PAGE_SIZE;
1268         } else {
1269             memory_region_unref(block->mr);
1270             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1271             g_free(entry);
1272         }
1273     }
1274     qemu_mutex_unlock(&rs->src_page_req_mutex);
1275
1276     return block;
1277 }
1278
1279 /**
1280  * get_queued_page: unqueue a page from the postocpy requests
1281  *
1282  * Skips pages that are already sent (!dirty)
1283  *
1284  * Returns if a queued page is found
1285  *
1286  * @rs: current RAM state
1287  * @pss: data about the state of the current dirty page scan
1288  */
1289 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1290 {
1291     RAMBlock  *block;
1292     ram_addr_t offset;
1293     bool dirty;
1294
1295     do {
1296         block = unqueue_page(rs, &offset);
1297         /*
1298          * We're sending this page, and since it's postcopy nothing else
1299          * will dirty it, and we must make sure it doesn't get sent again
1300          * even if this queue request was received after the background
1301          * search already sent it.
1302          */
1303         if (block) {
1304             unsigned long page;
1305
1306             page = offset >> TARGET_PAGE_BITS;
1307             dirty = test_bit(page, block->bmap);
1308             if (!dirty) {
1309                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1310                        page, test_bit(page, block->unsentmap));
1311             } else {
1312                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1313             }
1314         }
1315
1316     } while (block && !dirty);
1317
1318     if (block) {
1319         /*
1320          * As soon as we start servicing pages out of order, then we have
1321          * to kill the bulk stage, since the bulk stage assumes
1322          * in (migration_bitmap_find_and_reset_dirty) that every page is
1323          * dirty, that's no longer true.
1324          */
1325         rs->ram_bulk_stage = false;
1326
1327         /*
1328          * We want the background search to continue from the queued page
1329          * since the guest is likely to want other pages near to the page
1330          * it just requested.
1331          */
1332         pss->block = block;
1333         pss->page = offset >> TARGET_PAGE_BITS;
1334     }
1335
1336     return !!block;
1337 }
1338
1339 /**
1340  * migration_page_queue_free: drop any remaining pages in the ram
1341  * request queue
1342  *
1343  * It should be empty at the end anyway, but in error cases there may
1344  * be some left.  in case that there is any page left, we drop it.
1345  *
1346  */
1347 static void migration_page_queue_free(RAMState *rs)
1348 {
1349     struct RAMSrcPageRequest *mspr, *next_mspr;
1350     /* This queue generally should be empty - but in the case of a failed
1351      * migration might have some droppings in.
1352      */
1353     rcu_read_lock();
1354     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1355         memory_region_unref(mspr->rb->mr);
1356         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1357         g_free(mspr);
1358     }
1359     rcu_read_unlock();
1360 }
1361
1362 /**
1363  * ram_save_queue_pages: queue the page for transmission
1364  *
1365  * A request from postcopy destination for example.
1366  *
1367  * Returns zero on success or negative on error
1368  *
1369  * @rbname: Name of the RAMBLock of the request. NULL means the
1370  *          same that last one.
1371  * @start: starting address from the start of the RAMBlock
1372  * @len: length (in bytes) to send
1373  */
1374 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1375 {
1376     RAMBlock *ramblock;
1377     RAMState *rs = ram_state;
1378
1379     ram_counters.postcopy_requests++;
1380     rcu_read_lock();
1381     if (!rbname) {
1382         /* Reuse last RAMBlock */
1383         ramblock = rs->last_req_rb;
1384
1385         if (!ramblock) {
1386             /*
1387              * Shouldn't happen, we can't reuse the last RAMBlock if
1388              * it's the 1st request.
1389              */
1390             error_report("ram_save_queue_pages no previous block");
1391             goto err;
1392         }
1393     } else {
1394         ramblock = qemu_ram_block_by_name(rbname);
1395
1396         if (!ramblock) {
1397             /* We shouldn't be asked for a non-existent RAMBlock */
1398             error_report("ram_save_queue_pages no block '%s'", rbname);
1399             goto err;
1400         }
1401         rs->last_req_rb = ramblock;
1402     }
1403     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1404     if (start+len > ramblock->used_length) {
1405         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1406                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1407                      __func__, start, len, ramblock->used_length);
1408         goto err;
1409     }
1410
1411     struct RAMSrcPageRequest *new_entry =
1412         g_malloc0(sizeof(struct RAMSrcPageRequest));
1413     new_entry->rb = ramblock;
1414     new_entry->offset = start;
1415     new_entry->len = len;
1416
1417     memory_region_ref(ramblock->mr);
1418     qemu_mutex_lock(&rs->src_page_req_mutex);
1419     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1420     qemu_mutex_unlock(&rs->src_page_req_mutex);
1421     rcu_read_unlock();
1422
1423     return 0;
1424
1425 err:
1426     rcu_read_unlock();
1427     return -1;
1428 }
1429
1430 /**
1431  * ram_save_target_page: save one target page
1432  *
1433  * Returns the number of pages written
1434  *
1435  * @rs: current RAM state
1436  * @ms: current migration state
1437  * @pss: data about the page we want to send
1438  * @last_stage: if we are at the completion stage
1439  */
1440 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1441                                 bool last_stage)
1442 {
1443     int res = 0;
1444
1445     /* Check the pages is dirty and if it is send it */
1446     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1447         /*
1448          * If xbzrle is on, stop using the data compression after first
1449          * round of migration even if compression is enabled. In theory,
1450          * xbzrle can do better than compression.
1451          */
1452         if (migrate_use_compression() &&
1453             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1454             res = ram_save_compressed_page(rs, pss, last_stage);
1455         } else {
1456             res = ram_save_page(rs, pss, last_stage);
1457         }
1458
1459         if (res < 0) {
1460             return res;
1461         }
1462         if (pss->block->unsentmap) {
1463             clear_bit(pss->page, pss->block->unsentmap);
1464         }
1465     }
1466
1467     return res;
1468 }
1469
1470 /**
1471  * ram_save_host_page: save a whole host page
1472  *
1473  * Starting at *offset send pages up to the end of the current host
1474  * page. It's valid for the initial offset to point into the middle of
1475  * a host page in which case the remainder of the hostpage is sent.
1476  * Only dirty target pages are sent. Note that the host page size may
1477  * be a huge page for this block.
1478  * The saving stops at the boundary of the used_length of the block
1479  * if the RAMBlock isn't a multiple of the host page size.
1480  *
1481  * Returns the number of pages written or negative on error
1482  *
1483  * @rs: current RAM state
1484  * @ms: current migration state
1485  * @pss: data about the page we want to send
1486  * @last_stage: if we are at the completion stage
1487  */
1488 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1489                               bool last_stage)
1490 {
1491     int tmppages, pages = 0;
1492     size_t pagesize_bits =
1493         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1494
1495     do {
1496         tmppages = ram_save_target_page(rs, pss, last_stage);
1497         if (tmppages < 0) {
1498             return tmppages;
1499         }
1500
1501         pages += tmppages;
1502         pss->page++;
1503     } while ((pss->page & (pagesize_bits - 1)) &&
1504              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1505
1506     /* The offset we leave with is the last one we looked at */
1507     pss->page--;
1508     return pages;
1509 }
1510
1511 /**
1512  * ram_find_and_save_block: finds a dirty page and sends it to f
1513  *
1514  * Called within an RCU critical section.
1515  *
1516  * Returns the number of pages written where zero means no dirty pages
1517  *
1518  * @rs: current RAM state
1519  * @last_stage: if we are at the completion stage
1520  *
1521  * On systems where host-page-size > target-page-size it will send all the
1522  * pages in a host page that are dirty.
1523  */
1524
1525 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1526 {
1527     PageSearchStatus pss;
1528     int pages = 0;
1529     bool again, found;
1530
1531     /* No dirty page as there is zero RAM */
1532     if (!ram_bytes_total()) {
1533         return pages;
1534     }
1535
1536     pss.block = rs->last_seen_block;
1537     pss.page = rs->last_page;
1538     pss.complete_round = false;
1539
1540     if (!pss.block) {
1541         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1542     }
1543
1544     do {
1545         again = true;
1546         found = get_queued_page(rs, &pss);
1547
1548         if (!found) {
1549             /* priority queue empty, so just search for something dirty */
1550             found = find_dirty_block(rs, &pss, &again);
1551         }
1552
1553         if (found) {
1554             pages = ram_save_host_page(rs, &pss, last_stage);
1555         }
1556     } while (!pages && again);
1557
1558     rs->last_seen_block = pss.block;
1559     rs->last_page = pss.page;
1560
1561     return pages;
1562 }
1563
1564 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1565 {
1566     uint64_t pages = size / TARGET_PAGE_SIZE;
1567
1568     if (zero) {
1569         ram_counters.duplicate += pages;
1570     } else {
1571         ram_counters.normal += pages;
1572         ram_counters.transferred += size;
1573         qemu_update_position(f, size);
1574     }
1575 }
1576
1577 uint64_t ram_bytes_total(void)
1578 {
1579     RAMBlock *block;
1580     uint64_t total = 0;
1581
1582     rcu_read_lock();
1583     RAMBLOCK_FOREACH(block) {
1584         total += block->used_length;
1585     }
1586     rcu_read_unlock();
1587     return total;
1588 }
1589
1590 static void xbzrle_load_setup(void)
1591 {
1592     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1593 }
1594
1595 static void xbzrle_load_cleanup(void)
1596 {
1597     g_free(XBZRLE.decoded_buf);
1598     XBZRLE.decoded_buf = NULL;
1599 }
1600
1601 static void ram_state_cleanup(RAMState **rsp)
1602 {
1603     migration_page_queue_free(*rsp);
1604     qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1605     qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1606     g_free(*rsp);
1607     *rsp = NULL;
1608 }
1609
1610 static void xbzrle_cleanup(void)
1611 {
1612     XBZRLE_cache_lock();
1613     if (XBZRLE.cache) {
1614         cache_fini(XBZRLE.cache);
1615         g_free(XBZRLE.encoded_buf);
1616         g_free(XBZRLE.current_buf);
1617         g_free(XBZRLE.zero_target_page);
1618         XBZRLE.cache = NULL;
1619         XBZRLE.encoded_buf = NULL;
1620         XBZRLE.current_buf = NULL;
1621         XBZRLE.zero_target_page = NULL;
1622     }
1623     XBZRLE_cache_unlock();
1624 }
1625
1626 static void ram_save_cleanup(void *opaque)
1627 {
1628     RAMState **rsp = opaque;
1629     RAMBlock *block;
1630
1631     /* caller have hold iothread lock or is in a bh, so there is
1632      * no writing race against this migration_bitmap
1633      */
1634     memory_global_dirty_log_stop();
1635
1636     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1637         g_free(block->bmap);
1638         block->bmap = NULL;
1639         g_free(block->unsentmap);
1640         block->unsentmap = NULL;
1641     }
1642
1643     xbzrle_cleanup();
1644     compress_threads_save_cleanup();
1645     ram_state_cleanup(rsp);
1646 }
1647
1648 static void ram_state_reset(RAMState *rs)
1649 {
1650     rs->last_seen_block = NULL;
1651     rs->last_sent_block = NULL;
1652     rs->last_page = 0;
1653     rs->last_version = ram_list.version;
1654     rs->ram_bulk_stage = true;
1655 }
1656
1657 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1658
/*
 * Dump the first 'pages' bits of the bitmap 'todump' to stderr, up to 128
 * bits per line, printing '1' for a set bit and '.' for a clear bit.  Each
 * line is prefixed with the index of its first bit.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * 'todump' must point to a bitmap holding at least 'pages' bits.  A NULL
 * 'todump' is ignored and nothing is printed.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    unsigned long cur;
    unsigned long linelen = 128;
    char linebuf[129];

    if (!todump) {
        /* Nothing to dump; avoid dereferencing a NULL bitmap */
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        unsigned long curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (linelen > pages - cur) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* curb == linelen <= 128 here, so the terminator fits */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
1692
1693 /* **** functions for postcopy ***** */
1694
1695 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1696 {
1697     struct RAMBlock *block;
1698
1699     RAMBLOCK_FOREACH(block) {
1700         unsigned long *bitmap = block->bmap;
1701         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1702         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1703
1704         while (run_start < range) {
1705             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1706             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1707                               (run_end - run_start) << TARGET_PAGE_BITS);
1708             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1709         }
1710     }
1711 }
1712
1713 /**
1714  * postcopy_send_discard_bm_ram: discard a RAMBlock
1715  *
1716  * Returns zero on success
1717  *
1718  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1719  * Note: At this point the 'unsentmap' is the processed bitmap combined
1720  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1721  *
1722  * @ms: current migration state
1723  * @pds: state for postcopy
1724  * @start: RAMBlock starting page
1725  * @length: RAMBlock size
1726  */
1727 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1728                                         PostcopyDiscardState *pds,
1729                                         RAMBlock *block)
1730 {
1731     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1732     unsigned long current;
1733     unsigned long *unsentmap = block->unsentmap;
1734
1735     for (current = 0; current < end; ) {
1736         unsigned long one = find_next_bit(unsentmap, end, current);
1737
1738         if (one <= end) {
1739             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1740             unsigned long discard_length;
1741
1742             if (zero >= end) {
1743                 discard_length = end - one;
1744             } else {
1745                 discard_length = zero - one;
1746             }
1747             if (discard_length) {
1748                 postcopy_discard_send_range(ms, pds, one, discard_length);
1749             }
1750             current = one + discard_length;
1751         } else {
1752             current = one;
1753         }
1754     }
1755
1756     return 0;
1757 }
1758
1759 /**
1760  * postcopy_each_ram_send_discard: discard all RAMBlocks
1761  *
1762  * Returns 0 for success or negative for error
1763  *
1764  * Utility for the outgoing postcopy code.
1765  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1766  *   passing it bitmap indexes and name.
1767  * (qemu_ram_foreach_block ends up passing unscaled lengths
1768  *  which would mean postcopy code would have to deal with target page)
1769  *
1770  * @ms: current migration state
1771  */
1772 static int postcopy_each_ram_send_discard(MigrationState *ms)
1773 {
1774     struct RAMBlock *block;
1775     int ret;
1776
1777     RAMBLOCK_FOREACH(block) {
1778         PostcopyDiscardState *pds =
1779             postcopy_discard_send_init(ms, block->idstr);
1780
1781         /*
1782          * Postcopy sends chunks of bitmap over the wire, but it
1783          * just needs indexes at this point, avoids it having
1784          * target page specific code.
1785          */
1786         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1787         postcopy_discard_send_finish(ms, pds);
1788         if (ret) {
1789             return ret;
1790         }
1791     }
1792
1793     return 0;
1794 }
1795
1796 /**
1797  * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
1798  *
1799  * Helper for postcopy_chunk_hostpages; it's called twice to
1800  * canonicalize the two bitmaps, that are similar, but one is
1801  * inverted.
1802  *
1803  * Postcopy requires that all target pages in a hostpage are dirty or
1804  * clean, not a mix.  This function canonicalizes the bitmaps.
1805  *
1806  * @ms: current migration state
1807  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1808  *               otherwise we need to canonicalize partially dirty host pages
1809  * @block: block that contains the page we want to canonicalize
1810  * @pds: state for postcopy
1811  */
1812 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1813                                           RAMBlock *block,
1814                                           PostcopyDiscardState *pds)
1815 {
1816     RAMState *rs = ram_state;
1817     unsigned long *bitmap = block->bmap;
1818     unsigned long *unsentmap = block->unsentmap;
1819     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1820     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1821     unsigned long run_start;
1822
1823     if (block->page_size == TARGET_PAGE_SIZE) {
1824         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1825         return;
1826     }
1827
1828     if (unsent_pass) {
1829         /* Find a sent page */
1830         run_start = find_next_zero_bit(unsentmap, pages, 0);
1831     } else {
1832         /* Find a dirty page */
1833         run_start = find_next_bit(bitmap, pages, 0);
1834     }
1835
1836     while (run_start < pages) {
1837         bool do_fixup = false;
1838         unsigned long fixup_start_addr;
1839         unsigned long host_offset;
1840
1841         /*
1842          * If the start of this run of pages is in the middle of a host
1843          * page, then we need to fixup this host page.
1844          */
1845         host_offset = run_start % host_ratio;
1846         if (host_offset) {
1847             do_fixup = true;
1848             run_start -= host_offset;
1849             fixup_start_addr = run_start;
1850             /* For the next pass */
1851             run_start = run_start + host_ratio;
1852         } else {
1853             /* Find the end of this run */
1854             unsigned long run_end;
1855             if (unsent_pass) {
1856                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1857             } else {
1858                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1859             }
1860             /*
1861              * If the end isn't at the start of a host page, then the
1862              * run doesn't finish at the end of a host page
1863              * and we need to discard.
1864              */
1865             host_offset = run_end % host_ratio;
1866             if (host_offset) {
1867                 do_fixup = true;
1868                 fixup_start_addr = run_end - host_offset;
1869                 /*
1870                  * This host page has gone, the next loop iteration starts
1871                  * from after the fixup
1872                  */
1873                 run_start = fixup_start_addr + host_ratio;
1874             } else {
1875                 /*
1876                  * No discards on this iteration, next loop starts from
1877                  * next sent/dirty page
1878                  */
1879                 run_start = run_end + 1;
1880             }
1881         }
1882
1883         if (do_fixup) {
1884             unsigned long page;
1885
1886             /* Tell the destination to discard this page */
1887             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1888                 /* For the unsent_pass we:
1889                  *     discard partially sent pages
1890                  * For the !unsent_pass (dirty) we:
1891                  *     discard partially dirty pages that were sent
1892                  *     (any partially sent pages were already discarded
1893                  *     by the previous unsent_pass)
1894                  */
1895                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1896                                             host_ratio);
1897             }
1898
1899             /* Clean up the bitmap */
1900             for (page = fixup_start_addr;
1901                  page < fixup_start_addr + host_ratio; page++) {
1902                 /* All pages in this host page are now not sent */
1903                 set_bit(page, unsentmap);
1904
1905                 /*
1906                  * Remark them as dirty, updating the count for any pages
1907                  * that weren't previously dirty.
1908                  */
1909                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1910             }
1911         }
1912
1913         if (unsent_pass) {
1914             /* Find the next sent page for the next iteration */
1915             run_start = find_next_zero_bit(unsentmap, pages, run_start);
1916         } else {
1917             /* Find the next dirty page for the next iteration */
1918             run_start = find_next_bit(bitmap, pages, run_start);
1919         }
1920     }
1921 }
1922
1923 /**
 * postcopy_chunk_hostpages: discard any partially sent host page
1925  *
1926  * Utility for the outgoing postcopy code.
1927  *
1928  * Discard any partially sent host-page size chunks, mark any partially
1929  * dirty host-page size chunks as all dirty.  In this case the host-page
1930  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1931  *
1932  * Returns zero on success
1933  *
1934  * @ms: current migration state
1935  * @block: block we want to work with
1936  */
1937 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1938 {
1939     PostcopyDiscardState *pds =
1940         postcopy_discard_send_init(ms, block->idstr);
1941
1942     /* First pass: Discard all partially sent host pages */
1943     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1944     /*
1945      * Second pass: Ensure that all partially dirty host pages are made
1946      * fully dirty.
1947      */
1948     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1949
1950     postcopy_discard_send_finish(ms, pds);
1951     return 0;
1952 }
1953
1954 /**
1955  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1956  *
1957  * Returns zero on success
1958  *
1959  * Transmit the set of pages to be discarded after precopy to the target
1960  * these are pages that:
1961  *     a) Have been previously transmitted but are now dirty again
1962  *     b) Pages that have never been transmitted, this ensures that
1963  *        any pages on the destination that have been mapped by background
1964  *        tasks get discarded (transparent huge pages is the specific concern)
1965  * Hopefully this is pretty sparse
1966  *
1967  * @ms: current migration state
1968  */
1969 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1970 {
1971     RAMState *rs = ram_state;
1972     RAMBlock *block;
1973     int ret;
1974
1975     rcu_read_lock();
1976
1977     /* This should be our last sync, the src is now paused */
1978     migration_bitmap_sync(rs);
1979
1980     /* Easiest way to make sure we don't resume in the middle of a host-page */
1981     rs->last_seen_block = NULL;
1982     rs->last_sent_block = NULL;
1983     rs->last_page = 0;
1984
1985     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1986         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1987         unsigned long *bitmap = block->bmap;
1988         unsigned long *unsentmap = block->unsentmap;
1989
1990         if (!unsentmap) {
1991             /* We don't have a safe way to resize the sentmap, so
1992              * if the bitmap was resized it will be NULL at this
1993              * point.
1994              */
1995             error_report("migration ram resized during precopy phase");
1996             rcu_read_unlock();
1997             return -EINVAL;
1998         }
1999         /* Deal with TPS != HPS and huge pages */
2000         ret = postcopy_chunk_hostpages(ms, block);
2001         if (ret) {
2002             rcu_read_unlock();
2003             return ret;
2004         }
2005
2006         /*
2007          * Update the unsentmap to be unsentmap = unsentmap | dirty
2008          */
2009         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2010 #ifdef DEBUG_POSTCOPY
2011         ram_debug_dump_bitmap(unsentmap, true, pages);
2012 #endif
2013     }
2014     trace_ram_postcopy_send_discard_bitmap();
2015
2016     ret = postcopy_each_ram_send_discard(ms);
2017     rcu_read_unlock();
2018
2019     return ret;
2020 }
2021
2022 /**
2023  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2024  *
2025  * Returns zero on success
2026  *
2027  * @rbname: name of the RAMBlock of the request. NULL means the
2028  *          same that last one.
 * @start: byte offset within the RAMBlock where the discard starts
 * @length: number of bytes to discard
2031  */
2032 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2033 {
2034     int ret = -1;
2035
2036     trace_ram_discard_range(rbname, start, length);
2037
2038     rcu_read_lock();
2039     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2040
2041     if (!rb) {
2042         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2043         goto err;
2044     }
2045
2046     bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2047                  length >> qemu_target_page_bits());
2048     ret = ram_block_discard_range(rb, start, length);
2049
2050 err:
2051     rcu_read_unlock();
2052
2053     return ret;
2054 }
2055
2056 /*
2057  * For every allocation, we will try not to crash the VM if the
2058  * allocation failed.
2059  */
2060 static int xbzrle_init(void)
2061 {
2062     Error *local_err = NULL;
2063
2064     if (!migrate_use_xbzrle()) {
2065         return 0;
2066     }
2067
2068     XBZRLE_cache_lock();
2069
2070     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2071     if (!XBZRLE.zero_target_page) {
2072         error_report("%s: Error allocating zero page", __func__);
2073         goto err_out;
2074     }
2075
2076     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2077                               TARGET_PAGE_SIZE, &local_err);
2078     if (!XBZRLE.cache) {
2079         error_report_err(local_err);
2080         goto free_zero_page;
2081     }
2082
2083     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2084     if (!XBZRLE.encoded_buf) {
2085         error_report("%s: Error allocating encoded_buf", __func__);
2086         goto free_cache;
2087     }
2088
2089     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2090     if (!XBZRLE.current_buf) {
2091         error_report("%s: Error allocating current_buf", __func__);
2092         goto free_encoded_buf;
2093     }
2094
2095     /* We are all good */
2096     XBZRLE_cache_unlock();
2097     return 0;
2098
2099 free_encoded_buf:
2100     g_free(XBZRLE.encoded_buf);
2101     XBZRLE.encoded_buf = NULL;
2102 free_cache:
2103     cache_fini(XBZRLE.cache);
2104     XBZRLE.cache = NULL;
2105 free_zero_page:
2106     g_free(XBZRLE.zero_target_page);
2107     XBZRLE.zero_target_page = NULL;
2108 err_out:
2109     XBZRLE_cache_unlock();
2110     return -ENOMEM;
2111 }
2112
2113 static int ram_state_init(RAMState **rsp)
2114 {
2115     *rsp = g_try_new0(RAMState, 1);
2116
2117     if (!*rsp) {
2118         error_report("%s: Init ramstate fail", __func__);
2119         return -1;
2120     }
2121
2122     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2123     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2124     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2125
2126     /*
2127      * Count the total number of pages used by ram blocks not including any
2128      * gaps due to alignment or unplugs.
2129      */
2130     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2131
2132     ram_state_reset(*rsp);
2133
2134     return 0;
2135 }
2136
2137 static void ram_list_init_bitmaps(void)
2138 {
2139     RAMBlock *block;
2140     unsigned long pages;
2141
2142     /* Skip setting bitmap if there is no RAM */
2143     if (ram_bytes_total()) {
2144         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2145             pages = block->max_length >> TARGET_PAGE_BITS;
2146             block->bmap = bitmap_new(pages);
2147             bitmap_set(block->bmap, 0, pages);
2148             if (migrate_postcopy_ram()) {
2149                 block->unsentmap = bitmap_new(pages);
2150                 bitmap_set(block->unsentmap, 0, pages);
2151             }
2152         }
2153     }
2154 }
2155
2156 static void ram_init_bitmaps(RAMState *rs)
2157 {
2158     /* For memory_global_dirty_log_start below.  */
2159     qemu_mutex_lock_iothread();
2160     qemu_mutex_lock_ramlist();
2161     rcu_read_lock();
2162
2163     ram_list_init_bitmaps();
2164     memory_global_dirty_log_start();
2165     migration_bitmap_sync(rs);
2166
2167     rcu_read_unlock();
2168     qemu_mutex_unlock_ramlist();
2169     qemu_mutex_unlock_iothread();
2170 }
2171
2172 static int ram_init_all(RAMState **rsp)
2173 {
2174     if (ram_state_init(rsp)) {
2175         return -1;
2176     }
2177
2178     if (xbzrle_init()) {
2179         ram_state_cleanup(rsp);
2180         return -1;
2181     }
2182
2183     ram_init_bitmaps(*rsp);
2184
2185     return 0;
2186 }
2187
2188 /*
2189  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2190  * long-running RCU critical section.  When rcu-reclaims in the code
2191  * start to become numerous it will be necessary to reduce the
2192  * granularity of these critical sections.
2193  */
2194
2195 /**
2196  * ram_save_setup: Setup RAM for migration
2197  *
2198  * Returns zero to indicate success and negative for error
2199  *
2200  * @f: QEMUFile where to send the data
2201  * @opaque: RAMState pointer
2202  */
2203 static int ram_save_setup(QEMUFile *f, void *opaque)
2204 {
2205     RAMState **rsp = opaque;
2206     RAMBlock *block;
2207
2208     /* migration has already setup the bitmap, reuse it. */
2209     if (!migration_in_colo_state()) {
2210         if (ram_init_all(rsp) != 0) {
2211             return -1;
2212         }
2213     }
2214     (*rsp)->f = f;
2215
2216     rcu_read_lock();
2217
2218     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2219
2220     RAMBLOCK_FOREACH(block) {
2221         qemu_put_byte(f, strlen(block->idstr));
2222         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2223         qemu_put_be64(f, block->used_length);
2224         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2225             qemu_put_be64(f, block->page_size);
2226         }
2227     }
2228
2229     rcu_read_unlock();
2230     compress_threads_save_setup();
2231
2232     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2233     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2234
2235     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2236
2237     return 0;
2238 }
2239
2240 /**
2241  * ram_save_iterate: iterative stage for migration
2242  *
2243  * Returns zero to indicate success and negative for error
2244  *
2245  * @f: QEMUFile where to send the data
2246  * @opaque: RAMState pointer
2247  */
2248 static int ram_save_iterate(QEMUFile *f, void *opaque)
2249 {
2250     RAMState **temp = opaque;
2251     RAMState *rs = *temp;
2252     int ret;
2253     int i;
2254     int64_t t0;
2255     int done = 0;
2256
2257     rcu_read_lock();
2258     if (ram_list.version != rs->last_version) {
2259         ram_state_reset(rs);
2260     }
2261
2262     /* Read version before ram_list.blocks */
2263     smp_rmb();
2264
2265     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2266
2267     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2268     i = 0;
2269     while ((ret = qemu_file_rate_limit(f)) == 0) {
2270         int pages;
2271
2272         pages = ram_find_and_save_block(rs, false);
2273         /* no more pages to sent */
2274         if (pages == 0) {
2275             done = 1;
2276             break;
2277         }
2278         rs->iterations++;
2279
2280         /* we want to check in the 1st loop, just in case it was the 1st time
2281            and we had to sync the dirty bitmap.
2282            qemu_get_clock_ns() is a bit expensive, so we only check each some
2283            iterations
2284         */
2285         if ((i & 63) == 0) {
2286             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2287             if (t1 > MAX_WAIT) {
2288                 trace_ram_save_iterate_big_wait(t1, i);
2289                 break;
2290             }
2291         }
2292         i++;
2293     }
2294     flush_compressed_data(rs);
2295     rcu_read_unlock();
2296
2297     /*
2298      * Must occur before EOS (or any QEMUFile operation)
2299      * because of RDMA protocol.
2300      */
2301     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2302
2303     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2304     ram_counters.transferred += 8;
2305
2306     ret = qemu_file_get_error(f);
2307     if (ret < 0) {
2308         return ret;
2309     }
2310
2311     return done;
2312 }
2313
2314 /**
2315  * ram_save_complete: function called to send the remaining amount of ram
2316  *
2317  * Returns zero to indicate success
2318  *
2319  * Called with iothread lock
2320  *
2321  * @f: QEMUFile where to send the data
2322  * @opaque: RAMState pointer
2323  */
2324 static int ram_save_complete(QEMUFile *f, void *opaque)
2325 {
2326     RAMState **temp = opaque;
2327     RAMState *rs = *temp;
2328
2329     rcu_read_lock();
2330
2331     if (!migration_in_postcopy()) {
2332         migration_bitmap_sync(rs);
2333     }
2334
2335     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2336
2337     /* try transferring iterative blocks of memory */
2338
2339     /* flush all remaining blocks regardless of rate limiting */
2340     while (true) {
2341         int pages;
2342
2343         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2344         /* no more blocks to sent */
2345         if (pages == 0) {
2346             break;
2347         }
2348     }
2349
2350     flush_compressed_data(rs);
2351     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2352
2353     rcu_read_unlock();
2354
2355     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2356
2357     return 0;
2358 }
2359
2360 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2361                              uint64_t *non_postcopiable_pending,
2362                              uint64_t *postcopiable_pending)
2363 {
2364     RAMState **temp = opaque;
2365     RAMState *rs = *temp;
2366     uint64_t remaining_size;
2367
2368     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2369
2370     if (!migration_in_postcopy() &&
2371         remaining_size < max_size) {
2372         qemu_mutex_lock_iothread();
2373         rcu_read_lock();
2374         migration_bitmap_sync(rs);
2375         rcu_read_unlock();
2376         qemu_mutex_unlock_iothread();
2377         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2378     }
2379
2380     if (migrate_postcopy_ram()) {
2381         /* We can do postcopy, and all the data is postcopiable */
2382         *postcopiable_pending += remaining_size;
2383     } else {
2384         *non_postcopiable_pending += remaining_size;
2385     }
2386 }
2387
2388 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2389 {
2390     unsigned int xh_len;
2391     int xh_flags;
2392     uint8_t *loaded_data;
2393
2394     /* extract RLE header */
2395     xh_flags = qemu_get_byte(f);
2396     xh_len = qemu_get_be16(f);
2397
2398     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2399         error_report("Failed to load XBZRLE page - wrong compression!");
2400         return -1;
2401     }
2402
2403     if (xh_len > TARGET_PAGE_SIZE) {
2404         error_report("Failed to load XBZRLE page - len overflow!");
2405         return -1;
2406     }
2407     loaded_data = XBZRLE.decoded_buf;
2408     /* load data and decode */
2409     /* it can change loaded_data to point to an internal buffer */
2410     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2411
2412     /* decode RLE */
2413     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2414                              TARGET_PAGE_SIZE) == -1) {
2415         error_report("Failed to load XBZRLE page - decode error!");
2416         return -1;
2417     }
2418
2419     return 0;
2420 }
2421
2422 /**
2423  * ram_block_from_stream: read a RAMBlock id from the migration stream
2424  *
2425  * Must be called from within a rcu critical section.
2426  *
2427  * Returns a pointer from within the RCU-protected ram_list.
2428  *
2429  * @f: QEMUFile where to read the data from
2430  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2431  */
2432 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2433 {
2434     static RAMBlock *block = NULL;
2435     char id[256];
2436     uint8_t len;
2437
2438     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2439         if (!block) {
2440             error_report("Ack, bad migration stream!");
2441             return NULL;
2442         }
2443         return block;
2444     }
2445
2446     len = qemu_get_byte(f);
2447     qemu_get_buffer(f, (uint8_t *)id, len);
2448     id[len] = 0;
2449
2450     block = qemu_ram_block_by_name(id);
2451     if (!block) {
2452         error_report("Can't find block %s", id);
2453         return NULL;
2454     }
2455
2456     return block;
2457 }
2458
2459 static inline void *host_from_ram_block_offset(RAMBlock *block,
2460                                                ram_addr_t offset)
2461 {
2462     if (!offset_in_ramblock(block, offset)) {
2463         return NULL;
2464     }
2465
2466     return block->host + offset;
2467 }
2468
2469 /**
2470  * ram_handle_compressed: handle the zero page case
2471  *
2472  * If a page (or a whole RDMA chunk) has been
2473  * determined to be zero, then zap it.
2474  *
2475  * @host: host address for the zero page
2476  * @ch: what the page is filled from.  We only support zero
2477  * @size: size of the zero page
2478  */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* A zero fill over a range that is already zero needs no write */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
2485
2486 static void *do_data_decompress(void *opaque)
2487 {
2488     DecompressParam *param = opaque;
2489     unsigned long pagesize;
2490     uint8_t *des;
2491     int len;
2492
2493     qemu_mutex_lock(&param->mutex);
2494     while (!param->quit) {
2495         if (param->des) {
2496             des = param->des;
2497             len = param->len;
2498             param->des = 0;
2499             qemu_mutex_unlock(&param->mutex);
2500
2501             pagesize = TARGET_PAGE_SIZE;
2502             /* uncompress() will return failed in some case, especially
2503              * when the page is dirted when doing the compression, it's
2504              * not a problem because the dirty page will be retransferred
2505              * and uncompress() won't break the data in other pages.
2506              */
2507             uncompress((Bytef *)des, &pagesize,
2508                        (const Bytef *)param->compbuf, len);
2509
2510             qemu_mutex_lock(&decomp_done_lock);
2511             param->done = true;
2512             qemu_cond_signal(&decomp_done_cond);
2513             qemu_mutex_unlock(&decomp_done_lock);
2514
2515             qemu_mutex_lock(&param->mutex);
2516         } else {
2517             qemu_cond_wait(&param->cond, &param->mutex);
2518         }
2519     }
2520     qemu_mutex_unlock(&param->mutex);
2521
2522     return NULL;
2523 }
2524
2525 static void wait_for_decompress_done(void)
2526 {
2527     int idx, thread_count;
2528
2529     if (!migrate_use_compression()) {
2530         return;
2531     }
2532
2533     thread_count = migrate_decompress_threads();
2534     qemu_mutex_lock(&decomp_done_lock);
2535     for (idx = 0; idx < thread_count; idx++) {
2536         while (!decomp_param[idx].done) {
2537             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2538         }
2539     }
2540     qemu_mutex_unlock(&decomp_done_lock);
2541 }
2542
2543 static void compress_threads_load_setup(void)
2544 {
2545     int i, thread_count;
2546
2547     if (!migrate_use_compression()) {
2548         return;
2549     }
2550     thread_count = migrate_decompress_threads();
2551     decompress_threads = g_new0(QemuThread, thread_count);
2552     decomp_param = g_new0(DecompressParam, thread_count);
2553     qemu_mutex_init(&decomp_done_lock);
2554     qemu_cond_init(&decomp_done_cond);
2555     for (i = 0; i < thread_count; i++) {
2556         qemu_mutex_init(&decomp_param[i].mutex);
2557         qemu_cond_init(&decomp_param[i].cond);
2558         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2559         decomp_param[i].done = true;
2560         decomp_param[i].quit = false;
2561         qemu_thread_create(decompress_threads + i, "decompress",
2562                            do_data_decompress, decomp_param + i,
2563                            QEMU_THREAD_JOINABLE);
2564     }
2565 }
2566
2567 static void compress_threads_load_cleanup(void)
2568 {
2569     int i, thread_count;
2570
2571     if (!migrate_use_compression()) {
2572         return;
2573     }
2574     thread_count = migrate_decompress_threads();
2575     for (i = 0; i < thread_count; i++) {
2576         qemu_mutex_lock(&decomp_param[i].mutex);
2577         decomp_param[i].quit = true;
2578         qemu_cond_signal(&decomp_param[i].cond);
2579         qemu_mutex_unlock(&decomp_param[i].mutex);
2580     }
2581     for (i = 0; i < thread_count; i++) {
2582         qemu_thread_join(decompress_threads + i);
2583         qemu_mutex_destroy(&decomp_param[i].mutex);
2584         qemu_cond_destroy(&decomp_param[i].cond);
2585         g_free(decomp_param[i].compbuf);
2586     }
2587     g_free(decompress_threads);
2588     g_free(decomp_param);
2589     decompress_threads = NULL;
2590     decomp_param = NULL;
2591 }
2592
2593 static void decompress_data_with_multi_threads(QEMUFile *f,
2594                                                void *host, int len)
2595 {
2596     int idx, thread_count;
2597
2598     thread_count = migrate_decompress_threads();
2599     qemu_mutex_lock(&decomp_done_lock);
2600     while (true) {
2601         for (idx = 0; idx < thread_count; idx++) {
2602             if (decomp_param[idx].done) {
2603                 decomp_param[idx].done = false;
2604                 qemu_mutex_lock(&decomp_param[idx].mutex);
2605                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2606                 decomp_param[idx].des = host;
2607                 decomp_param[idx].len = len;
2608                 qemu_cond_signal(&decomp_param[idx].cond);
2609                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2610                 break;
2611             }
2612         }
2613         if (idx < thread_count) {
2614             break;
2615         } else {
2616             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2617         }
2618     }
2619     qemu_mutex_unlock(&decomp_done_lock);
2620 }
2621
2622 /**
2623  * ram_load_setup: Setup RAM for migration incoming side
2624  *
2625  * Returns zero to indicate success and negative for error
2626  *
2627  * @f: QEMUFile where to receive the data
2628  * @opaque: RAMState pointer
2629  */
2630 static int ram_load_setup(QEMUFile *f, void *opaque)
2631 {
2632     xbzrle_load_setup();
2633     compress_threads_load_setup();
2634     ramblock_recv_map_init();
2635     return 0;
2636 }
2637
2638 static int ram_load_cleanup(void *opaque)
2639 {
2640     RAMBlock *rb;
2641     xbzrle_load_cleanup();
2642     compress_threads_load_cleanup();
2643
2644     RAMBLOCK_FOREACH(rb) {
2645         g_free(rb->receivedmap);
2646         rb->receivedmap = NULL;
2647     }
2648     return 0;
2649 }
2650
2651 /**
2652  * ram_postcopy_incoming_init: allocate postcopy data structures
2653  *
2654  * Returns 0 for success and negative if there was one error
2655  *
2656  * @mis: current migration incoming state
2657  *
2658  * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
2661  */
2662 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2663 {
2664     unsigned long ram_pages = last_ram_page();
2665
2666     return postcopy_ram_incoming_init(mis, ram_pages);
2667 }
2668
2669 /**
2670  * ram_load_postcopy: load a page in postcopy case
2671  *
2672  * Returns 0 for success or -errno in case of error
2673  *
2674  * Called in postcopy mode by ram_load().
2675  * rcu_read_lock is taken prior to this being called.
2676  *
 * @f: QEMUFile where to receive the data from
2678  */
2679 static int ram_load_postcopy(QEMUFile *f)
2680 {
2681     int flags = 0, ret = 0;
2682     bool place_needed = false;
2683     bool matching_page_sizes = false;
2684     MigrationIncomingState *mis = migration_incoming_get_current();
2685     /* Temporary page that is later 'placed' */
2686     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2687     void *last_host = NULL;
2688     bool all_zero = false;
2689
2690     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2691         ram_addr_t addr;
2692         void *host = NULL;
2693         void *page_buffer = NULL;
2694         void *place_source = NULL;
2695         RAMBlock *block = NULL;
2696         uint8_t ch;
2697
2698         addr = qemu_get_be64(f);
2699         flags = addr & ~TARGET_PAGE_MASK;
2700         addr &= TARGET_PAGE_MASK;
2701
2702         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2703         place_needed = false;
2704         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2705             block = ram_block_from_stream(f, flags);
2706
2707             host = host_from_ram_block_offset(block, addr);
2708             if (!host) {
2709                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2710                 ret = -EINVAL;
2711                 break;
2712             }
2713             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2714             /*
2715              * Postcopy requires that we place whole host pages atomically;
2716              * these may be huge pages for RAMBlocks that are backed by
2717              * hugetlbfs.
2718              * To make it atomic, the data is read into a temporary page
2719              * that's moved into place later.
2720              * The migration protocol uses,  possibly smaller, target-pages
2721              * however the source ensures it always sends all the components
2722              * of a host page in order.
2723              */
2724             page_buffer = postcopy_host_page +
2725                           ((uintptr_t)host & (block->page_size - 1));
2726             /* If all TP are zero then we can optimise the place */
2727             if (!((uintptr_t)host & (block->page_size - 1))) {
2728                 all_zero = true;
2729             } else {
2730                 /* not the 1st TP within the HP */
2731                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2732                     error_report("Non-sequential target page %p/%p",
2733                                   host, last_host);
2734                     ret = -EINVAL;
2735                     break;
2736                 }
2737             }
2738
2739
2740             /*
2741              * If it's the last part of a host page then we place the host
2742              * page
2743              */
2744             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2745                                      (block->page_size - 1)) == 0;
2746             place_source = postcopy_host_page;
2747         }
2748         last_host = host;
2749
2750         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2751         case RAM_SAVE_FLAG_ZERO:
2752             ch = qemu_get_byte(f);
2753             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2754             if (ch) {
2755                 all_zero = false;
2756             }
2757             break;
2758
2759         case RAM_SAVE_FLAG_PAGE:
2760             all_zero = false;
2761             if (!place_needed || !matching_page_sizes) {
2762                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2763             } else {
2764                 /* Avoids the qemu_file copy during postcopy, which is
2765                  * going to do a copy later; can only do it when we
2766                  * do this read in one go (matching page sizes)
2767                  */
2768                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2769                                          TARGET_PAGE_SIZE);
2770             }
2771             break;
2772         case RAM_SAVE_FLAG_EOS:
2773             /* normal exit */
2774             break;
2775         default:
2776             error_report("Unknown combination of migration flags: %#x"
2777                          " (postcopy mode)", flags);
2778             ret = -EINVAL;
2779         }
2780
2781         if (place_needed) {
2782             /* This gets called at the last target page in the host page */
2783             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2784
2785             if (all_zero) {
2786                 ret = postcopy_place_page_zero(mis, place_dest,
2787                                                block);
2788             } else {
2789                 ret = postcopy_place_page(mis, place_dest,
2790                                           place_source, block);
2791             }
2792         }
2793         if (!ret) {
2794             ret = qemu_file_get_error(f);
2795         }
2796     }
2797
2798     return ret;
2799 }
2800
2801 static bool postcopy_is_advised(void)
2802 {
2803     PostcopyState ps = postcopy_state_get();
2804     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
2805 }
2806
2807 static bool postcopy_is_running(void)
2808 {
2809     PostcopyState ps = postcopy_state_get();
2810     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
2811 }
2812
2813 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2814 {
2815     int flags = 0, ret = 0, invalid_flags = 0;
2816     static uint64_t seq_iter;
2817     int len = 0;
2818     /*
2819      * If system is running in postcopy mode, page inserts to host memory must
2820      * be atomic
2821      */
2822     bool postcopy_running = postcopy_is_running();
2823     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2824     bool postcopy_advised = postcopy_is_advised();
2825
2826     seq_iter++;
2827
2828     if (version_id != 4) {
2829         ret = -EINVAL;
2830     }
2831
2832     if (!migrate_use_compression()) {
2833         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2834     }
2835     /* This RCU critical section can be very long running.
2836      * When RCU reclaims in the code start to become numerous,
2837      * it will be necessary to reduce the granularity of this
2838      * critical section.
2839      */
2840     rcu_read_lock();
2841
2842     if (postcopy_running) {
2843         ret = ram_load_postcopy(f);
2844     }
2845
2846     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2847         ram_addr_t addr, total_ram_bytes;
2848         void *host = NULL;
2849         uint8_t ch;
2850
2851         addr = qemu_get_be64(f);
2852         flags = addr & ~TARGET_PAGE_MASK;
2853         addr &= TARGET_PAGE_MASK;
2854
2855         if (flags & invalid_flags) {
2856             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2857                 error_report("Received an unexpected compressed page");
2858             }
2859
2860             ret = -EINVAL;
2861             break;
2862         }
2863
2864         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2865                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2866             RAMBlock *block = ram_block_from_stream(f, flags);
2867
2868             host = host_from_ram_block_offset(block, addr);
2869             if (!host) {
2870                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2871                 ret = -EINVAL;
2872                 break;
2873             }
2874             ramblock_recv_bitmap_set(block, host);
2875             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2876         }
2877
2878         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2879         case RAM_SAVE_FLAG_MEM_SIZE:
2880             /* Synchronize RAM block list */
2881             total_ram_bytes = addr;
2882             while (!ret && total_ram_bytes) {
2883                 RAMBlock *block;
2884                 char id[256];
2885                 ram_addr_t length;
2886
2887                 len = qemu_get_byte(f);
2888                 qemu_get_buffer(f, (uint8_t *)id, len);
2889                 id[len] = 0;
2890                 length = qemu_get_be64(f);
2891
2892                 block = qemu_ram_block_by_name(id);
2893                 if (block) {
2894                     if (length != block->used_length) {
2895                         Error *local_err = NULL;
2896
2897                         ret = qemu_ram_resize(block, length,
2898                                               &local_err);
2899                         if (local_err) {
2900                             error_report_err(local_err);
2901                         }
2902                     }
2903                     /* For postcopy we need to check hugepage sizes match */
2904                     if (postcopy_advised &&
2905                         block->page_size != qemu_host_page_size) {
2906                         uint64_t remote_page_size = qemu_get_be64(f);
2907                         if (remote_page_size != block->page_size) {
2908                             error_report("Mismatched RAM page size %s "
2909                                          "(local) %zd != %" PRId64,
2910                                          id, block->page_size,
2911                                          remote_page_size);
2912                             ret = -EINVAL;
2913                         }
2914                     }
2915                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2916                                           block->idstr);
2917                 } else {
2918                     error_report("Unknown ramblock \"%s\", cannot "
2919                                  "accept migration", id);
2920                     ret = -EINVAL;
2921                 }
2922
2923                 total_ram_bytes -= length;
2924             }
2925             break;
2926
2927         case RAM_SAVE_FLAG_ZERO:
2928             ch = qemu_get_byte(f);
2929             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2930             break;
2931
2932         case RAM_SAVE_FLAG_PAGE:
2933             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2934             break;
2935
2936         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2937             len = qemu_get_be32(f);
2938             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2939                 error_report("Invalid compressed data length: %d", len);
2940                 ret = -EINVAL;
2941                 break;
2942             }
2943             decompress_data_with_multi_threads(f, host, len);
2944             break;
2945
2946         case RAM_SAVE_FLAG_XBZRLE:
2947             if (load_xbzrle(f, addr, host) < 0) {
2948                 error_report("Failed to decompress XBZRLE page at "
2949                              RAM_ADDR_FMT, addr);
2950                 ret = -EINVAL;
2951                 break;
2952             }
2953             break;
2954         case RAM_SAVE_FLAG_EOS:
2955             /* normal exit */
2956             break;
2957         default:
2958             if (flags & RAM_SAVE_FLAG_HOOK) {
2959                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2960             } else {
2961                 error_report("Unknown combination of migration flags: %#x",
2962                              flags);
2963                 ret = -EINVAL;
2964             }
2965         }
2966         if (!ret) {
2967             ret = qemu_file_get_error(f);
2968         }
2969     }
2970
2971     wait_for_decompress_done();
2972     rcu_read_unlock();
2973     trace_ram_load_complete(ret, seq_iter);
2974     return ret;
2975 }
2976
/*
 * has_postcopy callback of savevm_ram_handlers: forwards the answer of
 * migrate_postcopy_ram().  @opaque is not used.
 */
static bool ram_has_postcopy(void *opaque)
{
    bool postcopy_ram = migrate_postcopy_ram();

    return postcopy_ram;
}
2981
2982 static SaveVMHandlers savevm_ram_handlers = {
2983     .save_setup = ram_save_setup,
2984     .save_live_iterate = ram_save_iterate,
2985     .save_live_complete_postcopy = ram_save_complete,
2986     .save_live_complete_precopy = ram_save_complete,
2987     .has_postcopy = ram_has_postcopy,
2988     .save_live_pending = ram_save_pending,
2989     .load_state = ram_load,
2990     .save_cleanup = ram_save_cleanup,
2991     .load_setup = ram_load_setup,
2992     .load_cleanup = ram_load_cleanup,
2993 };
2994
2995 void ram_mig_init(void)
2996 {
2997     qemu_mutex_init(&XBZRLE.lock);
2998     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2999 }