migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28 #include "qemu/osdep.h"
  29 #include "cpu.h"
  30 #include <zlib.h>
  31 #include "qapi-event.h"
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "migration/register.h"
  40 #include "migration/misc.h"
  41 #include "qemu-file.h"
  42 #include "postcopy-ram.h"
  43 #include "migration/page_cache.h"
  44 #include "qemu/error-report.h"
  45 #include "qapi/qmp/qerror.h"
  46 #include "trace.h"
  47 #include "exec/ram_addr.h"
  48 #include "exec/target_page.h"
  49 #include "qemu/rcu_queue.h"
  50 #include "migration/colo.h"
  51 #include "migration/block.h"
  52
  53 /***********************************************************/
  54 /* ram save/restore */
  55
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
  61
/* Flags OR-ed into the low bits of the 64-bit offset word written by
 * save_page_header(). */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02 /* page is all zeroes (see save_zero_page()) */
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20 /* same block as the last header, no idstr follows */
#define RAM_SAVE_FLAG_XBZRLE   0x40 /* payload is XBZRLE encoded (see save_xbzrle_page()) */
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  71
  72 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  73 {
  74     return buffer_is_zero(p, size);
  75 }
  76
/* XBZRLE statistics; updated by save_xbzrle_page() (cache_miss, overflow,
 * pages, bytes) and migration_bitmap_sync() (cache_miss_rate). */
XBZRLECacheStats xbzrle_counters;
  78
/* XBZRLE working state: the page cache plus the scratch buffers used
 * while encoding and decoding pages. */
static struct {
    /* buffer used for XBZRLE encoding (output of xbzrle_encode_buffer()) */
    uint8_t *encoded_buf;
    /* private copy of the page currently being encoded */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* taken/released through XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
  94
  95 static void XBZRLE_cache_lock(void)
  96 {
  97     if (migrate_use_xbzrle())
  98         qemu_mutex_lock(&XBZRLE.lock);
  99 }
 100
 101 static void XBZRLE_cache_unlock(void)
 102 {
 103     if (migrate_use_xbzrle())
 104         qemu_mutex_unlock(&XBZRLE.lock);
 105 }
 106
 107 /**
 108  * xbzrle_cache_resize: resize the xbzrle cache
 109  *
 110  * This function is called from qmp_migrate_set_cache_size in main
 111  * thread, possibly while a migration is in progress.  A running
 112  * migration may be using the cache and might finish during this call,
 113  * hence changes to the cache are protected by XBZRLE.lock().
 114  *
 115  * Returns 0 for success or -1 for error
 116  *
 117  * @new_size: new cache size
 118  * @errp: set *errp if the check failed, with reason
 119  */
 120 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 121 {
 122     PageCache *new_cache;
 123     int64_t ret = 0;
 124
 125     /* Check for truncation */
 126     if (new_size != (size_t)new_size) {
 127         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 128                    "exceeding address space");
 129         return -1;
 130     }
 131
 132     if (new_size == migrate_xbzrle_cache_size()) {
 133         /* nothing to do */
 134         return 0;
 135     }
 136
 137     XBZRLE_cache_lock();
 138
 139     if (XBZRLE.cache != NULL) {
 140         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 141         if (!new_cache) {
 142             ret = -1;
 143             goto out;
 144         }
 145
 146         cache_fini(XBZRLE.cache);
 147         XBZRLE.cache = new_cache;
 148     }
 149 out:
 150     XBZRLE_cache_unlock();
 151     return ret;
 152 }
 153
 154 static void ramblock_recv_map_init(void)
 155 {
 156     RAMBlock *rb;
 157
 158     RAMBLOCK_FOREACH(rb) {
 159         assert(!rb->receivedmap);
 160         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 161     }
 162 }
 163
/* Return the receivedmap bit of @rb that corresponds to @host_addr
 * (bit position computed by ramblock_recv_bitmap_offset()). */
int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}
 169
/* Atomically set the receivedmap bit of @rb that corresponds to
 * @host_addr. */
void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}
 174
/* Atomically set @nr consecutive receivedmap bits of @rb, starting at
 * the bit that corresponds to @host_addr. */
void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}
 182
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* block the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably start offset and length in bytes within
     * @rb - confirm against the code that queues requests */
    hwaddr    offset;
    hwaddr    len;

    /* linkage in RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 194
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data; save_page_header() uses it
     * to decide whether the block id must be sent again */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many sync periods dirtied too much memory; reset when
     * migration_bitmap_sync() triggers throttling */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* start of the current measurement period (ms); reset by
     * migration_bitmap_sync() once more than a second has passed */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at the start of the period */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since the start of the period */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 235
/* Current RAM migration state; readers such as ram_bytes_remaining()
 * tolerate it being NULL. */
static RAMState *ram_state;
 237
 238 uint64_t ram_bytes_remaining(void)
 239 {
 240     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 241                        0;
 242 }
 243
/* RAM migration counters (transferred, normal, duplicate, dirty_sync_count,
 * dirty_pages_rate, ...) updated by the save path in this file. */
MigrationStats ram_counters;
 245
/* Cursor used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from (target-page index within @block) */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 256
/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* set by the worker, under comp_done_lock, when a page is finished */
    bool done;
    /* set under @mutex to ask the worker to exit */
    bool quit;
    /* dummy QEMUFile used as the buffer for the compressed output */
    QEMUFile *file;
    /* @mutex/@cond guard @quit, @block and @offset */
    QemuMutex mutex;
    QemuCond cond;
    /* pending job: non-NULL @block means "compress the page at @offset" */
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;
 267
/* Per-thread state of one decompression worker.
 * NOTE(review): the users of this struct are outside this chunk; the
 * field notes below are inferred from the names - confirm. */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination of the decompressed data */
    void *des;
    /* presumably the compressed input and its length */
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;
 278
/* Arrays with one entry per compression thread; allocated in
 * compress_threads_save_setup(), freed in compress_threads_save_cleanup() */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression counterparts of the objects above */
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Defined later in the file; needed by do_data_compress() */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
 297
/*
 * Main loop of one compression thread.
 *
 * Waits on param->cond until a job is posted in param->block or
 * param->quit is set, compresses the posted page into param->file and
 * reports completion through param->done / comp_done_cond.
 *
 * @opaque: the CompressParam of this thread
 *
 * Always returns NULL.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Take a private copy of the job and clear the slot before
             * dropping the lock */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            /* The compression itself runs without param->mutex held */
            do_compress_ram_page(param->file, block, offset);

            /* Publish completion and wake a waiter on comp_done_cond */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* Re-take the lock before re-checking quit/block */
            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 328
 329 static inline void terminate_compression_threads(void)
 330 {
 331     int idx, thread_count;
 332
 333     thread_count = migrate_compress_threads();
 334
 335     for (idx = 0; idx < thread_count; idx++) {
 336         qemu_mutex_lock(&comp_param[idx].mutex);
 337         comp_param[idx].quit = true;
 338         qemu_cond_signal(&comp_param[idx].cond);
 339         qemu_mutex_unlock(&comp_param[idx].mutex);
 340     }
 341 }
 342
 343 static void compress_threads_save_cleanup(void)
 344 {
 345     int i, thread_count;
 346
 347     if (!migrate_use_compression()) {
 348         return;
 349     }
 350     terminate_compression_threads();
 351     thread_count = migrate_compress_threads();
 352     for (i = 0; i < thread_count; i++) {
 353         qemu_thread_join(compress_threads + i);
 354         qemu_fclose(comp_param[i].file);
 355         qemu_mutex_destroy(&comp_param[i].mutex);
 356         qemu_cond_destroy(&comp_param[i].cond);
 357     }
 358     qemu_mutex_destroy(&comp_done_lock);
 359     qemu_cond_destroy(&comp_done_cond);
 360     g_free(compress_threads);
 361     g_free(comp_param);
 362     compress_threads = NULL;
 363     comp_param = NULL;
 364 }
 365
 366 static void compress_threads_save_setup(void)
 367 {
 368     int i, thread_count;
 369
 370     if (!migrate_use_compression()) {
 371         return;
 372     }
 373     thread_count = migrate_compress_threads();
 374     compress_threads = g_new0(QemuThread, thread_count);
 375     comp_param = g_new0(CompressParam, thread_count);
 376     qemu_cond_init(&comp_done_cond);
 377     qemu_mutex_init(&comp_done_lock);
 378     for (i = 0; i < thread_count; i++) {
 379         /* comp_param[i].file is just used as a dummy buffer to save data,
 380          * set its ops to empty.
 381          */
 382         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 383         comp_param[i].done = true;
 384         comp_param[i].quit = false;
 385         qemu_mutex_init(&comp_param[i].mutex);
 386         qemu_cond_init(&comp_param[i].cond);
 387         qemu_thread_create(compress_threads + i, "compress",
 388                            do_data_compress, comp_param + i,
 389                            QEMU_THREAD_JOINABLE);
 390     }
 391 }
 392
 393 /* Multiple fd's */
 394
/* Per-channel state of one multifd send thread */
struct MultiFDSendParams {
    /* index of this channel in multifd_send_state->params */
    uint8_t id;
    /* thread name ("multifdsend_<id>"), heap allocated */
    char *name;
    QemuThread thread;
    /* posted to wake the thread (see multifd_send_thread()) */
    QemuSemaphore sem;
    /* protects @quit */
    QemuMutex mutex;
    /* set to ask the thread to exit */
    bool quit;
};
typedef struct MultiFDSendParams MultiFDSendParams;

/* Global multifd send state; allocated by multifd_save_setup() and
 * freed by multifd_save_cleanup() */
struct {
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
} *multifd_send_state;
 410
 411 static void terminate_multifd_send_threads(Error *errp)
 412 {
 413     int i;
 414
 415     for (i = 0; i < multifd_send_state->count; i++) {
 416         MultiFDSendParams *p = &multifd_send_state->params[i];
 417
 418         qemu_mutex_lock(&p->mutex);
 419         p->quit = true;
 420         qemu_sem_post(&p->sem);
 421         qemu_mutex_unlock(&p->mutex);
 422     }
 423 }
 424
 425 int multifd_save_cleanup(Error **errp)
 426 {
 427     int i;
 428     int ret = 0;
 429
 430     if (!migrate_use_multifd()) {
 431         return 0;
 432     }
 433     terminate_multifd_send_threads(NULL);
 434     for (i = 0; i < multifd_send_state->count; i++) {
 435         MultiFDSendParams *p = &multifd_send_state->params[i];
 436
 437         qemu_thread_join(&p->thread);
 438         qemu_mutex_destroy(&p->mutex);
 439         qemu_sem_destroy(&p->sem);
 440         g_free(p->name);
 441         p->name = NULL;
 442     }
 443     g_free(multifd_send_state->params);
 444     multifd_send_state->params = NULL;
 445     g_free(multifd_send_state);
 446     multifd_send_state = NULL;
 447     return ret;
 448 }
 449
 450 static void *multifd_send_thread(void *opaque)
 451 {
 452     MultiFDSendParams *p = opaque;
 453
 454     while (true) {
 455         qemu_mutex_lock(&p->mutex);
 456         if (p->quit) {
 457             qemu_mutex_unlock(&p->mutex);
 458             break;
 459         }
 460         qemu_mutex_unlock(&p->mutex);
 461         qemu_sem_wait(&p->sem);
 462     }
 463
 464     return NULL;
 465 }
 466
 467 int multifd_save_setup(void)
 468 {
 469     int thread_count;
 470     uint8_t i;
 471
 472     if (!migrate_use_multifd()) {
 473         return 0;
 474     }
 475     thread_count = migrate_multifd_channels();
 476     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
 477     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
 478     multifd_send_state->count = 0;
 479     for (i = 0; i < thread_count; i++) {
 480         MultiFDSendParams *p = &multifd_send_state->params[i];
 481
 482         qemu_mutex_init(&p->mutex);
 483         qemu_sem_init(&p->sem, 0);
 484         p->quit = false;
 485         p->id = i;
 486         p->name = g_strdup_printf("multifdsend_%d", i);
 487         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
 488                            QEMU_THREAD_JOINABLE);
 489
 490         multifd_send_state->count++;
 491     }
 492     return 0;
 493 }
 494
/* Per-channel state of one multifd receive thread */
struct MultiFDRecvParams {
    /* index of this channel in multifd_recv_state->params */
    uint8_t id;
    /* thread name ("multifdrecv_<id>"), heap allocated */
    char *name;
    QemuThread thread;
    /* posted to wake the thread (see multifd_recv_thread()) */
    QemuSemaphore sem;
    /* protects @quit */
    QemuMutex mutex;
    /* set to ask the thread to exit */
    bool quit;
};
typedef struct MultiFDRecvParams MultiFDRecvParams;

/* Global multifd receive state; allocated by multifd_load_setup() and
 * freed by multifd_load_cleanup() */
struct {
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
} *multifd_recv_state;
 510
 511 static void terminate_multifd_recv_threads(Error *errp)
 512 {
 513     int i;
 514
 515     for (i = 0; i < multifd_recv_state->count; i++) {
 516         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 517
 518         qemu_mutex_lock(&p->mutex);
 519         p->quit = true;
 520         qemu_sem_post(&p->sem);
 521         qemu_mutex_unlock(&p->mutex);
 522     }
 523 }
 524
 525 int multifd_load_cleanup(Error **errp)
 526 {
 527     int i;
 528     int ret = 0;
 529
 530     if (!migrate_use_multifd()) {
 531         return 0;
 532     }
 533     terminate_multifd_recv_threads(NULL);
 534     for (i = 0; i < multifd_recv_state->count; i++) {
 535         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 536
 537         qemu_thread_join(&p->thread);
 538         qemu_mutex_destroy(&p->mutex);
 539         qemu_sem_destroy(&p->sem);
 540         g_free(p->name);
 541         p->name = NULL;
 542     }
 543     g_free(multifd_recv_state->params);
 544     multifd_recv_state->params = NULL;
 545     g_free(multifd_recv_state);
 546     multifd_recv_state = NULL;
 547
 548     return ret;
 549 }
 550
 551 static void *multifd_recv_thread(void *opaque)
 552 {
 553     MultiFDRecvParams *p = opaque;
 554
 555     while (true) {
 556         qemu_mutex_lock(&p->mutex);
 557         if (p->quit) {
 558             qemu_mutex_unlock(&p->mutex);
 559             break;
 560         }
 561         qemu_mutex_unlock(&p->mutex);
 562         qemu_sem_wait(&p->sem);
 563     }
 564
 565     return NULL;
 566 }
 567
 568 int multifd_load_setup(void)
 569 {
 570     int thread_count;
 571     uint8_t i;
 572
 573     if (!migrate_use_multifd()) {
 574         return 0;
 575     }
 576     thread_count = migrate_multifd_channels();
 577     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
 578     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
 579     multifd_recv_state->count = 0;
 580     for (i = 0; i < thread_count; i++) {
 581         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 582
 583         qemu_mutex_init(&p->mutex);
 584         qemu_sem_init(&p->sem, 0);
 585         p->quit = false;
 586         p->id = i;
 587         p->name = g_strdup_printf("multifdrecv_%d", i);
 588         qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
 589                            QEMU_THREAD_JOINABLE);
 590         multifd_recv_state->count++;
 591     }
 592     return 0;
 593 }
 594
 595 /**
 596  * save_page_header: write page header to wire
 597  *
 598  * If this is the 1st block, it also writes the block identification
 599  *
 600  * Returns the number of bytes written
 601  *
 602  * @f: QEMUFile where to send the data
 603  * @block: block that contains the page we want to send
 604  * @offset: offset inside the block for the page
 605  *          in the lower bits, it contains flags
 606  */
 607 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 608                                ram_addr_t offset)
 609 {
 610     size_t size, len;
 611
 612     if (block == rs->last_sent_block) {
 613         offset |= RAM_SAVE_FLAG_CONTINUE;
 614     }
 615     qemu_put_be64(f, offset);
 616     size = 8;
 617
 618     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 619         len = strlen(block->idstr);
 620         qemu_put_byte(f, len);
 621         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 622         size += 1 + len;
 623         rs->last_sent_block = block;
 624     }
 625     return size;
 626 }
 627
 628 /**
 629  * mig_throttle_guest_down: throotle down the guest
 630  *
 631  * Reduce amount of guest cpu execution to hopefully slow down memory
 632  * writes. If guest dirty memory rate is reduced below the rate at
 633  * which we can transfer pages to the destination then we should be
 634  * able to complete migration. Some workloads dirty memory way too
 635  * fast and will not effectively converge, even with auto-converge.
 636  */
 637 static void mig_throttle_guest_down(void)
 638 {
 639     MigrationState *s = migrate_get_current();
 640     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 641     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 642
 643     /* We have not started throttling yet. Let's start it. */
 644     if (!cpu_throttle_active()) {
 645         cpu_throttle_set(pct_initial);
 646     } else {
 647         /* Throttling already on, just increase the rate */
 648         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 649     }
 650 }
 651
 652 /**
 653  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 654  *
 655  * @rs: current RAM state
 656  * @current_addr: address for the zero page
 657  *
 658  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 659  * The important thing is that a stale (not-yet-0'd) page be replaced
 660  * by the new data.
 661  * As a bonus, if the page wasn't in the cache it gets added so that
 662  * when a small write is made into the 0'd page it gets XBZRLE sent.
 663  */
 664 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 665 {
 666     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 667         return;
 668     }
 669
 670     /* We don't care if this fails to allocate a new cache page
 671      * as long as it updated an old one */
 672     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 673                  ram_counters.dirty_sync_count);
 674 }
 675
/* Encoding byte written in front of every XBZRLE payload */
#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that the page was not sent here: it was not in the
 *             cache, or xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents; may be
 *                redirected to the cached copy of the page when -1 is
 *                returned and @last_stage is false
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage (the cache is then
 *              left untouched)
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    /* Cache miss: nothing to diff against; optionally cache the page */
    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        /* page unchanged since it was cached: nothing to send */
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    /* header + payload + 1 encoding byte + 2 length bytes */
    bytes_xbzrle += encoded_len + 1 + 2;
    xbzrle_counters.pages++;
    xbzrle_counters.bytes += bytes_xbzrle;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}
 756
 757 /**
 758  * migration_bitmap_find_dirty: find the next dirty page from start
 759  *
 760  * Called with rcu_read_lock() to protect migration_bitmap
 761  *
 762  * Returns the byte offset within memory region of the start of a dirty page
 763  *
 764  * @rs: current RAM state
 765  * @rb: RAMBlock where to search for dirty pages
 766  * @start: page where we start the search
 767  */
 768 static inline
 769 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 770                                           unsigned long start)
 771 {
 772     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 773     unsigned long *bitmap = rb->bmap;
 774     unsigned long next;
 775
 776     if (rs->ram_bulk_stage && start > 0) {
 777         next = start + 1;
 778     } else {
 779         next = find_next_bit(bitmap, size, start);
 780     }
 781
 782     return next;
 783 }
 784
 785 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 786                                                 RAMBlock *rb,
 787                                                 unsigned long page)
 788 {
 789     bool ret;
 790
 791     ret = test_and_clear_bit(page, rb->bmap);
 792
 793     if (ret) {
 794         rs->migration_dirty_pages--;
 795     }
 796     return ret;
 797 }
 798
 799 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 800                                         ram_addr_t start, ram_addr_t length)
 801 {
 802     rs->migration_dirty_pages +=
 803         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 804                                               &rs->num_dirty_pages_period);
 805 }
 806
 807 /**
 808  * ram_pagesize_summary: calculate all the pagesizes of a VM
 809  *
 810  * Returns a summary bitmap of the page sizes of all RAMBlocks
 811  *
 812  * For VMs with just normal pages this is equivalent to the host page
 813  * size. If it's got some huge pages then it's the OR of all the
 814  * different page sizes.
 815  */
 816 uint64_t ram_pagesize_summary(void)
 817 {
 818     RAMBlock *block;
 819     uint64_t summary = 0;
 820
 821     RAMBLOCK_FOREACH(block) {
 822         summary |= block->page_size;
 823     }
 824
 825     return summary;
 826 }
 827
/*
 * Pull the dirty log into the per-block migration bitmaps and, at most
 * once per second, refresh the rate statistics, drive auto-converge
 * throttling and update the XBZRLE cache miss rate.
 *
 * @rs: current RAM state
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    /* First call: start the measurement period now */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    /* Merge the dirty log of every block's used range into its bitmap */
    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        migration_bitmap_sync_range(rs, block, 0, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        /* calculate period counters */
        ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
            / (end_time - rs->time_last_bitmap_sync);
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check whether the bytes dirtied in this period exceed half
               of the bytes transferred since the period started. If that
               happens twice, start or increase throttling */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        if (migrate_use_xbzrle()) {
            /* Skip the rate update when no iterations happened, to avoid
             * dividing by zero */
            if (rs->iterations_prev != rs->iterations) {
                xbzrle_counters.cache_miss_rate =
                   (double)(xbzrle_counters.cache_miss -
                            rs->xbzrle_cache_miss_prev) /
                   (rs->iterations - rs->iterations_prev);
            }
            rs->iterations_prev = rs->iterations;
            rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        }

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
    }
}
 901
 902 /**
 903  * save_zero_page: send the zero page to the stream
 904  *
 905  * Returns the number of pages written.
 906  *
 907  * @rs: current RAM state
 908  * @block: block that contains the page we want to send
 909  * @offset: offset inside the block for the page
 910  * @p: pointer to the page
 911  */
 912 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
 913                           uint8_t *p)
 914 {
 915     int pages = -1;
 916
 917     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 918         ram_counters.duplicate++;
 919         ram_counters.transferred +=
 920             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
 921         qemu_put_byte(rs->f, 0);
 922         ram_counters.transferred += 1;
 923         pages = 1;
 924     }
 925
 926     return pages;
 927 }
 928
 929 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 930 {
 931     if (!migrate_release_ram() || !migration_in_postcopy()) {
 932         return;
 933     }
 934
 935     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 936 }
 937
 938 /**
 939  * ram_save_page: send the given page to the stream
 940  *
 941  * Returns the number of pages written.
 942  *          < 0 - error
 943  *          >=0 - Number of pages written - this might legally be 0
 944  *                if xbzrle noticed the page was the same.
 945  *
 946  * @rs: current RAM state
 947  * @block: block that contains the page we want to send
 948  * @offset: offset inside the block for the page
 949  * @last_stage: if we are at the completion stage
 950  */
 951 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
 952 {
 953     int pages = -1;
 954     uint64_t bytes_xmit;
 955     ram_addr_t current_addr;
 956     uint8_t *p;
 957     int ret;
 958     bool send_async = true;
 959     RAMBlock *block = pss->block;
 960     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 961
 962     p = block->host + offset;
 963     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 964
 965     /* In doubt sent page as normal */
 966     bytes_xmit = 0;
 967     ret = ram_control_save_page(rs->f, block->offset,
 968                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
 969     if (bytes_xmit) {
 970         ram_counters.transferred += bytes_xmit;
 971         pages = 1;
 972     }
 973
 974     XBZRLE_cache_lock();
 975
 976     current_addr = block->offset + offset;
 977
 978     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 979         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 980             if (bytes_xmit > 0) {
 981                 ram_counters.normal++;
 982             } else if (bytes_xmit == 0) {
 983                 ram_counters.duplicate++;
 984             }
 985         }
 986     } else {
 987         pages = save_zero_page(rs, block, offset, p);
 988         if (pages > 0) {
 989             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 990              * page would be stale
 991              */
 992             xbzrle_cache_zero_page(rs, current_addr);
 993             ram_release_pages(block->idstr, offset, pages);
 994         } else if (!rs->ram_bulk_stage &&
 995                    !migration_in_postcopy() && migrate_use_xbzrle()) {
 996             pages = save_xbzrle_page(rs, &p, current_addr, block,
 997                                      offset, last_stage);
 998             if (!last_stage) {
 999                 /* Can't send this cached data async, since the cache page
1000                  * might get updated before it gets to the wire
1001                  */
1002                 send_async = false;
1003             }
1004         }
1005     }
1006
1007     /* XBZRLE overflow or normal page */
1008     if (pages == -1) {
1009         ram_counters.transferred +=
1010             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
1011         if (send_async) {
1012             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
1013                                   migrate_release_ram() &
1014                                   migration_in_postcopy());
1015         } else {
1016             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
1017         }
1018         ram_counters.transferred += TARGET_PAGE_SIZE;
1019         pages = 1;
1020         ram_counters.normal++;
1021     }
1022
1023     XBZRLE_cache_unlock();
1024
1025     return pages;
1026 }
1027
1028 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
1029                                 ram_addr_t offset)
1030 {
1031     RAMState *rs = ram_state;
1032     int bytes_sent, blen;
1033     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1034
1035     bytes_sent = save_page_header(rs, f, block, offset |
1036                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
1037     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1038                                      migrate_compress_level());
1039     if (blen < 0) {
1040         bytes_sent = 0;
1041         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1042         error_report("compressed data failed!");
1043     } else {
1044         bytes_sent += blen;
1045         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1046     }
1047
1048     return bytes_sent;
1049 }
1050
1051 static void flush_compressed_data(RAMState *rs)
1052 {
1053     int idx, len, thread_count;
1054
1055     if (!migrate_use_compression()) {
1056         return;
1057     }
1058     thread_count = migrate_compress_threads();
1059
1060     qemu_mutex_lock(&comp_done_lock);
1061     for (idx = 0; idx < thread_count; idx++) {
1062         while (!comp_param[idx].done) {
1063             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1064         }
1065     }
1066     qemu_mutex_unlock(&comp_done_lock);
1067
1068     for (idx = 0; idx < thread_count; idx++) {
1069         qemu_mutex_lock(&comp_param[idx].mutex);
1070         if (!comp_param[idx].quit) {
1071             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1072             ram_counters.transferred += len;
1073         }
1074         qemu_mutex_unlock(&comp_param[idx].mutex);
1075     }
1076 }
1077
1078 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1079                                        ram_addr_t offset)
1080 {
1081     param->block = block;
1082     param->offset = offset;
1083 }
1084
1085 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1086                                            ram_addr_t offset)
1087 {
1088     int idx, thread_count, bytes_xmit = -1, pages = -1;
1089
1090     thread_count = migrate_compress_threads();
1091     qemu_mutex_lock(&comp_done_lock);
1092     while (true) {
1093         for (idx = 0; idx < thread_count; idx++) {
1094             if (comp_param[idx].done) {
1095                 comp_param[idx].done = false;
1096                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1097                 qemu_mutex_lock(&comp_param[idx].mutex);
1098                 set_compress_params(&comp_param[idx], block, offset);
1099                 qemu_cond_signal(&comp_param[idx].cond);
1100                 qemu_mutex_unlock(&comp_param[idx].mutex);
1101                 pages = 1;
1102                 ram_counters.normal++;
1103                 ram_counters.transferred += bytes_xmit;
1104                 break;
1105             }
1106         }
1107         if (pages > 0) {
1108             break;
1109         } else {
1110             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1111         }
1112     }
1113     qemu_mutex_unlock(&comp_done_lock);
1114
1115     return pages;
1116 }
1117
1118 /**
1119  * ram_save_compressed_page: compress the given page and send it to the stream
1120  *
1121  * Returns the number of pages written.
1122  *
1123  * @rs: current RAM state
1124  * @block: block that contains the page we want to send
1125  * @offset: offset inside the block for the page
1126  * @last_stage: if we are at the completion stage
1127  */
1128 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
1129                                     bool last_stage)
1130 {
1131     int pages = -1;
1132     uint64_t bytes_xmit = 0;
1133     uint8_t *p;
1134     int ret, blen;
1135     RAMBlock *block = pss->block;
1136     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1137
1138     p = block->host + offset;
1139
1140     ret = ram_control_save_page(rs->f, block->offset,
1141                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
1142     if (bytes_xmit) {
1143         ram_counters.transferred += bytes_xmit;
1144         pages = 1;
1145     }
1146     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1147         if (ret != RAM_SAVE_CONTROL_DELAYED) {
1148             if (bytes_xmit > 0) {
1149                 ram_counters.normal++;
1150             } else if (bytes_xmit == 0) {
1151                 ram_counters.duplicate++;
1152             }
1153         }
1154     } else {
1155         /* When starting the process of a new block, the first page of
1156          * the block should be sent out before other pages in the same
1157          * block, and all the pages in last block should have been sent
1158          * out, keeping this order is important, because the 'cont' flag
1159          * is used to avoid resending the block name.
1160          */
1161         if (block != rs->last_sent_block) {
1162             flush_compressed_data(rs);
1163             pages = save_zero_page(rs, block, offset, p);
1164             if (pages == -1) {
1165                 /* Make sure the first page is sent out before other pages */
1166                 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1167                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
1168                 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1169                                                  migrate_compress_level());
1170                 if (blen > 0) {
1171                     ram_counters.transferred += bytes_xmit + blen;
1172                     ram_counters.normal++;
1173                     pages = 1;
1174                 } else {
1175                     qemu_file_set_error(rs->f, blen);
1176                     error_report("compressed data failed!");
1177                 }
1178             }
1179             if (pages > 0) {
1180                 ram_release_pages(block->idstr, offset, pages);
1181             }
1182         } else {
1183             pages = save_zero_page(rs, block, offset, p);
1184             if (pages == -1) {
1185                 pages = compress_page_with_multi_thread(rs, block, offset);
1186             } else {
1187                 ram_release_pages(block->idstr, offset, pages);
1188             }
1189         }
1190     }
1191
1192     return pages;
1193 }
1194
1195 /**
1196  * find_dirty_block: find the next dirty page and update any state
1197  * associated with the search process.
1198  *
1199  * Returns if a page is found
1200  *
1201  * @rs: current RAM state
1202  * @pss: data about the state of the current dirty page scan
1203  * @again: set to false if the search has scanned the whole of RAM
1204  */
1205 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1206 {
1207     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1208     if (pss->complete_round && pss->block == rs->last_seen_block &&
1209         pss->page >= rs->last_page) {
1210         /*
1211          * We've been once around the RAM and haven't found anything.
1212          * Give up.
1213          */
1214         *again = false;
1215         return false;
1216     }
1217     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1218         /* Didn't find anything in this RAM Block */
1219         pss->page = 0;
1220         pss->block = QLIST_NEXT_RCU(pss->block, next);
1221         if (!pss->block) {
1222             /* Hit the end of the list */
1223             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1224             /* Flag that we've looped */
1225             pss->complete_round = true;
1226             rs->ram_bulk_stage = false;
1227             if (migrate_use_xbzrle()) {
1228                 /* If xbzrle is on, stop using the data compression at this
1229                  * point. In theory, xbzrle can do better than compression.
1230                  */
1231                 flush_compressed_data(rs);
1232             }
1233         }
1234         /* Didn't find anything this time, but try again on the new block */
1235         *again = true;
1236         return false;
1237     } else {
1238         /* Can go around again, but... */
1239         *again = true;
1240         /* We've found something so probably don't need to */
1241         return true;
1242     }
1243 }
1244
1245 /**
1246  * unqueue_page: gets a page of the queue
1247  *
1248  * Helper for 'get_queued_page' - gets a page off the queue
1249  *
1250  * Returns the block of the page (or NULL if none available)
1251  *
1252  * @rs: current RAM state
1253  * @offset: used to return the offset within the RAMBlock
1254  */
1255 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1256 {
1257     RAMBlock *block = NULL;
1258
1259     qemu_mutex_lock(&rs->src_page_req_mutex);
1260     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1261         struct RAMSrcPageRequest *entry =
1262                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1263         block = entry->rb;
1264         *offset = entry->offset;
1265
1266         if (entry->len > TARGET_PAGE_SIZE) {
1267             entry->len -= TARGET_PAGE_SIZE;
1268             entry->offset += TARGET_PAGE_SIZE;
1269         } else {
1270             memory_region_unref(block->mr);
1271             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1272             g_free(entry);
1273         }
1274     }
1275     qemu_mutex_unlock(&rs->src_page_req_mutex);
1276
1277     return block;
1278 }
1279
1280 /**
1281  * get_queued_page: unqueue a page from the postocpy requests
1282  *
1283  * Skips pages that are already sent (!dirty)
1284  *
1285  * Returns if a queued page is found
1286  *
1287  * @rs: current RAM state
1288  * @pss: data about the state of the current dirty page scan
1289  */
1290 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1291 {
1292     RAMBlock  *block;
1293     ram_addr_t offset;
1294     bool dirty;
1295
1296     do {
1297         block = unqueue_page(rs, &offset);
1298         /*
1299          * We're sending this page, and since it's postcopy nothing else
1300          * will dirty it, and we must make sure it doesn't get sent again
1301          * even if this queue request was received after the background
1302          * search already sent it.
1303          */
1304         if (block) {
1305             unsigned long page;
1306
1307             page = offset >> TARGET_PAGE_BITS;
1308             dirty = test_bit(page, block->bmap);
1309             if (!dirty) {
1310                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1311                        page, test_bit(page, block->unsentmap));
1312             } else {
1313                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1314             }
1315         }
1316
1317     } while (block && !dirty);
1318
1319     if (block) {
1320         /*
1321          * As soon as we start servicing pages out of order, then we have
1322          * to kill the bulk stage, since the bulk stage assumes
1323          * in (migration_bitmap_find_and_reset_dirty) that every page is
1324          * dirty, that's no longer true.
1325          */
1326         rs->ram_bulk_stage = false;
1327
1328         /*
1329          * We want the background search to continue from the queued page
1330          * since the guest is likely to want other pages near to the page
1331          * it just requested.
1332          */
1333         pss->block = block;
1334         pss->page = offset >> TARGET_PAGE_BITS;
1335     }
1336
1337     return !!block;
1338 }
1339
1340 /**
1341  * migration_page_queue_free: drop any remaining pages in the ram
1342  * request queue
1343  *
1344  * It should be empty at the end anyway, but in error cases there may
1345  * be some left.  in case that there is any page left, we drop it.
1346  *
1347  */
1348 static void migration_page_queue_free(RAMState *rs)
1349 {
1350     struct RAMSrcPageRequest *mspr, *next_mspr;
1351     /* This queue generally should be empty - but in the case of a failed
1352      * migration might have some droppings in.
1353      */
1354     rcu_read_lock();
1355     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1356         memory_region_unref(mspr->rb->mr);
1357         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1358         g_free(mspr);
1359     }
1360     rcu_read_unlock();
1361 }
1362
1363 /**
1364  * ram_save_queue_pages: queue the page for transmission
1365  *
1366  * A request from postcopy destination for example.
1367  *
1368  * Returns zero on success or negative on error
1369  *
1370  * @rbname: Name of the RAMBLock of the request. NULL means the
1371  *          same that last one.
1372  * @start: starting address from the start of the RAMBlock
1373  * @len: length (in bytes) to send
1374  */
1375 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1376 {
1377     RAMBlock *ramblock;
1378     RAMState *rs = ram_state;
1379
1380     ram_counters.postcopy_requests++;
1381     rcu_read_lock();
1382     if (!rbname) {
1383         /* Reuse last RAMBlock */
1384         ramblock = rs->last_req_rb;
1385
1386         if (!ramblock) {
1387             /*
1388              * Shouldn't happen, we can't reuse the last RAMBlock if
1389              * it's the 1st request.
1390              */
1391             error_report("ram_save_queue_pages no previous block");
1392             goto err;
1393         }
1394     } else {
1395         ramblock = qemu_ram_block_by_name(rbname);
1396
1397         if (!ramblock) {
1398             /* We shouldn't be asked for a non-existent RAMBlock */
1399             error_report("ram_save_queue_pages no block '%s'", rbname);
1400             goto err;
1401         }
1402         rs->last_req_rb = ramblock;
1403     }
1404     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1405     if (start+len > ramblock->used_length) {
1406         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1407                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1408                      __func__, start, len, ramblock->used_length);
1409         goto err;
1410     }
1411
1412     struct RAMSrcPageRequest *new_entry =
1413         g_malloc0(sizeof(struct RAMSrcPageRequest));
1414     new_entry->rb = ramblock;
1415     new_entry->offset = start;
1416     new_entry->len = len;
1417
1418     memory_region_ref(ramblock->mr);
1419     qemu_mutex_lock(&rs->src_page_req_mutex);
1420     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1421     qemu_mutex_unlock(&rs->src_page_req_mutex);
1422     rcu_read_unlock();
1423
1424     return 0;
1425
1426 err:
1427     rcu_read_unlock();
1428     return -1;
1429 }
1430
1431 /**
1432  * ram_save_target_page: save one target page
1433  *
1434  * Returns the number of pages written
1435  *
1436  * @rs: current RAM state
1437  * @ms: current migration state
1438  * @pss: data about the page we want to send
1439  * @last_stage: if we are at the completion stage
1440  */
1441 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1442                                 bool last_stage)
1443 {
1444     int res = 0;
1445
1446     /* Check the pages is dirty and if it is send it */
1447     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1448         /*
1449          * If xbzrle is on, stop using the data compression after first
1450          * round of migration even if compression is enabled. In theory,
1451          * xbzrle can do better than compression.
1452          */
1453         if (migrate_use_compression() &&
1454             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1455             res = ram_save_compressed_page(rs, pss, last_stage);
1456         } else {
1457             res = ram_save_page(rs, pss, last_stage);
1458         }
1459
1460         if (res < 0) {
1461             return res;
1462         }
1463         if (pss->block->unsentmap) {
1464             clear_bit(pss->page, pss->block->unsentmap);
1465         }
1466     }
1467
1468     return res;
1469 }
1470
1471 /**
1472  * ram_save_host_page: save a whole host page
1473  *
1474  * Starting at *offset send pages up to the end of the current host
1475  * page. It's valid for the initial offset to point into the middle of
1476  * a host page in which case the remainder of the hostpage is sent.
1477  * Only dirty target pages are sent. Note that the host page size may
1478  * be a huge page for this block.
1479  * The saving stops at the boundary of the used_length of the block
1480  * if the RAMBlock isn't a multiple of the host page size.
1481  *
1482  * Returns the number of pages written or negative on error
1483  *
1484  * @rs: current RAM state
1485  * @ms: current migration state
1486  * @pss: data about the page we want to send
1487  * @last_stage: if we are at the completion stage
1488  */
1489 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1490                               bool last_stage)
1491 {
1492     int tmppages, pages = 0;
1493     size_t pagesize_bits =
1494         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1495
1496     do {
1497         tmppages = ram_save_target_page(rs, pss, last_stage);
1498         if (tmppages < 0) {
1499             return tmppages;
1500         }
1501
1502         pages += tmppages;
1503         pss->page++;
1504     } while ((pss->page & (pagesize_bits - 1)) &&
1505              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1506
1507     /* The offset we leave with is the last one we looked at */
1508     pss->page--;
1509     return pages;
1510 }
1511
1512 /**
1513  * ram_find_and_save_block: finds a dirty page and sends it to f
1514  *
1515  * Called within an RCU critical section.
1516  *
1517  * Returns the number of pages written where zero means no dirty pages
1518  *
1519  * @rs: current RAM state
1520  * @last_stage: if we are at the completion stage
1521  *
1522  * On systems where host-page-size > target-page-size it will send all the
1523  * pages in a host page that are dirty.
1524  */
1525
1526 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1527 {
1528     PageSearchStatus pss;
1529     int pages = 0;
1530     bool again, found;
1531
1532     /* No dirty page as there is zero RAM */
1533     if (!ram_bytes_total()) {
1534         return pages;
1535     }
1536
1537     pss.block = rs->last_seen_block;
1538     pss.page = rs->last_page;
1539     pss.complete_round = false;
1540
1541     if (!pss.block) {
1542         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1543     }
1544
1545     do {
1546         again = true;
1547         found = get_queued_page(rs, &pss);
1548
1549         if (!found) {
1550             /* priority queue empty, so just search for something dirty */
1551             found = find_dirty_block(rs, &pss, &again);
1552         }
1553
1554         if (found) {
1555             pages = ram_save_host_page(rs, &pss, last_stage);
1556         }
1557     } while (!pages && again);
1558
1559     rs->last_seen_block = pss.block;
1560     rs->last_page = pss.page;
1561
1562     return pages;
1563 }
1564
1565 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1566 {
1567     uint64_t pages = size / TARGET_PAGE_SIZE;
1568
1569     if (zero) {
1570         ram_counters.duplicate += pages;
1571     } else {
1572         ram_counters.normal += pages;
1573         ram_counters.transferred += size;
1574         qemu_update_position(f, size);
1575     }
1576 }
1577
1578 uint64_t ram_bytes_total(void)
1579 {
1580     RAMBlock *block;
1581     uint64_t total = 0;
1582
1583     rcu_read_lock();
1584     RAMBLOCK_FOREACH(block) {
1585         total += block->used_length;
1586     }
1587     rcu_read_unlock();
1588     return total;
1589 }
1590
/*
 * Allocate XBZRLE.decoded_buf, a buffer of TARGET_PAGE_SIZE bytes.
 * NOTE(review): presumably scratch space for decoding incoming XBZRLE
 * pages - confirm against the load path.
 */
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}
1595
/*
 * Free XBZRLE.decoded_buf and reset the pointer to NULL so that it is
 * not left dangling.
 */
static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}
1601
1602 static void ram_state_cleanup(RAMState **rsp)
1603 {
1604     migration_page_queue_free(*rsp);
1605     qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1606     qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1607     g_free(*rsp);
1608     *rsp = NULL;
1609 }
1610
1611 static void xbzrle_cleanup(void)
1612 {
1613     XBZRLE_cache_lock();
1614     if (XBZRLE.cache) {
1615         cache_fini(XBZRLE.cache);
1616         g_free(XBZRLE.encoded_buf);
1617         g_free(XBZRLE.current_buf);
1618         g_free(XBZRLE.zero_target_page);
1619         XBZRLE.cache = NULL;
1620         XBZRLE.encoded_buf = NULL;
1621         XBZRLE.current_buf = NULL;
1622         XBZRLE.zero_target_page = NULL;
1623     }
1624     XBZRLE_cache_unlock();
1625 }
1626
1627 static void ram_save_cleanup(void *opaque)
1628 {
1629     RAMState **rsp = opaque;
1630     RAMBlock *block;
1631
1632     /* caller have hold iothread lock or is in a bh, so there is
1633      * no writing race against this migration_bitmap
1634      */
1635     memory_global_dirty_log_stop();
1636
1637     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1638         g_free(block->bmap);
1639         block->bmap = NULL;
1640         g_free(block->unsentmap);
1641         block->unsentmap = NULL;
1642     }
1643
1644     xbzrle_cleanup();
1645     compress_threads_save_cleanup();
1646     ram_state_cleanup(rsp);
1647 }
1648
1649 static void ram_state_reset(RAMState *rs)
1650 {
1651     rs->last_seen_block = NULL;
1652     rs->last_sent_block = NULL;
1653     rs->last_page = 0;
1654     rs->last_version = ram_list.version;
1655     rs->ram_bulk_stage = true;
1656 }
1657
1658 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1659
/*
 * Dump the first 'pages' bits of 'todump' to stderr, up to 128 bits per
 * line, as '1' (set) or '.' (clear), each line prefixed with the index
 * of its first bit.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; lines whose bits all have this value are not printed.
 * 'todump' is passed straight to test_bit() without a NULL check.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    const int64_t max_cols = 128;
    char linebuf[129];
    int64_t row_start;

    for (row_start = 0; row_start < pages; row_start += max_cols) {
        int64_t cols = max_cols;
        int64_t col;
        bool interesting = false;

        /* The final line may be shorter than a full row */
        if (row_start + cols > pages) {
            cols = pages - row_start;
        }

        for (col = 0; col < cols; col++) {
            bool bit = test_bit(row_start + col, todump);

            linebuf[col] = bit ? '1' : '.';
            if (bit != expected) {
                interesting = true;
            }
        }

        if (interesting) {
            linebuf[cols] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", row_start, linebuf);
        }
    }
}
1693
1694 /* **** functions for postcopy ***** */
1695
1696 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1697 {
1698     struct RAMBlock *block;
1699
1700     RAMBLOCK_FOREACH(block) {
1701         unsigned long *bitmap = block->bmap;
1702         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1703         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1704
1705         while (run_start < range) {
1706             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1707             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1708                               (run_end - run_start) << TARGET_PAGE_BITS);
1709             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1710         }
1711     }
1712 }
1713
1714 /**
1715  * postcopy_send_discard_bm_ram: discard a RAMBlock
1716  *
1717  * Returns zero on success
1718  *
1719  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1720  * Note: At this point the 'unsentmap' is the processed bitmap combined
1721  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1722  *
1723  * @ms: current migration state
1724  * @pds: state for postcopy
1725  * @start: RAMBlock starting page
1726  * @length: RAMBlock size
1727  */
1728 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1729                                         PostcopyDiscardState *pds,
1730                                         RAMBlock *block)
1731 {
1732     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1733     unsigned long current;
1734     unsigned long *unsentmap = block->unsentmap;
1735
1736     for (current = 0; current < end; ) {
1737         unsigned long one = find_next_bit(unsentmap, end, current);
1738
1739         if (one <= end) {
1740             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1741             unsigned long discard_length;
1742
1743             if (zero >= end) {
1744                 discard_length = end - one;
1745             } else {
1746                 discard_length = zero - one;
1747             }
1748             if (discard_length) {
1749                 postcopy_discard_send_range(ms, pds, one, discard_length);
1750             }
1751             current = one + discard_length;
1752         } else {
1753             current = one;
1754         }
1755     }
1756
1757     return 0;
1758 }
1759
1760 /**
1761  * postcopy_each_ram_send_discard: discard all RAMBlocks
1762  *
1763  * Returns 0 for success or negative for error
1764  *
1765  * Utility for the outgoing postcopy code.
1766  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1767  *   passing it bitmap indexes and name.
1768  * (qemu_ram_foreach_block ends up passing unscaled lengths
1769  *  which would mean postcopy code would have to deal with target page)
1770  *
1771  * @ms: current migration state
1772  */
1773 static int postcopy_each_ram_send_discard(MigrationState *ms)
1774 {
1775     struct RAMBlock *block;
1776     int ret;
1777
1778     RAMBLOCK_FOREACH(block) {
1779         PostcopyDiscardState *pds =
1780             postcopy_discard_send_init(ms, block->idstr);
1781
1782         /*
1783          * Postcopy sends chunks of bitmap over the wire, but it
1784          * just needs indexes at this point, avoids it having
1785          * target page specific code.
1786          */
1787         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1788         postcopy_discard_send_finish(ms, pds);
1789         if (ret) {
1790             return ret;
1791         }
1792     }
1793
1794     return 0;
1795 }
1796
1797 /**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1799  *
1800  * Helper for postcopy_chunk_hostpages; it's called twice to
1801  * canonicalize the two bitmaps, that are similar, but one is
1802  * inverted.
1803  *
1804  * Postcopy requires that all target pages in a hostpage are dirty or
1805  * clean, not a mix.  This function canonicalizes the bitmaps.
1806  *
1807  * @ms: current migration state
1808  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1809  *               otherwise we need to canonicalize partially dirty host pages
1810  * @block: block that contains the page we want to canonicalize
1811  * @pds: state for postcopy
1812  */
1813 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1814                                           RAMBlock *block,
1815                                           PostcopyDiscardState *pds)
1816 {
1817     RAMState *rs = ram_state;
1818     unsigned long *bitmap = block->bmap;
1819     unsigned long *unsentmap = block->unsentmap;
1820     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1821     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1822     unsigned long run_start;
1823
1824     if (block->page_size == TARGET_PAGE_SIZE) {
1825         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1826         return;
1827     }
1828
1829     if (unsent_pass) {
1830         /* Find a sent page */
1831         run_start = find_next_zero_bit(unsentmap, pages, 0);
1832     } else {
1833         /* Find a dirty page */
1834         run_start = find_next_bit(bitmap, pages, 0);
1835     }
1836
1837     while (run_start < pages) {
1838         bool do_fixup = false;
1839         unsigned long fixup_start_addr;
1840         unsigned long host_offset;
1841
1842         /*
1843          * If the start of this run of pages is in the middle of a host
1844          * page, then we need to fixup this host page.
1845          */
1846         host_offset = run_start % host_ratio;
1847         if (host_offset) {
1848             do_fixup = true;
1849             run_start -= host_offset;
1850             fixup_start_addr = run_start;
1851             /* For the next pass */
1852             run_start = run_start + host_ratio;
1853         } else {
1854             /* Find the end of this run */
1855             unsigned long run_end;
1856             if (unsent_pass) {
1857                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1858             } else {
1859                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1860             }
1861             /*
1862              * If the end isn't at the start of a host page, then the
1863              * run doesn't finish at the end of a host page
1864              * and we need to discard.
1865              */
1866             host_offset = run_end % host_ratio;
1867             if (host_offset) {
1868                 do_fixup = true;
1869                 fixup_start_addr = run_end - host_offset;
1870                 /*
1871                  * This host page has gone, the next loop iteration starts
1872                  * from after the fixup
1873                  */
1874                 run_start = fixup_start_addr + host_ratio;
1875             } else {
1876                 /*
1877                  * No discards on this iteration, next loop starts from
1878                  * next sent/dirty page
1879                  */
1880                 run_start = run_end + 1;
1881             }
1882         }
1883
1884         if (do_fixup) {
1885             unsigned long page;
1886
1887             /* Tell the destination to discard this page */
1888             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1889                 /* For the unsent_pass we:
1890                  *     discard partially sent pages
1891                  * For the !unsent_pass (dirty) we:
1892                  *     discard partially dirty pages that were sent
1893                  *     (any partially sent pages were already discarded
1894                  *     by the previous unsent_pass)
1895                  */
1896                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1897                                             host_ratio);
1898             }
1899
1900             /* Clean up the bitmap */
1901             for (page = fixup_start_addr;
1902                  page < fixup_start_addr + host_ratio; page++) {
1903                 /* All pages in this host page are now not sent */
1904                 set_bit(page, unsentmap);
1905
1906                 /*
1907                  * Remark them as dirty, updating the count for any pages
1908                  * that weren't previously dirty.
1909                  */
1910                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1911             }
1912         }
1913
1914         if (unsent_pass) {
1915             /* Find the next sent page for the next iteration */
1916             run_start = find_next_zero_bit(unsentmap, pages, run_start);
1917         } else {
1918             /* Find the next dirty page for the next iteration */
1919             run_start = find_next_bit(bitmap, pages, run_start);
1920         }
1921     }
1922 }
1923
1924 /**
1925  * postcopy_chuck_hostpages: discrad any partially sent host page
1926  *
1927  * Utility for the outgoing postcopy code.
1928  *
1929  * Discard any partially sent host-page size chunks, mark any partially
1930  * dirty host-page size chunks as all dirty.  In this case the host-page
1931  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1932  *
1933  * Returns zero on success
1934  *
1935  * @ms: current migration state
1936  * @block: block we want to work with
1937  */
1938 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1939 {
1940     PostcopyDiscardState *pds =
1941         postcopy_discard_send_init(ms, block->idstr);
1942
1943     /* First pass: Discard all partially sent host pages */
1944     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1945     /*
1946      * Second pass: Ensure that all partially dirty host pages are made
1947      * fully dirty.
1948      */
1949     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1950
1951     postcopy_discard_send_finish(ms, pds);
1952     return 0;
1953 }
1954
1955 /**
1956  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1957  *
1958  * Returns zero on success
1959  *
1960  * Transmit the set of pages to be discarded after precopy to the target
1961  * these are pages that:
1962  *     a) Have been previously transmitted but are now dirty again
1963  *     b) Pages that have never been transmitted, this ensures that
1964  *        any pages on the destination that have been mapped by background
1965  *        tasks get discarded (transparent huge pages is the specific concern)
1966  * Hopefully this is pretty sparse
1967  *
1968  * @ms: current migration state
1969  */
1970 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1971 {
1972     RAMState *rs = ram_state;
1973     RAMBlock *block;
1974     int ret;
1975
1976     rcu_read_lock();
1977
1978     /* This should be our last sync, the src is now paused */
1979     migration_bitmap_sync(rs);
1980
1981     /* Easiest way to make sure we don't resume in the middle of a host-page */
1982     rs->last_seen_block = NULL;
1983     rs->last_sent_block = NULL;
1984     rs->last_page = 0;
1985
1986     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1987         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1988         unsigned long *bitmap = block->bmap;
1989         unsigned long *unsentmap = block->unsentmap;
1990
1991         if (!unsentmap) {
1992             /* We don't have a safe way to resize the sentmap, so
1993              * if the bitmap was resized it will be NULL at this
1994              * point.
1995              */
1996             error_report("migration ram resized during precopy phase");
1997             rcu_read_unlock();
1998             return -EINVAL;
1999         }
2000         /* Deal with TPS != HPS and huge pages */
2001         ret = postcopy_chunk_hostpages(ms, block);
2002         if (ret) {
2003             rcu_read_unlock();
2004             return ret;
2005         }
2006
2007         /*
2008          * Update the unsentmap to be unsentmap = unsentmap | dirty
2009          */
2010         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2011 #ifdef DEBUG_POSTCOPY
2012         ram_debug_dump_bitmap(unsentmap, true, pages);
2013 #endif
2014     }
2015     trace_ram_postcopy_send_discard_bitmap();
2016
2017     ret = postcopy_each_ram_send_discard(ms);
2018     rcu_read_unlock();
2019
2020     return ret;
2021 }
2022
2023 /**
2024  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2025  *
2026  * Returns zero on success
2027  *
2028  * @rbname: name of the RAMBlock of the request. NULL means the
2029  *          same that last one.
2030  * @start: RAMBlock starting page
2031  * @length: RAMBlock size
2032  */
2033 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2034 {
2035     int ret = -1;
2036
2037     trace_ram_discard_range(rbname, start, length);
2038
2039     rcu_read_lock();
2040     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2041
2042     if (!rb) {
2043         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2044         goto err;
2045     }
2046
2047     bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2048                  length >> qemu_target_page_bits());
2049     ret = ram_block_discard_range(rb, start, length);
2050
2051 err:
2052     rcu_read_unlock();
2053
2054     return ret;
2055 }
2056
2057 /*
2058  * For every allocation, we will try not to crash the VM if the
2059  * allocation failed.
2060  */
2061 static int xbzrle_init(void)
2062 {
2063     Error *local_err = NULL;
2064
2065     if (!migrate_use_xbzrle()) {
2066         return 0;
2067     }
2068
2069     XBZRLE_cache_lock();
2070
2071     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2072     if (!XBZRLE.zero_target_page) {
2073         error_report("%s: Error allocating zero page", __func__);
2074         goto err_out;
2075     }
2076
2077     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2078                               TARGET_PAGE_SIZE, &local_err);
2079     if (!XBZRLE.cache) {
2080         error_report_err(local_err);
2081         goto free_zero_page;
2082     }
2083
2084     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2085     if (!XBZRLE.encoded_buf) {
2086         error_report("%s: Error allocating encoded_buf", __func__);
2087         goto free_cache;
2088     }
2089
2090     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2091     if (!XBZRLE.current_buf) {
2092         error_report("%s: Error allocating current_buf", __func__);
2093         goto free_encoded_buf;
2094     }
2095
2096     /* We are all good */
2097     XBZRLE_cache_unlock();
2098     return 0;
2099
2100 free_encoded_buf:
2101     g_free(XBZRLE.encoded_buf);
2102     XBZRLE.encoded_buf = NULL;
2103 free_cache:
2104     cache_fini(XBZRLE.cache);
2105     XBZRLE.cache = NULL;
2106 free_zero_page:
2107     g_free(XBZRLE.zero_target_page);
2108     XBZRLE.zero_target_page = NULL;
2109 err_out:
2110     XBZRLE_cache_unlock();
2111     return -ENOMEM;
2112 }
2113
2114 static int ram_state_init(RAMState **rsp)
2115 {
2116     *rsp = g_try_new0(RAMState, 1);
2117
2118     if (!*rsp) {
2119         error_report("%s: Init ramstate fail", __func__);
2120         return -1;
2121     }
2122
2123     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2124     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2125     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2126
2127     /*
2128      * Count the total number of pages used by ram blocks not including any
2129      * gaps due to alignment or unplugs.
2130      */
2131     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2132
2133     ram_state_reset(*rsp);
2134
2135     return 0;
2136 }
2137
2138 static void ram_list_init_bitmaps(void)
2139 {
2140     RAMBlock *block;
2141     unsigned long pages;
2142
2143     /* Skip setting bitmap if there is no RAM */
2144     if (ram_bytes_total()) {
2145         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2146             pages = block->max_length >> TARGET_PAGE_BITS;
2147             block->bmap = bitmap_new(pages);
2148             bitmap_set(block->bmap, 0, pages);
2149             if (migrate_postcopy_ram()) {
2150                 block->unsentmap = bitmap_new(pages);
2151                 bitmap_set(block->unsentmap, 0, pages);
2152             }
2153         }
2154     }
2155 }
2156
2157 static void ram_init_bitmaps(RAMState *rs)
2158 {
2159     /* For memory_global_dirty_log_start below.  */
2160     qemu_mutex_lock_iothread();
2161     qemu_mutex_lock_ramlist();
2162     rcu_read_lock();
2163
2164     ram_list_init_bitmaps();
2165     memory_global_dirty_log_start();
2166     migration_bitmap_sync(rs);
2167
2168     rcu_read_unlock();
2169     qemu_mutex_unlock_ramlist();
2170     qemu_mutex_unlock_iothread();
2171 }
2172
2173 static int ram_init_all(RAMState **rsp)
2174 {
2175     if (ram_state_init(rsp)) {
2176         return -1;
2177     }
2178
2179     if (xbzrle_init()) {
2180         ram_state_cleanup(rsp);
2181         return -1;
2182     }
2183
2184     ram_init_bitmaps(*rsp);
2185
2186     return 0;
2187 }
2188
2189 /*
2190  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2191  * long-running RCU critical section.  When rcu-reclaims in the code
2192  * start to become numerous it will be necessary to reduce the
2193  * granularity of these critical sections.
2194  */
2195
2196 /**
2197  * ram_save_setup: Setup RAM for migration
2198  *
2199  * Returns zero to indicate success and negative for error
2200  *
2201  * @f: QEMUFile where to send the data
2202  * @opaque: RAMState pointer
2203  */
2204 static int ram_save_setup(QEMUFile *f, void *opaque)
2205 {
2206     RAMState **rsp = opaque;
2207     RAMBlock *block;
2208
2209     /* migration has already setup the bitmap, reuse it. */
2210     if (!migration_in_colo_state()) {
2211         if (ram_init_all(rsp) != 0) {
2212             return -1;
2213         }
2214     }
2215     (*rsp)->f = f;
2216
2217     rcu_read_lock();
2218
2219     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2220
2221     RAMBLOCK_FOREACH(block) {
2222         qemu_put_byte(f, strlen(block->idstr));
2223         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2224         qemu_put_be64(f, block->used_length);
2225         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2226             qemu_put_be64(f, block->page_size);
2227         }
2228     }
2229
2230     rcu_read_unlock();
2231     compress_threads_save_setup();
2232
2233     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2234     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2235
2236     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2237
2238     return 0;
2239 }
2240
2241 /**
2242  * ram_save_iterate: iterative stage for migration
2243  *
2244  * Returns zero to indicate success and negative for error
2245  *
2246  * @f: QEMUFile where to send the data
2247  * @opaque: RAMState pointer
2248  */
2249 static int ram_save_iterate(QEMUFile *f, void *opaque)
2250 {
2251     RAMState **temp = opaque;
2252     RAMState *rs = *temp;
2253     int ret;
2254     int i;
2255     int64_t t0;
2256     int done = 0;
2257
2258     rcu_read_lock();
2259     if (ram_list.version != rs->last_version) {
2260         ram_state_reset(rs);
2261     }
2262
2263     /* Read version before ram_list.blocks */
2264     smp_rmb();
2265
2266     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2267
2268     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2269     i = 0;
2270     while ((ret = qemu_file_rate_limit(f)) == 0) {
2271         int pages;
2272
2273         pages = ram_find_and_save_block(rs, false);
2274         /* no more pages to sent */
2275         if (pages == 0) {
2276             done = 1;
2277             break;
2278         }
2279         rs->iterations++;
2280
2281         /* we want to check in the 1st loop, just in case it was the 1st time
2282            and we had to sync the dirty bitmap.
2283            qemu_get_clock_ns() is a bit expensive, so we only check each some
2284            iterations
2285         */
2286         if ((i & 63) == 0) {
2287             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2288             if (t1 > MAX_WAIT) {
2289                 trace_ram_save_iterate_big_wait(t1, i);
2290                 break;
2291             }
2292         }
2293         i++;
2294     }
2295     flush_compressed_data(rs);
2296     rcu_read_unlock();
2297
2298     /*
2299      * Must occur before EOS (or any QEMUFile operation)
2300      * because of RDMA protocol.
2301      */
2302     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2303
2304     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2305     ram_counters.transferred += 8;
2306
2307     ret = qemu_file_get_error(f);
2308     if (ret < 0) {
2309         return ret;
2310     }
2311
2312     return done;
2313 }
2314
2315 /**
2316  * ram_save_complete: function called to send the remaining amount of ram
2317  *
2318  * Returns zero to indicate success
2319  *
2320  * Called with iothread lock
2321  *
2322  * @f: QEMUFile where to send the data
2323  * @opaque: RAMState pointer
2324  */
2325 static int ram_save_complete(QEMUFile *f, void *opaque)
2326 {
2327     RAMState **temp = opaque;
2328     RAMState *rs = *temp;
2329
2330     rcu_read_lock();
2331
2332     if (!migration_in_postcopy()) {
2333         migration_bitmap_sync(rs);
2334     }
2335
2336     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2337
2338     /* try transferring iterative blocks of memory */
2339
2340     /* flush all remaining blocks regardless of rate limiting */
2341     while (true) {
2342         int pages;
2343
2344         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2345         /* no more blocks to sent */
2346         if (pages == 0) {
2347             break;
2348         }
2349     }
2350
2351     flush_compressed_data(rs);
2352     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2353
2354     rcu_read_unlock();
2355
2356     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2357
2358     return 0;
2359 }
2360
2361 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2362                              uint64_t *non_postcopiable_pending,
2363                              uint64_t *postcopiable_pending)
2364 {
2365     RAMState **temp = opaque;
2366     RAMState *rs = *temp;
2367     uint64_t remaining_size;
2368
2369     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2370
2371     if (!migration_in_postcopy() &&
2372         remaining_size < max_size) {
2373         qemu_mutex_lock_iothread();
2374         rcu_read_lock();
2375         migration_bitmap_sync(rs);
2376         rcu_read_unlock();
2377         qemu_mutex_unlock_iothread();
2378         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2379     }
2380
2381     if (migrate_postcopy_ram()) {
2382         /* We can do postcopy, and all the data is postcopiable */
2383         *postcopiable_pending += remaining_size;
2384     } else {
2385         *non_postcopiable_pending += remaining_size;
2386     }
2387 }
2388
2389 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2390 {
2391     unsigned int xh_len;
2392     int xh_flags;
2393     uint8_t *loaded_data;
2394
2395     /* extract RLE header */
2396     xh_flags = qemu_get_byte(f);
2397     xh_len = qemu_get_be16(f);
2398
2399     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2400         error_report("Failed to load XBZRLE page - wrong compression!");
2401         return -1;
2402     }
2403
2404     if (xh_len > TARGET_PAGE_SIZE) {
2405         error_report("Failed to load XBZRLE page - len overflow!");
2406         return -1;
2407     }
2408     loaded_data = XBZRLE.decoded_buf;
2409     /* load data and decode */
2410     /* it can change loaded_data to point to an internal buffer */
2411     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2412
2413     /* decode RLE */
2414     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2415                              TARGET_PAGE_SIZE) == -1) {
2416         error_report("Failed to load XBZRLE page - decode error!");
2417         return -1;
2418     }
2419
2420     return 0;
2421 }
2422
2423 /**
2424  * ram_block_from_stream: read a RAMBlock id from the migration stream
2425  *
2426  * Must be called from within a rcu critical section.
2427  *
2428  * Returns a pointer from within the RCU-protected ram_list.
2429  *
2430  * @f: QEMUFile where to read the data from
2431  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2432  */
2433 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2434 {
2435     static RAMBlock *block = NULL;
2436     char id[256];
2437     uint8_t len;
2438
2439     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2440         if (!block) {
2441             error_report("Ack, bad migration stream!");
2442             return NULL;
2443         }
2444         return block;
2445     }
2446
2447     len = qemu_get_byte(f);
2448     qemu_get_buffer(f, (uint8_t *)id, len);
2449     id[len] = 0;
2450
2451     block = qemu_ram_block_by_name(id);
2452     if (!block) {
2453         error_report("Can't find block %s", id);
2454         return NULL;
2455     }
2456
2457     return block;
2458 }
2459
2460 static inline void *host_from_ram_block_offset(RAMBlock *block,
2461                                                ram_addr_t offset)
2462 {
2463     if (!offset_in_ramblock(block, offset)) {
2464         return NULL;
2465     }
2466
2467     return block->host + offset;
2468 }
2469
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * The fill is skipped when @ch is zero and the range already reads as
 * all zeroes.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Already zero and asked to zero: nothing to write */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
2486
2487 static void *do_data_decompress(void *opaque)
2488 {
2489     DecompressParam *param = opaque;
2490     unsigned long pagesize;
2491     uint8_t *des;
2492     int len;
2493
2494     qemu_mutex_lock(&param->mutex);
2495     while (!param->quit) {
2496         if (param->des) {
2497             des = param->des;
2498             len = param->len;
2499             param->des = 0;
2500             qemu_mutex_unlock(&param->mutex);
2501
2502             pagesize = TARGET_PAGE_SIZE;
2503             /* uncompress() will return failed in some case, especially
2504              * when the page is dirted when doing the compression, it's
2505              * not a problem because the dirty page will be retransferred
2506              * and uncompress() won't break the data in other pages.
2507              */
2508             uncompress((Bytef *)des, &pagesize,
2509                        (const Bytef *)param->compbuf, len);
2510
2511             qemu_mutex_lock(&decomp_done_lock);
2512             param->done = true;
2513             qemu_cond_signal(&decomp_done_cond);
2514             qemu_mutex_unlock(&decomp_done_lock);
2515
2516             qemu_mutex_lock(&param->mutex);
2517         } else {
2518             qemu_cond_wait(&param->cond, &param->mutex);
2519         }
2520     }
2521     qemu_mutex_unlock(&param->mutex);
2522
2523     return NULL;
2524 }
2525
2526 static void wait_for_decompress_done(void)
2527 {
2528     int idx, thread_count;
2529
2530     if (!migrate_use_compression()) {
2531         return;
2532     }
2533
2534     thread_count = migrate_decompress_threads();
2535     qemu_mutex_lock(&decomp_done_lock);
2536     for (idx = 0; idx < thread_count; idx++) {
2537         while (!decomp_param[idx].done) {
2538             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2539         }
2540     }
2541     qemu_mutex_unlock(&decomp_done_lock);
2542 }
2543
2544 static void compress_threads_load_setup(void)
2545 {
2546     int i, thread_count;
2547
2548     if (!migrate_use_compression()) {
2549         return;
2550     }
2551     thread_count = migrate_decompress_threads();
2552     decompress_threads = g_new0(QemuThread, thread_count);
2553     decomp_param = g_new0(DecompressParam, thread_count);
2554     qemu_mutex_init(&decomp_done_lock);
2555     qemu_cond_init(&decomp_done_cond);
2556     for (i = 0; i < thread_count; i++) {
2557         qemu_mutex_init(&decomp_param[i].mutex);
2558         qemu_cond_init(&decomp_param[i].cond);
2559         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2560         decomp_param[i].done = true;
2561         decomp_param[i].quit = false;
2562         qemu_thread_create(decompress_threads + i, "decompress",
2563                            do_data_decompress, decomp_param + i,
2564                            QEMU_THREAD_JOINABLE);
2565     }
2566 }
2567
2568 static void compress_threads_load_cleanup(void)
2569 {
2570     int i, thread_count;
2571
2572     if (!migrate_use_compression()) {
2573         return;
2574     }
2575     thread_count = migrate_decompress_threads();
2576     for (i = 0; i < thread_count; i++) {
2577         qemu_mutex_lock(&decomp_param[i].mutex);
2578         decomp_param[i].quit = true;
2579         qemu_cond_signal(&decomp_param[i].cond);
2580         qemu_mutex_unlock(&decomp_param[i].mutex);
2581     }
2582     for (i = 0; i < thread_count; i++) {
2583         qemu_thread_join(decompress_threads + i);
2584         qemu_mutex_destroy(&decomp_param[i].mutex);
2585         qemu_cond_destroy(&decomp_param[i].cond);
2586         g_free(decomp_param[i].compbuf);
2587     }
2588     g_free(decompress_threads);
2589     g_free(decomp_param);
2590     decompress_threads = NULL;
2591     decomp_param = NULL;
2592 }
2593
2594 static void decompress_data_with_multi_threads(QEMUFile *f,
2595                                                void *host, int len)
2596 {
2597     int idx, thread_count;
2598
2599     thread_count = migrate_decompress_threads();
2600     qemu_mutex_lock(&decomp_done_lock);
2601     while (true) {
2602         for (idx = 0; idx < thread_count; idx++) {
2603             if (decomp_param[idx].done) {
2604                 decomp_param[idx].done = false;
2605                 qemu_mutex_lock(&decomp_param[idx].mutex);
2606                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2607                 decomp_param[idx].des = host;
2608                 decomp_param[idx].len = len;
2609                 qemu_cond_signal(&decomp_param[idx].cond);
2610                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2611                 break;
2612             }
2613         }
2614         if (idx < thread_count) {
2615             break;
2616         } else {
2617             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2618         }
2619     }
2620     qemu_mutex_unlock(&decomp_done_lock);
2621 }
2622
2623 /**
2624  * ram_load_setup: Setup RAM for migration incoming side
2625  *
2626  * Returns zero to indicate success and negative for error
2627  *
2628  * @f: QEMUFile where to receive the data
2629  * @opaque: RAMState pointer
2630  */
2631 static int ram_load_setup(QEMUFile *f, void *opaque)
2632 {
2633     xbzrle_load_setup();
2634     compress_threads_load_setup();
2635     ramblock_recv_map_init();
2636     return 0;
2637 }
2638
2639 static int ram_load_cleanup(void *opaque)
2640 {
2641     RAMBlock *rb;
2642     xbzrle_load_cleanup();
2643     compress_threads_load_cleanup();
2644
2645     RAMBLOCK_FOREACH(rb) {
2646         g_free(rb->receivedmap);
2647         rb->receivedmap = NULL;
2648     }
2649     return 0;
2650 }
2651
2652 /**
2653  * ram_postcopy_incoming_init: allocate postcopy data structures
2654  *
2655  * Returns 0 for success and negative if there was one error
2656  *
2657  * @mis: current migration incoming state
2658  *
2659  * Allocate data structures etc needed by incoming migration with
2660  * postcopy-ram. postcopy-ram's similarly names
2661  * postcopy_ram_incoming_init does the work.
2662  */
2663 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2664 {
2665     unsigned long ram_pages = last_ram_page();
2666
2667     return postcopy_ram_incoming_init(mis, ram_pages);
2668 }
2669
2670 /**
2671  * ram_load_postcopy: load a page in postcopy case
2672  *
2673  * Returns 0 for success or -errno in case of error
2674  *
2675  * Called in postcopy mode by ram_load().
2676  * rcu_read_lock is taken prior to this being called.
2677  *
2678  * @f: QEMUFile where to send the data
2679  */
2680 static int ram_load_postcopy(QEMUFile *f)
2681 {
2682     int flags = 0, ret = 0;
2683     bool place_needed = false;
2684     bool matching_page_sizes = false;
2685     MigrationIncomingState *mis = migration_incoming_get_current();
2686     /* Temporary page that is later 'placed' */
2687     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2688     void *last_host = NULL;
2689     bool all_zero = false;
2690
2691     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2692         ram_addr_t addr;
2693         void *host = NULL;
2694         void *page_buffer = NULL;
2695         void *place_source = NULL;
2696         RAMBlock *block = NULL;
2697         uint8_t ch;
2698
2699         addr = qemu_get_be64(f);
2700         flags = addr & ~TARGET_PAGE_MASK;
2701         addr &= TARGET_PAGE_MASK;
2702
2703         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2704         place_needed = false;
2705         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2706             block = ram_block_from_stream(f, flags);
2707
2708             host = host_from_ram_block_offset(block, addr);
2709             if (!host) {
2710                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2711                 ret = -EINVAL;
2712                 break;
2713             }
2714             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2715             /*
2716              * Postcopy requires that we place whole host pages atomically;
2717              * these may be huge pages for RAMBlocks that are backed by
2718              * hugetlbfs.
2719              * To make it atomic, the data is read into a temporary page
2720              * that's moved into place later.
2721              * The migration protocol uses,  possibly smaller, target-pages
2722              * however the source ensures it always sends all the components
2723              * of a host page in order.
2724              */
2725             page_buffer = postcopy_host_page +
2726                           ((uintptr_t)host & (block->page_size - 1));
2727             /* If all TP are zero then we can optimise the place */
2728             if (!((uintptr_t)host & (block->page_size - 1))) {
2729                 all_zero = true;
2730             } else {
2731                 /* not the 1st TP within the HP */
2732                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2733                     error_report("Non-sequential target page %p/%p",
2734                                   host, last_host);
2735                     ret = -EINVAL;
2736                     break;
2737                 }
2738             }
2739
2740
2741             /*
2742              * If it's the last part of a host page then we place the host
2743              * page
2744              */
2745             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2746                                      (block->page_size - 1)) == 0;
2747             place_source = postcopy_host_page;
2748         }
2749         last_host = host;
2750
2751         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2752         case RAM_SAVE_FLAG_ZERO:
2753             ch = qemu_get_byte(f);
2754             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2755             if (ch) {
2756                 all_zero = false;
2757             }
2758             break;
2759
2760         case RAM_SAVE_FLAG_PAGE:
2761             all_zero = false;
2762             if (!place_needed || !matching_page_sizes) {
2763                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2764             } else {
2765                 /* Avoids the qemu_file copy during postcopy, which is
2766                  * going to do a copy later; can only do it when we
2767                  * do this read in one go (matching page sizes)
2768                  */
2769                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2770                                          TARGET_PAGE_SIZE);
2771             }
2772             break;
2773         case RAM_SAVE_FLAG_EOS:
2774             /* normal exit */
2775             break;
2776         default:
2777             error_report("Unknown combination of migration flags: %#x"
2778                          " (postcopy mode)", flags);
2779             ret = -EINVAL;
2780         }
2781
2782         if (place_needed) {
2783             /* This gets called at the last target page in the host page */
2784             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2785
2786             if (all_zero) {
2787                 ret = postcopy_place_page_zero(mis, place_dest,
2788                                                block);
2789             } else {
2790                 ret = postcopy_place_page(mis, place_dest,
2791                                           place_source, block);
2792             }
2793         }
2794         if (!ret) {
2795             ret = qemu_file_get_error(f);
2796         }
2797     }
2798
2799     return ret;
2800 }
2801
2802 static bool postcopy_is_advised(void)
2803 {
2804     PostcopyState ps = postcopy_state_get();
2805     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
2806 }
2807
2808 static bool postcopy_is_running(void)
2809 {
2810     PostcopyState ps = postcopy_state_get();
2811     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
2812 }
2813
2814 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2815 {
2816     int flags = 0, ret = 0, invalid_flags = 0;
2817     static uint64_t seq_iter;
2818     int len = 0;
2819     /*
2820      * If system is running in postcopy mode, page inserts to host memory must
2821      * be atomic
2822      */
2823     bool postcopy_running = postcopy_is_running();
2824     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2825     bool postcopy_advised = postcopy_is_advised();
2826
2827     seq_iter++;
2828
2829     if (version_id != 4) {
2830         ret = -EINVAL;
2831     }
2832
2833     if (!migrate_use_compression()) {
2834         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2835     }
2836     /* This RCU critical section can be very long running.
2837      * When RCU reclaims in the code start to become numerous,
2838      * it will be necessary to reduce the granularity of this
2839      * critical section.
2840      */
2841     rcu_read_lock();
2842
2843     if (postcopy_running) {
2844         ret = ram_load_postcopy(f);
2845     }
2846
2847     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2848         ram_addr_t addr, total_ram_bytes;
2849         void *host = NULL;
2850         uint8_t ch;
2851
2852         addr = qemu_get_be64(f);
2853         flags = addr & ~TARGET_PAGE_MASK;
2854         addr &= TARGET_PAGE_MASK;
2855
2856         if (flags & invalid_flags) {
2857             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2858                 error_report("Received an unexpected compressed page");
2859             }
2860
2861             ret = -EINVAL;
2862             break;
2863         }
2864
2865         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2866                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2867             RAMBlock *block = ram_block_from_stream(f, flags);
2868
2869             host = host_from_ram_block_offset(block, addr);
2870             if (!host) {
2871                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2872                 ret = -EINVAL;
2873                 break;
2874             }
2875             ramblock_recv_bitmap_set(block, host);
2876             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2877         }
2878
2879         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2880         case RAM_SAVE_FLAG_MEM_SIZE:
2881             /* Synchronize RAM block list */
2882             total_ram_bytes = addr;
2883             while (!ret && total_ram_bytes) {
2884                 RAMBlock *block;
2885                 char id[256];
2886                 ram_addr_t length;
2887
2888                 len = qemu_get_byte(f);
2889                 qemu_get_buffer(f, (uint8_t *)id, len);
2890                 id[len] = 0;
2891                 length = qemu_get_be64(f);
2892
2893                 block = qemu_ram_block_by_name(id);
2894                 if (block) {
2895                     if (length != block->used_length) {
2896                         Error *local_err = NULL;
2897
2898                         ret = qemu_ram_resize(block, length,
2899                                               &local_err);
2900                         if (local_err) {
2901                             error_report_err(local_err);
2902                         }
2903                     }
2904                     /* For postcopy we need to check hugepage sizes match */
2905                     if (postcopy_advised &&
2906                         block->page_size != qemu_host_page_size) {
2907                         uint64_t remote_page_size = qemu_get_be64(f);
2908                         if (remote_page_size != block->page_size) {
2909                             error_report("Mismatched RAM page size %s "
2910                                          "(local) %zd != %" PRId64,
2911                                          id, block->page_size,
2912                                          remote_page_size);
2913                             ret = -EINVAL;
2914                         }
2915                     }
2916                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2917                                           block->idstr);
2918                 } else {
2919                     error_report("Unknown ramblock \"%s\", cannot "
2920                                  "accept migration", id);
2921                     ret = -EINVAL;
2922                 }
2923
2924                 total_ram_bytes -= length;
2925             }
2926             break;
2927
2928         case RAM_SAVE_FLAG_ZERO:
2929             ch = qemu_get_byte(f);
2930             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2931             break;
2932
2933         case RAM_SAVE_FLAG_PAGE:
2934             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2935             break;
2936
2937         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2938             len = qemu_get_be32(f);
2939             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2940                 error_report("Invalid compressed data length: %d", len);
2941                 ret = -EINVAL;
2942                 break;
2943             }
2944             decompress_data_with_multi_threads(f, host, len);
2945             break;
2946
2947         case RAM_SAVE_FLAG_XBZRLE:
2948             if (load_xbzrle(f, addr, host) < 0) {
2949                 error_report("Failed to decompress XBZRLE page at "
2950                              RAM_ADDR_FMT, addr);
2951                 ret = -EINVAL;
2952                 break;
2953             }
2954             break;
2955         case RAM_SAVE_FLAG_EOS:
2956             /* normal exit */
2957             break;
2958         default:
2959             if (flags & RAM_SAVE_FLAG_HOOK) {
2960                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2961             } else {
2962                 error_report("Unknown combination of migration flags: %#x",
2963                              flags);
2964                 ret = -EINVAL;
2965             }
2966         }
2967         if (!ret) {
2968             ret = qemu_file_get_error(f);
2969         }
2970     }
2971
2972     wait_for_decompress_done();
2973     rcu_read_unlock();
2974     trace_ram_load_complete(ret, seq_iter);
2975     return ret;
2976 }
2977
/*
 * SaveVMHandlers .has_postcopy callback for RAM: forwards the result of
 * migrate_postcopy_ram().  @opaque is not used.
 */
static bool ram_has_postcopy(void *opaque)
{
    bool postcopy_ram = migrate_postcopy_ram();

    return postcopy_ram;
}
2982
2983 static SaveVMHandlers savevm_ram_handlers = {
2984     .save_setup = ram_save_setup,
2985     .save_live_iterate = ram_save_iterate,
2986     .save_live_complete_postcopy = ram_save_complete,
2987     .save_live_complete_precopy = ram_save_complete,
2988     .has_postcopy = ram_has_postcopy,
2989     .save_live_pending = ram_save_pending,
2990     .load_state = ram_load,
2991     .save_cleanup = ram_save_cleanup,
2992     .load_setup = ram_load_setup,
2993     .load_cleanup = ram_load_cleanup,
2994 };
2995
2996 void ram_mig_init(void)
2997 {
2998     qemu_mutex_init(&XBZRLE.lock);
2999     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
3000 }