hw: virtio-pmem: detach the element from the virtqueue when an error occurs
[qemu/ar7.git] / migration / ram.c
blob 433489d6332212384d75e854d6ebed547f086656
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/sysemu.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
60 /***********************************************************/
61 /* ram save/restore */
63 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
64 * worked for pages that were filled with the same char. We switched
65 * it to only search for the zero value. And to avoid confusion with
66 * RAM_SAVE_FLAG_COMPRESS_PAGE we just renamed it.
69 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
70 #define RAM_SAVE_FLAG_ZERO 0x02
71 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
72 #define RAM_SAVE_FLAG_PAGE 0x08
73 #define RAM_SAVE_FLAG_EOS 0x10
74 #define RAM_SAVE_FLAG_CONTINUE 0x20
75 #define RAM_SAVE_FLAG_XBZRLE 0x40
76 /* 0x80 is reserved in migration.h start with 0x100 next */
77 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
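/*
 * Editor's note: an illustrative sketch, not part of the original file. The
 * flags above are OR'ed into the low bits of the page offset written by
 * save_page_header(), which works because RAM offsets are target-page
 * aligned. For example, a zero page at block offset 0x2000 that continues
 * the previously announced block would be described on the wire as
 *
 *     qemu_put_be64(f, 0x2000 | RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_CONTINUE);
 *
 * i.e. the 64-bit value 0x2022.
 */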
79 static inline bool is_zero_range(uint8_t *p, uint64_t size)
81 return buffer_is_zero(p, size);
84 XBZRLECacheStats xbzrle_counters;
86 /* struct contains XBZRLE cache and a static page
87 used by the compression */
88 static struct {
89 /* buffer used for XBZRLE encoding */
90 uint8_t *encoded_buf;
91 /* buffer for storing page content */
92 uint8_t *current_buf;
93 /* Cache for XBZRLE, Protected by lock. */
94 PageCache *cache;
95 QemuMutex lock;
96 /* it will store a page full of zeros */
97 uint8_t *zero_target_page;
98 /* buffer used for XBZRLE decoding */
99 uint8_t *decoded_buf;
100 } XBZRLE;
102 static void XBZRLE_cache_lock(void)
104 if (migrate_use_xbzrle())
105 qemu_mutex_lock(&XBZRLE.lock);
108 static void XBZRLE_cache_unlock(void)
110 if (migrate_use_xbzrle())
111 qemu_mutex_unlock(&XBZRLE.lock);
115 * xbzrle_cache_resize: resize the xbzrle cache
117 * This function is called from qmp_migrate_set_cache_size in main
118 * thread, possibly while a migration is in progress. A running
119 * migration may be using the cache and might finish during this call,
120 * hence changes to the cache are protected by XBZRLE.lock().
122 * Returns 0 for success or -1 for error
124 * @new_size: new cache size
125 * @errp: set *errp if the check failed, with reason
127 int xbzrle_cache_resize(int64_t new_size, Error **errp)
129 PageCache *new_cache;
130 int64_t ret = 0;
132 /* Check for truncation */
133 if (new_size != (size_t)new_size) {
134 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
135 "exceeding address space");
136 return -1;
139 if (new_size == migrate_xbzrle_cache_size()) {
140 /* nothing to do */
141 return 0;
144 XBZRLE_cache_lock();
146 if (XBZRLE.cache != NULL) {
147 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
148 if (!new_cache) {
149 ret = -1;
150 goto out;
153 cache_fini(XBZRLE.cache);
154 XBZRLE.cache = new_cache;
156 out:
157 XBZRLE_cache_unlock();
158 return ret;
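/*
 * Editor's note: a minimal usage sketch, not part of the original file,
 * loosely modelled on the QMP caller mentioned above; the variable names
 * (new_size, errp) are illustrative.
 *
 *     Error *local_err = NULL;
 *     if (xbzrle_cache_resize(new_size, &local_err) < 0) {
 *         error_propagate(errp, local_err);
 *         return;
 *     }
 */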
161 bool ramblock_is_ignored(RAMBlock *block)
163 return !qemu_ram_is_migratable(block) ||
164 (migrate_ignore_shared() && qemu_ram_is_shared(block));
167 #undef RAMBLOCK_FOREACH
169 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
171 RAMBlock *block;
172 int ret = 0;
174 RCU_READ_LOCK_GUARD();
176 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
177 ret = func(block, opaque);
178 if (ret) {
179 break;
182 return ret;
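/*
 * Editor's note: an illustrative sketch, not part of the original file. A
 * caller passes a RAMBlockIterFunc; returning non-zero from the callback
 * stops the iteration early. The helper below is hypothetical.
 *
 *     static int sum_used_length(RAMBlock *rb, void *opaque)
 *     {
 *         *(uint64_t *)opaque += rb->used_length;
 *         return 0;    (keep iterating)
 *     }
 *
 *     uint64_t total = 0;
 *     foreach_not_ignored_block(sum_used_length, &total);
 */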
185 static void ramblock_recv_map_init(void)
187 RAMBlock *rb;
189 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
190 assert(!rb->receivedmap);
191 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
195 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
197 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
198 rb->receivedmap);
201 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
203 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
206 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
208 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
211 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
212 size_t nr)
214 bitmap_set_atomic(rb->receivedmap,
215 ramblock_recv_bitmap_offset(host_addr, rb),
216 nr);
219 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
222 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
224 * Returns >0 if success with sent bytes, or <0 if error.
226 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
227 const char *block_name)
229 RAMBlock *block = qemu_ram_block_by_name(block_name);
230 unsigned long *le_bitmap, nbits;
231 uint64_t size;
233 if (!block) {
234 error_report("%s: invalid block name: %s", __func__, block_name);
235 return -1;
238 nbits = block->used_length >> TARGET_PAGE_BITS;
241 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
242 * machines we may need 4 more bytes for padding (see below
243 * comment). So extend it a bit beforehand.
245 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
248 * Always use little endian when sending the bitmap. This is
249 * required when the source and destination VMs are not using the
250 * same endianness. (Note: big endian won't work.)
252 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
254 /* Size of the bitmap, in bytes */
255 size = DIV_ROUND_UP(nbits, 8);
258 * size is always aligned to 8 bytes for 64bit machines, but it
259 * may not be true for 32bit machines. We need this padding to
260 * make sure the migration can survive even between 32bit and
261 * 64bit machines.
263 size = ROUND_UP(size, 8);
265 qemu_put_be64(file, size);
266 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
268 * Mark as an end, in case the middle part is screwed up due to
269 * some "mysterious" reason.
271 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
272 qemu_fflush(file);
274 g_free(le_bitmap);
276 if (qemu_file_get_error(file)) {
277 return qemu_file_get_error(file);
280 return size + sizeof(size);
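/*
 * Editor's note: an illustrative summary, not part of the original file. The
 * stream written above for one RAMBlock is
 *
 *     8 bytes   be64 size (bitmap bytes, rounded up to a multiple of 8)
 *     size      little-endian receivedmap bitmap
 *     8 bytes   be64 RAMBLOCK_RECV_BITMAP_ENDING (sanity marker)
 *
 * and the function returns size + 8 on success.
 */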
284 * An outstanding page request, on the source, having been received
285 * and queued
287 struct RAMSrcPageRequest {
288 RAMBlock *rb;
289 hwaddr offset;
290 hwaddr len;
292 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
295 /* State of RAM for migration */
296 struct RAMState {
297 /* QEMUFile used for this migration */
298 QEMUFile *f;
299 /* Last block that we have visited searching for dirty pages */
300 RAMBlock *last_seen_block;
301 /* Last block from where we have sent data */
302 RAMBlock *last_sent_block;
303 /* Last dirty target page we have sent */
304 ram_addr_t last_page;
305 /* last ram version we have seen */
306 uint32_t last_version;
307 /* We are in the first round */
308 bool ram_bulk_stage;
309 /* The free page optimization is enabled */
310 bool fpo_enabled;
311 /* How many times we have dirty too many pages */
312 int dirty_rate_high_cnt;
313 /* these variables are used for bitmap sync */
314 /* last time we did a full bitmap_sync */
315 int64_t time_last_bitmap_sync;
316 /* bytes transferred at start_time */
317 uint64_t bytes_xfer_prev;
318 /* number of dirty pages since start_time */
319 uint64_t num_dirty_pages_period;
320 /* xbzrle misses since the beginning of the period */
321 uint64_t xbzrle_cache_miss_prev;
322 /* Amount of xbzrle pages since the beginning of the period */
323 uint64_t xbzrle_pages_prev;
324 /* Amount of xbzrle encoded bytes since the beginning of the period */
325 uint64_t xbzrle_bytes_prev;
327 /* compression statistics since the beginning of the period */
329 * number of times there was no free thread to compress data
329 uint64_t compress_thread_busy_prev;
331 * amount of bytes after compression
331 uint64_t compressed_size_prev;
332 /* amount of compressed pages */
333 uint64_t compress_pages_prev;
335 /* total handled target pages at the beginning of period */
336 uint64_t target_page_count_prev;
337 /* total handled target pages since start */
338 uint64_t target_page_count;
339 /* number of dirty bits in the bitmap */
340 uint64_t migration_dirty_pages;
341 /* Protects modification of the bitmap and migration dirty pages */
342 QemuMutex bitmap_mutex;
343 /* The RAMBlock used in the last src_page_requests */
344 RAMBlock *last_req_rb;
345 /* Queue of outstanding page requests from the destination */
346 QemuMutex src_page_req_mutex;
347 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
349 typedef struct RAMState RAMState;
351 static RAMState *ram_state;
353 static NotifierWithReturnList precopy_notifier_list;
355 void precopy_infrastructure_init(void)
357 notifier_with_return_list_init(&precopy_notifier_list);
360 void precopy_add_notifier(NotifierWithReturn *n)
362 notifier_with_return_list_add(&precopy_notifier_list, n);
365 void precopy_remove_notifier(NotifierWithReturn *n)
367 notifier_with_return_remove(n);
370 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
372 PrecopyNotifyData pnd;
373 pnd.reason = reason;
374 pnd.errp = errp;
376 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
379 void precopy_enable_free_page_optimization(void)
381 if (!ram_state) {
382 return;
385 ram_state->fpo_enabled = true;
388 uint64_t ram_bytes_remaining(void)
390 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 0;
394 MigrationStats ram_counters;
396 /* used by the search for pages to send */
397 struct PageSearchStatus {
398 /* Current block being searched */
399 RAMBlock *block;
400 /* Current page to search from */
401 unsigned long page;
402 /* Set once we wrap around */
403 bool complete_round;
405 typedef struct PageSearchStatus PageSearchStatus;
407 CompressionStats compression_counters;
409 struct CompressParam {
410 bool done;
411 bool quit;
412 bool zero_page;
413 QEMUFile *file;
414 QemuMutex mutex;
415 QemuCond cond;
416 RAMBlock *block;
417 ram_addr_t offset;
419 /* internally used fields */
420 z_stream stream;
421 uint8_t *originbuf;
423 typedef struct CompressParam CompressParam;
425 struct DecompressParam {
426 bool done;
427 bool quit;
428 QemuMutex mutex;
429 QemuCond cond;
430 void *des;
431 uint8_t *compbuf;
432 int len;
433 z_stream stream;
435 typedef struct DecompressParam DecompressParam;
437 static CompressParam *comp_param;
438 static QemuThread *compress_threads;
439 /* comp_done_cond is used to wake up the migration thread when
440 * one of the compression threads has finished the compression.
441 * comp_done_lock is used together with comp_done_cond.
443 static QemuMutex comp_done_lock;
444 static QemuCond comp_done_cond;
445 /* The empty QEMUFileOps will be used by file in CompressParam */
446 static const QEMUFileOps empty_ops = { };
448 static QEMUFile *decomp_file;
449 static DecompressParam *decomp_param;
450 static QemuThread *decompress_threads;
451 static QemuMutex decomp_done_lock;
452 static QemuCond decomp_done_cond;
454 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
455 ram_addr_t offset, uint8_t *source_buf);
457 static void *do_data_compress(void *opaque)
459 CompressParam *param = opaque;
460 RAMBlock *block;
461 ram_addr_t offset;
462 bool zero_page;
464 qemu_mutex_lock(&param->mutex);
465 while (!param->quit) {
466 if (param->block) {
467 block = param->block;
468 offset = param->offset;
469 param->block = NULL;
470 qemu_mutex_unlock(&param->mutex);
472 zero_page = do_compress_ram_page(param->file, &param->stream,
473 block, offset, param->originbuf);
475 qemu_mutex_lock(&comp_done_lock);
476 param->done = true;
477 param->zero_page = zero_page;
478 qemu_cond_signal(&comp_done_cond);
479 qemu_mutex_unlock(&comp_done_lock);
481 qemu_mutex_lock(&param->mutex);
482 } else {
483 qemu_cond_wait(&param->cond, &param->mutex);
486 qemu_mutex_unlock(&param->mutex);
488 return NULL;
491 static void compress_threads_save_cleanup(void)
493 int i, thread_count;
495 if (!migrate_use_compression() || !comp_param) {
496 return;
499 thread_count = migrate_compress_threads();
500 for (i = 0; i < thread_count; i++) {
502 * we use it as an indicator which shows if the thread is
503 * properly init'd or not
505 if (!comp_param[i].file) {
506 break;
509 qemu_mutex_lock(&comp_param[i].mutex);
510 comp_param[i].quit = true;
511 qemu_cond_signal(&comp_param[i].cond);
512 qemu_mutex_unlock(&comp_param[i].mutex);
514 qemu_thread_join(compress_threads + i);
515 qemu_mutex_destroy(&comp_param[i].mutex);
516 qemu_cond_destroy(&comp_param[i].cond);
517 deflateEnd(&comp_param[i].stream);
518 g_free(comp_param[i].originbuf);
519 qemu_fclose(comp_param[i].file);
520 comp_param[i].file = NULL;
522 qemu_mutex_destroy(&comp_done_lock);
523 qemu_cond_destroy(&comp_done_cond);
524 g_free(compress_threads);
525 g_free(comp_param);
526 compress_threads = NULL;
527 comp_param = NULL;
530 static int compress_threads_save_setup(void)
532 int i, thread_count;
534 if (!migrate_use_compression()) {
535 return 0;
537 thread_count = migrate_compress_threads();
538 compress_threads = g_new0(QemuThread, thread_count);
539 comp_param = g_new0(CompressParam, thread_count);
540 qemu_cond_init(&comp_done_cond);
541 qemu_mutex_init(&comp_done_lock);
542 for (i = 0; i < thread_count; i++) {
543 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
544 if (!comp_param[i].originbuf) {
545 goto exit;
548 if (deflateInit(&comp_param[i].stream,
549 migrate_compress_level()) != Z_OK) {
550 g_free(comp_param[i].originbuf);
551 goto exit;
554 /* comp_param[i].file is just used as a dummy buffer to save data,
555 * set its ops to empty.
557 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
558 comp_param[i].done = true;
559 comp_param[i].quit = false;
560 qemu_mutex_init(&comp_param[i].mutex);
561 qemu_cond_init(&comp_param[i].cond);
562 qemu_thread_create(compress_threads + i, "compress",
563 do_data_compress, comp_param + i,
564 QEMU_THREAD_JOINABLE);
566 return 0;
568 exit:
569 compress_threads_save_cleanup();
570 return -1;
574 * save_page_header: write page header to wire
576 * If this is the 1st block, it also writes the block identification
578 * Returns the number of bytes written
580 * @f: QEMUFile where to send the data
581 * @block: block that contains the page we want to send
582 * @offset: offset inside the block for the page
583 * in the lower bits, it contains flags
585 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
586 ram_addr_t offset)
588 size_t size, len;
590 if (block == rs->last_sent_block) {
591 offset |= RAM_SAVE_FLAG_CONTINUE;
593 qemu_put_be64(f, offset);
594 size = 8;
596 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
597 len = strlen(block->idstr);
598 qemu_put_byte(f, len);
599 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
600 size += 1 + len;
601 rs->last_sent_block = block;
603 return size;
607 * mig_throttle_guest_down: throttle down the guest
609 * Reduce amount of guest cpu execution to hopefully slow down memory
610 * writes. If guest dirty memory rate is reduced below the rate at
611 * which we can transfer pages to the destination then we should be
612 * able to complete migration. Some workloads dirty memory way too
613 * fast and will not effectively converge, even with auto-converge.
615 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
616 uint64_t bytes_dirty_threshold)
618 MigrationState *s = migrate_get_current();
619 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
620 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
621 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
622 int pct_max = s->parameters.max_cpu_throttle;
624 uint64_t throttle_now = cpu_throttle_get_percentage();
625 uint64_t cpu_now, cpu_ideal, throttle_inc;
627 /* We have not started throttling yet. Let's start it. */
628 if (!cpu_throttle_active()) {
629 cpu_throttle_set(pct_initial);
630 } else {
631 /* Throttling already on, just increase the rate */
632 if (!pct_tailslow) {
633 throttle_inc = pct_increment;
634 } else {
635 /* Compute the ideal CPU percentage used by the guest, which may
636 * make the dirty rate match the dirty rate threshold. */
637 cpu_now = 100 - throttle_now;
638 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
639 bytes_dirty_period);
640 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
642 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
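/*
 * Editor's note: a worked example of the tailslow path above, not part of
 * the original file. Assume throttle_now = 40, pct_increment = 10,
 * bytes_dirty_period = 400MB and bytes_dirty_threshold = 200MB:
 *
 *     cpu_now      = 100 - 40               = 60
 *     cpu_ideal    = 60 * (200MB / 400MB)   = 30
 *     throttle_inc = MIN(60 - 30, 10)       = 10
 *
 * so the throttle is raised to MIN(40 + 10, pct_max) = 50 (assuming a
 * pct_max of 99).
 */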
647 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
649 * @rs: current RAM state
650 * @current_addr: address for the zero page
652 * Update the xbzrle cache to reflect a page that's been sent as all 0.
653 * The important thing is that a stale (not-yet-0'd) page be replaced
654 * by the new data.
655 * As a bonus, if the page wasn't in the cache it gets added so that
656 * when a small write is made into the 0'd page it gets XBZRLE sent.
658 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
660 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
661 return;
664 /* We don't care if this fails to allocate a new cache page
665 * as long as it updated an old one */
666 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
667 ram_counters.dirty_sync_count);
670 #define ENCODING_FLAG_XBZRLE 0x1
673 * save_xbzrle_page: compress and send current page
675 * Returns: 1 means that we wrote the page
676 * 0 means that page is identical to the one already sent
677 * -1 means that xbzrle would be longer than normal
679 * @rs: current RAM state
680 * @current_data: pointer to the address of the page contents
681 * @current_addr: addr of the page
682 * @block: block that contains the page we want to send
683 * @offset: offset inside the block for the page
684 * @last_stage: if we are at the completion stage
686 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
687 ram_addr_t current_addr, RAMBlock *block,
688 ram_addr_t offset, bool last_stage)
690 int encoded_len = 0, bytes_xbzrle;
691 uint8_t *prev_cached_page;
693 if (!cache_is_cached(XBZRLE.cache, current_addr,
694 ram_counters.dirty_sync_count)) {
695 xbzrle_counters.cache_miss++;
696 if (!last_stage) {
697 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
698 ram_counters.dirty_sync_count) == -1) {
699 return -1;
700 } else {
701 /* update *current_data when the page has been
702 inserted into cache */
703 *current_data = get_cached_data(XBZRLE.cache, current_addr);
706 return -1;
710 * Reaching here means the page has hit the xbzrle cache, no matter what
711 * encoding result it is (normal encoding, overflow or skipping the page),
712 * count the page as encoded. This is used to calculate the encoding rate.
714 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
715 * 2nd page turns out to be skipped (i.e. no new bytes written to the
716 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
717 * skipped page included. In this way, the encoding rate can tell if the
718 * guest page is good for xbzrle encoding.
720 xbzrle_counters.pages++;
721 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
723 /* save current buffer into memory */
724 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
726 /* XBZRLE encoding (if there is no overflow) */
727 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
728 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
729 TARGET_PAGE_SIZE);
732 * Update the cache contents, so that it corresponds to the data
733 * sent, in all cases except where we skip the page.
735 if (!last_stage && encoded_len != 0) {
736 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
738 * In the case where we couldn't compress, ensure that the caller
739 * sends the data from the cache, since the guest might have
740 * changed the RAM since we copied it.
742 *current_data = prev_cached_page;
745 if (encoded_len == 0) {
746 trace_save_xbzrle_page_skipping();
747 return 0;
748 } else if (encoded_len == -1) {
749 trace_save_xbzrle_page_overflow();
750 xbzrle_counters.overflow++;
751 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
752 return -1;
755 /* Send XBZRLE based compressed page */
756 bytes_xbzrle = save_page_header(rs, rs->f, block,
757 offset | RAM_SAVE_FLAG_XBZRLE);
758 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
759 qemu_put_be16(rs->f, encoded_len);
760 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
761 bytes_xbzrle += encoded_len + 1 + 2;
763 * Like compressed_size (please see update_compress_thread_counts),
764 * the xbzrle encoded bytes don't count the 8 byte header with
765 * RAM_SAVE_FLAG_CONTINUE.
767 xbzrle_counters.bytes += bytes_xbzrle - 8;
768 ram_counters.transferred += bytes_xbzrle;
770 return 1;
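/*
 * Editor's note: an illustrative summary of the XBZRLE wire layout produced
 * above, not part of the original file:
 *
 *     save_page_header()      8 bytes (+ block id on the first page of a block)
 *     ENCODING_FLAG_XBZRLE    1 byte
 *     encoded_len             2 bytes, big endian
 *     XBZRLE-encoded data     encoded_len bytes
 *
 * which matches the "encoded_len + 1 + 2" accounting added to bytes_xbzrle.
 */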
774 * migration_bitmap_find_dirty: find the next dirty page from start
776 * Returns the page offset within memory region of the start of a dirty page
778 * @rs: current RAM state
779 * @rb: RAMBlock where to search for dirty pages
780 * @start: page where we start the search
782 static inline
783 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
784 unsigned long start)
786 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
787 unsigned long *bitmap = rb->bmap;
788 unsigned long next;
790 if (ramblock_is_ignored(rb)) {
791 return size;
795 * When the free page optimization is enabled, we need to check the bitmap
796 * to send the non-free pages rather than all the pages in the bulk stage.
798 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
799 next = start + 1;
800 } else {
801 next = find_next_bit(bitmap, size, start);
804 return next;
807 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
808 RAMBlock *rb,
809 unsigned long page)
811 bool ret;
813 qemu_mutex_lock(&rs->bitmap_mutex);
816 * Clear dirty bitmap if needed. This _must_ be called before we
817 * send any of the page in the chunk because we need to make sure
818 * we can capture further page content changes when we sync dirty
819 * log the next time. So as long as we are going to send any of
820 * the page in the chunk we clear the remote dirty bitmap for all.
821 * Clearing it earlier won't be a problem, but too late will.
823 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
824 uint8_t shift = rb->clear_bmap_shift;
825 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
826 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
829 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
830 * can make things easier sometimes since then start address
831 * of the small chunk will always be 64 pages aligned so the
832 * bitmap will always be aligned to unsigned long. We should
833 * even be able to remove this restriction but I'm simply
834 * keeping it.
836 assert(shift >= 6);
837 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
838 memory_region_clear_dirty_bitmap(rb->mr, start, size);
841 ret = test_and_clear_bit(page, rb->bmap);
843 if (ret) {
844 rs->migration_dirty_pages--;
846 qemu_mutex_unlock(&rs->bitmap_mutex);
848 return ret;
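/*
 * Editor's note: a worked example, not part of the original file. Assuming
 * a 4KB target page and a clear_bmap_shift of 18 (the usual default), each
 * clear_bmap bit covers 1ULL << (12 + 18) = 1GB of guest memory, so the
 * remote dirty bitmap is cleared in 1GB chunks the first time any page
 * inside the chunk is about to be sent.
 */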
851 /* Called with RCU critical section */
852 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
854 uint64_t new_dirty_pages =
855 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
857 rs->migration_dirty_pages += new_dirty_pages;
858 rs->num_dirty_pages_period += new_dirty_pages;
862 * ram_pagesize_summary: calculate all the pagesizes of a VM
864 * Returns a summary bitmap of the page sizes of all RAMBlocks
866 * For VMs with just normal pages this is equivalent to the host page
867 * size. If it's got some huge pages then it's the OR of all the
868 * different page sizes.
870 uint64_t ram_pagesize_summary(void)
872 RAMBlock *block;
873 uint64_t summary = 0;
875 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
876 summary |= block->page_size;
879 return summary;
882 uint64_t ram_get_total_transferred_pages(void)
884 return ram_counters.normal + ram_counters.duplicate +
885 compression_counters.pages + xbzrle_counters.pages;
888 static void migration_update_rates(RAMState *rs, int64_t end_time)
890 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
891 double compressed_size;
893 /* calculate period counters */
894 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
895 / (end_time - rs->time_last_bitmap_sync);
897 if (!page_count) {
898 return;
901 if (migrate_use_xbzrle()) {
902 double encoded_size, unencoded_size;
904 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
905 rs->xbzrle_cache_miss_prev) / page_count;
906 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
907 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
908 TARGET_PAGE_SIZE;
909 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
910 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
911 xbzrle_counters.encoding_rate = 0;
912 } else {
913 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
915 rs->xbzrle_pages_prev = xbzrle_counters.pages;
916 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
919 if (migrate_use_compression()) {
920 compression_counters.busy_rate = (double)(compression_counters.busy -
921 rs->compress_thread_busy_prev) / page_count;
922 rs->compress_thread_busy_prev = compression_counters.busy;
924 compressed_size = compression_counters.compressed_size -
925 rs->compressed_size_prev;
926 if (compressed_size) {
927 double uncompressed_size = (compression_counters.pages -
928 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
930 /* Compression-Ratio = Uncompressed-size / Compressed-size */
931 compression_counters.compression_rate =
932 uncompressed_size / compressed_size;
934 rs->compress_pages_prev = compression_counters.pages;
935 rs->compressed_size_prev = compression_counters.compressed_size;
940 static void migration_trigger_throttle(RAMState *rs)
942 MigrationState *s = migrate_get_current();
943 uint64_t threshold = s->parameters.throttle_trigger_threshold;
945 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
946 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
947 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
949 /* During block migration the auto-converge logic incorrectly detects
950 * that ram migration makes no progress. Avoid this by disabling the
951 * throttling logic during the bulk phase of block migration. */
952 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
953 /* The following detection logic can be refined later. For now:
954 Check to see if the ratio between dirtied bytes and the approx.
955 amount of bytes that just got transferred since the last time
956 we were in this routine reaches the threshold. If that happens
957 twice, start or increase throttling. */
959 if ((bytes_dirty_period > bytes_dirty_threshold) &&
960 (++rs->dirty_rate_high_cnt >= 2)) {
961 trace_migration_throttle();
962 rs->dirty_rate_high_cnt = 0;
963 mig_throttle_guest_down(bytes_dirty_period,
964 bytes_dirty_threshold);
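/*
 * Editor's note: a worked example, not part of the original file. With a
 * throttle_trigger_threshold of 50 and 1GB transferred during the last
 * period, bytes_dirty_threshold = 1GB * 50 / 100 = 512MB; if the guest
 * dirties more than 512MB in two consecutive periods, auto-converge starts
 * (or increases) CPU throttling via mig_throttle_guest_down().
 */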
969 static void migration_bitmap_sync(RAMState *rs)
971 RAMBlock *block;
972 int64_t end_time;
974 ram_counters.dirty_sync_count++;
976 if (!rs->time_last_bitmap_sync) {
977 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
980 trace_migration_bitmap_sync_start();
981 memory_global_dirty_log_sync();
983 qemu_mutex_lock(&rs->bitmap_mutex);
984 WITH_RCU_READ_LOCK_GUARD() {
985 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
986 ramblock_sync_dirty_bitmap(rs, block);
988 ram_counters.remaining = ram_bytes_remaining();
990 qemu_mutex_unlock(&rs->bitmap_mutex);
992 memory_global_after_dirty_log_sync();
993 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
995 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
997 /* more than 1 second = 1000 milliseconds */
998 if (end_time > rs->time_last_bitmap_sync + 1000) {
999 migration_trigger_throttle(rs);
1001 migration_update_rates(rs, end_time);
1003 rs->target_page_count_prev = rs->target_page_count;
1005 /* reset period counters */
1006 rs->time_last_bitmap_sync = end_time;
1007 rs->num_dirty_pages_period = 0;
1008 rs->bytes_xfer_prev = ram_counters.transferred;
1010 if (migrate_use_events()) {
1011 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1015 static void migration_bitmap_sync_precopy(RAMState *rs)
1017 Error *local_err = NULL;
1020 * The current notifier usage is just an optimization to migration, so we
1021 * don't stop the normal migration process in the error case.
1023 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1024 error_report_err(local_err);
1025 local_err = NULL;
1028 migration_bitmap_sync(rs);
1030 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1031 error_report_err(local_err);
1036 * save_zero_page_to_file: send the zero page to the file
1038 * Returns the size of data written to the file, 0 means the page is not
1039 * a zero page
1041 * @rs: current RAM state
1042 * @file: the file where the data is saved
1043 * @block: block that contains the page we want to send
1044 * @offset: offset inside the block for the page
1046 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1047 RAMBlock *block, ram_addr_t offset)
1049 uint8_t *p = block->host + offset;
1050 int len = 0;
1052 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1053 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1054 qemu_put_byte(file, 0);
1055 len += 1;
1057 return len;
1061 * save_zero_page: send the zero page to the stream
1063 * Returns the number of pages written.
1065 * @rs: current RAM state
1066 * @block: block that contains the page we want to send
1067 * @offset: offset inside the block for the page
1069 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1071 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1073 if (len) {
1074 ram_counters.duplicate++;
1075 ram_counters.transferred += len;
1076 return 1;
1078 return -1;
1081 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1083 if (!migrate_release_ram() || !migration_in_postcopy()) {
1084 return;
1087 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1091 * @pages: the number of pages written by the control path,
1092 * < 0 - error
1093 * > 0 - number of pages written
1095 * Return true if the page has been saved, otherwise false is returned.
1097 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1098 int *pages)
1100 uint64_t bytes_xmit = 0;
1101 int ret;
1103 *pages = -1;
1104 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1105 &bytes_xmit);
1106 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1107 return false;
1110 if (bytes_xmit) {
1111 ram_counters.transferred += bytes_xmit;
1112 *pages = 1;
1115 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1116 return true;
1119 if (bytes_xmit > 0) {
1120 ram_counters.normal++;
1121 } else if (bytes_xmit == 0) {
1122 ram_counters.duplicate++;
1125 return true;
1129 * directly send the page to the stream
1131 * Returns the number of pages written.
1133 * @rs: current RAM state
1134 * @block: block that contains the page we want to send
1135 * @offset: offset inside the block for the page
1136 * @buf: the page to be sent
1137 * @async: send the page asynchronously
1139 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1140 uint8_t *buf, bool async)
1142 ram_counters.transferred += save_page_header(rs, rs->f, block,
1143 offset | RAM_SAVE_FLAG_PAGE);
1144 if (async) {
1145 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1146 migrate_release_ram() &
1147 migration_in_postcopy());
1148 } else {
1149 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1151 ram_counters.transferred += TARGET_PAGE_SIZE;
1152 ram_counters.normal++;
1153 return 1;
1157 * ram_save_page: send the given page to the stream
1159 * Returns the number of pages written.
1160 * < 0 - error
1161 * >=0 - Number of pages written - this might legally be 0
1162 * if xbzrle noticed the page was the same.
1164 * @rs: current RAM state
1165 * @block: block that contains the page we want to send
1166 * @offset: offset inside the block for the page
1167 * @last_stage: if we are at the completion stage
1169 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1171 int pages = -1;
1172 uint8_t *p;
1173 bool send_async = true;
1174 RAMBlock *block = pss->block;
1175 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1176 ram_addr_t current_addr = block->offset + offset;
1178 p = block->host + offset;
1179 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1181 XBZRLE_cache_lock();
1182 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1183 migrate_use_xbzrle()) {
1184 pages = save_xbzrle_page(rs, &p, current_addr, block,
1185 offset, last_stage);
1186 if (!last_stage) {
1187 /* Can't send this cached data async, since the cache page
1188 * might get updated before it gets to the wire
1190 send_async = false;
1194 /* XBZRLE overflow or normal page */
1195 if (pages == -1) {
1196 pages = save_normal_page(rs, block, offset, p, send_async);
1199 XBZRLE_cache_unlock();
1201 return pages;
1204 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1205 ram_addr_t offset)
1207 if (multifd_queue_page(rs->f, block, offset) < 0) {
1208 return -1;
1210 ram_counters.normal++;
1212 return 1;
1215 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1216 ram_addr_t offset, uint8_t *source_buf)
1218 RAMState *rs = ram_state;
1219 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1220 bool zero_page = false;
1221 int ret;
1223 if (save_zero_page_to_file(rs, f, block, offset)) {
1224 zero_page = true;
1225 goto exit;
1228 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1231 * copy it to an internal buffer to avoid it being modified by the VM,
1232 * so that we can catch errors during compression and
1233 * decompression
1235 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1236 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1237 if (ret < 0) {
1238 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1239 error_report("compressed data failed!");
1240 return false;
1243 exit:
1244 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1245 return zero_page;
1248 static void
1249 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1251 ram_counters.transferred += bytes_xmit;
1253 if (param->zero_page) {
1254 ram_counters.duplicate++;
1255 return;
1258 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1259 compression_counters.compressed_size += bytes_xmit - 8;
1260 compression_counters.pages++;
1263 static bool save_page_use_compression(RAMState *rs);
1265 static void flush_compressed_data(RAMState *rs)
1267 int idx, len, thread_count;
1269 if (!save_page_use_compression(rs)) {
1270 return;
1272 thread_count = migrate_compress_threads();
1274 qemu_mutex_lock(&comp_done_lock);
1275 for (idx = 0; idx < thread_count; idx++) {
1276 while (!comp_param[idx].done) {
1277 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1280 qemu_mutex_unlock(&comp_done_lock);
1282 for (idx = 0; idx < thread_count; idx++) {
1283 qemu_mutex_lock(&comp_param[idx].mutex);
1284 if (!comp_param[idx].quit) {
1285 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1287 * it's safe to fetch zero_page without holding comp_done_lock
1288 * as there is no further request submitted to the thread,
1289 * i.e., the thread should be waiting for a request at this point.
1291 update_compress_thread_counts(&comp_param[idx], len);
1293 qemu_mutex_unlock(&comp_param[idx].mutex);
1297 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1298 ram_addr_t offset)
1300 param->block = block;
1301 param->offset = offset;
1304 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1305 ram_addr_t offset)
1307 int idx, thread_count, bytes_xmit = -1, pages = -1;
1308 bool wait = migrate_compress_wait_thread();
1310 thread_count = migrate_compress_threads();
1311 qemu_mutex_lock(&comp_done_lock);
1312 retry:
1313 for (idx = 0; idx < thread_count; idx++) {
1314 if (comp_param[idx].done) {
1315 comp_param[idx].done = false;
1316 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1317 qemu_mutex_lock(&comp_param[idx].mutex);
1318 set_compress_params(&comp_param[idx], block, offset);
1319 qemu_cond_signal(&comp_param[idx].cond);
1320 qemu_mutex_unlock(&comp_param[idx].mutex);
1321 pages = 1;
1322 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1323 break;
1328 * wait for the free thread if the user specifies 'compress-wait-thread',
1329 * otherwise we will post the page out in the main thread as a normal page.
1331 if (pages < 0 && wait) {
1332 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1333 goto retry;
1335 qemu_mutex_unlock(&comp_done_lock);
1337 return pages;
1341 * find_dirty_block: find the next dirty page and update any state
1342 * associated with the search process.
1344 * Returns true if a page is found
1346 * @rs: current RAM state
1347 * @pss: data about the state of the current dirty page scan
1348 * @again: set to false if the search has scanned the whole of RAM
1350 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1352 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1353 if (pss->complete_round && pss->block == rs->last_seen_block &&
1354 pss->page >= rs->last_page) {
1356 * We've been once around the RAM and haven't found anything.
1357 * Give up.
1359 *again = false;
1360 return false;
1362 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1363 >= pss->block->used_length) {
1364 /* Didn't find anything in this RAM Block */
1365 pss->page = 0;
1366 pss->block = QLIST_NEXT_RCU(pss->block, next);
1367 if (!pss->block) {
1369 * If memory migration starts over, we will meet a dirtied page
1370 * which may still exist in the compression threads' ring, so we
1371 * should flush the compressed data to make sure the new page
1372 * is not overwritten by the old one in the destination.
1374 * Also, if xbzrle is on, stop using the data compression at this
1375 * point. In theory, xbzrle can do better than compression.
1377 flush_compressed_data(rs);
1379 /* Hit the end of the list */
1380 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1381 /* Flag that we've looped */
1382 pss->complete_round = true;
1383 rs->ram_bulk_stage = false;
1385 /* Didn't find anything this time, but try again on the new block */
1386 *again = true;
1387 return false;
1388 } else {
1389 /* Can go around again, but... */
1390 *again = true;
1391 /* We've found something so probably don't need to */
1392 return true;
1397 * unqueue_page: gets a page of the queue
1399 * Helper for 'get_queued_page' - gets a page off the queue
1401 * Returns the block of the page (or NULL if none available)
1403 * @rs: current RAM state
1404 * @offset: used to return the offset within the RAMBlock
1406 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1408 RAMBlock *block = NULL;
1410 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1411 return NULL;
1414 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1415 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1416 struct RAMSrcPageRequest *entry =
1417 QSIMPLEQ_FIRST(&rs->src_page_requests);
1418 block = entry->rb;
1419 *offset = entry->offset;
1421 if (entry->len > TARGET_PAGE_SIZE) {
1422 entry->len -= TARGET_PAGE_SIZE;
1423 entry->offset += TARGET_PAGE_SIZE;
1424 } else {
1425 memory_region_unref(block->mr);
1426 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1427 g_free(entry);
1428 migration_consume_urgent_request();
1432 return block;
1436 * get_queued_page: unqueue a page from the postcopy requests
1438 * Skips pages that are already sent (!dirty)
1440 * Returns true if a queued page is found
1442 * @rs: current RAM state
1443 * @pss: data about the state of the current dirty page scan
1445 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1447 RAMBlock *block;
1448 ram_addr_t offset;
1449 bool dirty;
1451 do {
1452 block = unqueue_page(rs, &offset);
1454 * We're sending this page, and since it's postcopy nothing else
1455 * will dirty it, and we must make sure it doesn't get sent again
1456 * even if this queue request was received after the background
1457 * search already sent it.
1459 if (block) {
1460 unsigned long page;
1462 page = offset >> TARGET_PAGE_BITS;
1463 dirty = test_bit(page, block->bmap);
1464 if (!dirty) {
1465 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1466 page);
1467 } else {
1468 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1472 } while (block && !dirty);
1474 if (block) {
1476 * As soon as we start servicing pages out of order, then we have
1477 * to kill the bulk stage, since the bulk stage assumes
1478 * in (migration_bitmap_find_and_reset_dirty) that every page is
1479 * dirty, that's no longer true.
1481 rs->ram_bulk_stage = false;
1484 * We want the background search to continue from the queued page
1485 * since the guest is likely to want other pages near to the page
1486 * it just requested.
1488 pss->block = block;
1489 pss->page = offset >> TARGET_PAGE_BITS;
1492 * This unqueued page would break the "one round" check, even if it
1493 * is really rare.
1495 pss->complete_round = false;
1498 return !!block;
1502 * migration_page_queue_free: drop any remaining pages in the ram
1503 * request queue
1505 * It should be empty at the end anyway, but in error cases there may
1506 * be some left. In case any page is left, we drop it.
1509 static void migration_page_queue_free(RAMState *rs)
1511 struct RAMSrcPageRequest *mspr, *next_mspr;
1512 /* This queue generally should be empty - but in the case of a failed
1513 * migration might have some droppings in.
1515 RCU_READ_LOCK_GUARD();
1516 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1517 memory_region_unref(mspr->rb->mr);
1518 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1519 g_free(mspr);
1524 * ram_save_queue_pages: queue the page for transmission
1526 * A request from postcopy destination for example.
1528 * Returns zero on success or negative on error
1530 * @rbname: Name of the RAMBlock of the request. NULL means the
1531 * same as the last one.
1532 * @start: starting address from the start of the RAMBlock
1533 * @len: length (in bytes) to send
1535 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1537 RAMBlock *ramblock;
1538 RAMState *rs = ram_state;
1540 ram_counters.postcopy_requests++;
1541 RCU_READ_LOCK_GUARD();
1543 if (!rbname) {
1544 /* Reuse last RAMBlock */
1545 ramblock = rs->last_req_rb;
1547 if (!ramblock) {
1549 * Shouldn't happen, we can't reuse the last RAMBlock if
1550 * it's the 1st request.
1552 error_report("ram_save_queue_pages no previous block");
1553 return -1;
1555 } else {
1556 ramblock = qemu_ram_block_by_name(rbname);
1558 if (!ramblock) {
1559 /* We shouldn't be asked for a non-existent RAMBlock */
1560 error_report("ram_save_queue_pages no block '%s'", rbname);
1561 return -1;
1563 rs->last_req_rb = ramblock;
1565 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1566 if (start+len > ramblock->used_length) {
1567 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1568 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1569 __func__, start, len, ramblock->used_length);
1570 return -1;
1573 struct RAMSrcPageRequest *new_entry =
1574 g_malloc0(sizeof(struct RAMSrcPageRequest));
1575 new_entry->rb = ramblock;
1576 new_entry->offset = start;
1577 new_entry->len = len;
1579 memory_region_ref(ramblock->mr);
1580 qemu_mutex_lock(&rs->src_page_req_mutex);
1581 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1582 migration_make_urgent_request();
1583 qemu_mutex_unlock(&rs->src_page_req_mutex);
1585 return 0;
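/*
 * Editor's note: a minimal usage sketch, not part of the original file. The
 * return-path code on the source side would queue a page requested by the
 * destination roughly like this (rbname, start and the error handler are
 * illustrative names):
 *
 *     if (ram_save_queue_pages(rbname, start & TARGET_PAGE_MASK,
 *                              TARGET_PAGE_SIZE) < 0) {
 *         handle_return_path_error(ms);
 *     }
 */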
1588 static bool save_page_use_compression(RAMState *rs)
1590 if (!migrate_use_compression()) {
1591 return false;
1595 * If xbzrle is on, stop using the data compression after first
1596 * round of migration even if compression is enabled. In theory,
1597 * xbzrle can do better than compression.
1599 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1600 return true;
1603 return false;
1607 * try to compress the page before posting it out, return true if the page
1608 * has been properly handled by compression, otherwise needs other
1609 * paths to handle it
1611 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1613 if (!save_page_use_compression(rs)) {
1614 return false;
1618 * When starting the process of a new block, the first page of
1619 * the block should be sent out before other pages in the same
1620 * block, and all the pages in the last block should have been sent
1621 * out, keeping this order is important, because the 'cont' flag
1622 * is used to avoid resending the block name.
1624 * We post the first page as a normal page since compression will take
1625 * much CPU resource.
1627 if (block != rs->last_sent_block) {
1628 flush_compressed_data(rs);
1629 return false;
1632 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1633 return true;
1636 compression_counters.busy++;
1637 return false;
1641 * ram_save_target_page: save one target page
1643 * Returns the number of pages written
1645 * @rs: current RAM state
1646 * @pss: data about the page we want to send
1647 * @last_stage: if we are at the completion stage
1649 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1650 bool last_stage)
1652 RAMBlock *block = pss->block;
1653 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1654 int res;
1656 if (control_save_page(rs, block, offset, &res)) {
1657 return res;
1660 if (save_compress_page(rs, block, offset)) {
1661 return 1;
1664 res = save_zero_page(rs, block, offset);
1665 if (res > 0) {
1666 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1667 * page would be stale
1669 if (!save_page_use_compression(rs)) {
1670 XBZRLE_cache_lock();
1671 xbzrle_cache_zero_page(rs, block->offset + offset);
1672 XBZRLE_cache_unlock();
1674 ram_release_pages(block->idstr, offset, res);
1675 return res;
1679 * Do not use multifd for:
1680 * 1. Compression as the first page in the new block should be posted out
1681 * before sending the compressed page
1682 * 2. In postcopy as one whole host page should be placed
1684 if (!save_page_use_compression(rs) && migrate_use_multifd()
1685 && !migration_in_postcopy()) {
1686 return ram_save_multifd_page(rs, block, offset);
1689 return ram_save_page(rs, pss, last_stage);
1693 * ram_save_host_page: save a whole host page
1695 * Starting at *offset send pages up to the end of the current host
1696 * page. It's valid for the initial offset to point into the middle of
1697 * a host page in which case the remainder of the hostpage is sent.
1698 * Only dirty target pages are sent. Note that the host page size may
1699 * be a huge page for this block.
1700 * The saving stops at the boundary of the used_length of the block
1701 * if the RAMBlock isn't a multiple of the host page size.
1703 * Returns the number of pages written or negative on error
1705 * @rs: current RAM state
1706 * @ms: current migration state
1707 * @pss: data about the page we want to send
1708 * @last_stage: if we are at the completion stage
1710 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1711 bool last_stage)
1713 int tmppages, pages = 0;
1714 size_t pagesize_bits =
1715 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1717 if (ramblock_is_ignored(pss->block)) {
1718 error_report("block %s should not be migrated !", pss->block->idstr);
1719 return 0;
1722 do {
1723 /* Check if the page is dirty and if it is, send it */
1724 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1725 pss->page++;
1726 continue;
1729 tmppages = ram_save_target_page(rs, pss, last_stage);
1730 if (tmppages < 0) {
1731 return tmppages;
1734 pages += tmppages;
1735 pss->page++;
1736 /* Allow rate limiting to happen in the middle of huge pages */
1737 migration_rate_limit();
1738 } while ((pss->page & (pagesize_bits - 1)) &&
1739 offset_in_ramblock(pss->block,
1740 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1742 /* The offset we leave with is the last one we looked at */
1743 pss->page--;
1744 return pages;
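/*
 * Editor's note: a worked example, not part of the original file. For a
 * RAMBlock backed by 2MB huge pages with a 4KB target page,
 * pagesize_bits = 2MB / 4KB = 512, so the loop above walks up to 512
 * consecutive target pages and stops when pss->page crosses the huge-page
 * boundary (or the block's used_length), sending only the dirty ones.
 */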
1748 * ram_find_and_save_block: finds a dirty page and sends it to f
1750 * Called within an RCU critical section.
1752 * Returns the number of pages written where zero means no dirty pages,
1753 * or negative on error
1755 * @rs: current RAM state
1756 * @last_stage: if we are at the completion stage
1758 * On systems where host-page-size > target-page-size it will send all the
1759 * pages in a host page that are dirty.
1762 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1764 PageSearchStatus pss;
1765 int pages = 0;
1766 bool again, found;
1768 /* No dirty page as there is zero RAM */
1769 if (!ram_bytes_total()) {
1770 return pages;
1773 pss.block = rs->last_seen_block;
1774 pss.page = rs->last_page;
1775 pss.complete_round = false;
1777 if (!pss.block) {
1778 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1781 do {
1782 again = true;
1783 found = get_queued_page(rs, &pss);
1785 if (!found) {
1786 /* priority queue empty, so just search for something dirty */
1787 found = find_dirty_block(rs, &pss, &again);
1790 if (found) {
1791 pages = ram_save_host_page(rs, &pss, last_stage);
1793 } while (!pages && again);
1795 rs->last_seen_block = pss.block;
1796 rs->last_page = pss.page;
1798 return pages;
1801 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1803 uint64_t pages = size / TARGET_PAGE_SIZE;
1805 if (zero) {
1806 ram_counters.duplicate += pages;
1807 } else {
1808 ram_counters.normal += pages;
1809 ram_counters.transferred += size;
1810 qemu_update_position(f, size);
1814 static uint64_t ram_bytes_total_common(bool count_ignored)
1816 RAMBlock *block;
1817 uint64_t total = 0;
1819 RCU_READ_LOCK_GUARD();
1821 if (count_ignored) {
1822 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1823 total += block->used_length;
1825 } else {
1826 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1827 total += block->used_length;
1830 return total;
1833 uint64_t ram_bytes_total(void)
1835 return ram_bytes_total_common(false);
1838 static void xbzrle_load_setup(void)
1840 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1843 static void xbzrle_load_cleanup(void)
1845 g_free(XBZRLE.decoded_buf);
1846 XBZRLE.decoded_buf = NULL;
1849 static void ram_state_cleanup(RAMState **rsp)
1851 if (*rsp) {
1852 migration_page_queue_free(*rsp);
1853 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1854 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1855 g_free(*rsp);
1856 *rsp = NULL;
1860 static void xbzrle_cleanup(void)
1862 XBZRLE_cache_lock();
1863 if (XBZRLE.cache) {
1864 cache_fini(XBZRLE.cache);
1865 g_free(XBZRLE.encoded_buf);
1866 g_free(XBZRLE.current_buf);
1867 g_free(XBZRLE.zero_target_page);
1868 XBZRLE.cache = NULL;
1869 XBZRLE.encoded_buf = NULL;
1870 XBZRLE.current_buf = NULL;
1871 XBZRLE.zero_target_page = NULL;
1873 XBZRLE_cache_unlock();
1876 static void ram_save_cleanup(void *opaque)
1878 RAMState **rsp = opaque;
1879 RAMBlock *block;
1881 /* the caller should hold the iothread lock or be in a bh, so there is
1882 * no write race against the migration bitmap
1884 memory_global_dirty_log_stop();
1886 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1887 g_free(block->clear_bmap);
1888 block->clear_bmap = NULL;
1889 g_free(block->bmap);
1890 block->bmap = NULL;
1893 xbzrle_cleanup();
1894 compress_threads_save_cleanup();
1895 ram_state_cleanup(rsp);
1898 static void ram_state_reset(RAMState *rs)
1900 rs->last_seen_block = NULL;
1901 rs->last_sent_block = NULL;
1902 rs->last_page = 0;
1903 rs->last_version = ram_list.version;
1904 rs->ram_bulk_stage = true;
1905 rs->fpo_enabled = false;
1908 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1911 * 'expected' is the value you expect the bitmap mostly to be full
1912 * of; it won't bother printing lines that are all this value.
1913 * If 'todump' is null the migration bitmap is dumped.
1915 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1916 unsigned long pages)
1918 int64_t cur;
1919 int64_t linelen = 128;
1920 char linebuf[129];
1922 for (cur = 0; cur < pages; cur += linelen) {
1923 int64_t curb;
1924 bool found = false;
1926 * Last line; catch the case where the line length
1927 * is longer than remaining ram
1929 if (cur + linelen > pages) {
1930 linelen = pages - cur;
1932 for (curb = 0; curb < linelen; curb++) {
1933 bool thisbit = test_bit(cur + curb, todump);
1934 linebuf[curb] = thisbit ? '1' : '.';
1935 found = found || (thisbit != expected);
1937 if (found) {
1938 linebuf[curb] = '\0';
1939 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1944 /* **** functions for postcopy ***** */
1946 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1948 struct RAMBlock *block;
1950 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1951 unsigned long *bitmap = block->bmap;
1952 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1953 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1955 while (run_start < range) {
1956 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1957 ram_discard_range(block->idstr,
1958 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1959 ((ram_addr_t)(run_end - run_start))
1960 << TARGET_PAGE_BITS);
1961 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1967 * postcopy_send_discard_bm_ram: discard a RAMBlock
1969 * Returns zero on success
1971 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1973 * @ms: current migration state
1974 * @block: RAMBlock to discard
1976 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1978 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1979 unsigned long current;
1980 unsigned long *bitmap = block->bmap;
1982 for (current = 0; current < end; ) {
1983 unsigned long one = find_next_bit(bitmap, end, current);
1984 unsigned long zero, discard_length;
1986 if (one >= end) {
1987 break;
1990 zero = find_next_zero_bit(bitmap, end, one + 1);
1992 if (zero >= end) {
1993 discard_length = end - one;
1994 } else {
1995 discard_length = zero - one;
1997 postcopy_discard_send_range(ms, one, discard_length);
1998 current = one + discard_length;
2001 return 0;
2005 * postcopy_each_ram_send_discard: discard all RAMBlocks
2007 * Returns 0 for success or negative for error
2009 * Utility for the outgoing postcopy code.
2010 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2011 * passing it bitmap indexes and name.
2012 * (qemu_ram_foreach_block ends up passing unscaled lengths
2013 * which would mean postcopy code would have to deal with target page)
2015 * @ms: current migration state
2017 static int postcopy_each_ram_send_discard(MigrationState *ms)
2019 struct RAMBlock *block;
2020 int ret;
2022 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2023 postcopy_discard_send_init(ms, block->idstr);
2026 * Postcopy sends chunks of bitmap over the wire, but it
2027 * just needs indexes at this point, which avoids it having
2028 * any target-page-specific code.
2030 ret = postcopy_send_discard_bm_ram(ms, block);
2031 postcopy_discard_send_finish(ms);
2032 if (ret) {
2033 return ret;
2037 return 0;
2041 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2043 * Helper for postcopy_chunk_hostpages; it is called once per RAMBlock
2044 * to canonicalize that block's dirty bitmap.
2045 *
2047 * Postcopy requires that all target pages in a host page are dirty or
2048 * clean, not a mix; any partially dirty host page is re-marked fully dirty.
2050 * @ms: current migration state
2051 * @block: block that contains the page we want to canonicalize
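 *
 * Illustrative example: with a 2MB huge page and 4KB target pages
 * (host_ratio == 512), a run of dirty bits that starts or ends in the
 * middle of a huge page causes every target page of that huge page to be
 * re-marked dirty, so the huge page is later handled as a whole.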
2053 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2055 RAMState *rs = ram_state;
2056 unsigned long *bitmap = block->bmap;
2057 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2058 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2059 unsigned long run_start;
2061 if (block->page_size == TARGET_PAGE_SIZE) {
2062 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2063 return;
2066 /* Find a dirty page */
2067 run_start = find_next_bit(bitmap, pages, 0);
2069 while (run_start < pages) {
2072 * If the start of this run of pages is in the middle of a host
2073 * page, then we need to fixup this host page.
2075 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2076 /* Find the end of this run */
2077 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2079 * If the end isn't at the start of a host page, then the
2080 * run doesn't finish at the end of a host page
2081 * and we need to discard.
2085 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2086 unsigned long page;
2087 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2088 host_ratio);
2089 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2091 /* Clean up the bitmap */
2092 for (page = fixup_start_addr;
2093 page < fixup_start_addr + host_ratio; page++) {
2095 * Remark them as dirty, updating the count for any pages
2096 * that weren't previously dirty.
2098 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2102 /* Find the next dirty page for the next iteration */
2103 run_start = find_next_bit(bitmap, pages, run_start);
2108 * postcopy_chunk_hostpages: discard any partially sent host page
2110 * Utility for the outgoing postcopy code.
2112 * Discard any partially sent host-page sized chunks and mark any partially
2113 * dirty host-page sized chunks as fully dirty. Here the host page
2114 * is the host page for the particular RAMBlock, i.e. it might be a huge page.
2116 * Returns zero on success
2118 * @ms: current migration state
2119 * @block: block we want to work with
2121 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2123 postcopy_discard_send_init(ms, block->idstr);
2126 * Ensure that all partially dirty host pages are made fully dirty.
2128 postcopy_chunk_hostpages_pass(ms, block);
2130 postcopy_discard_send_finish(ms);
2131 return 0;
2135 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2137 * Returns zero on success
2139 * Transmit the set of pages to be discarded after precopy to the target;
2140 * these are pages that:
2141 * a) have been previously transmitted but are now dirty again, or
2142 * b) have never been transmitted. This ensures that
2143 * any pages on the destination that have been mapped by background
2144 * tasks get discarded (transparent huge pages are the specific concern).
2145 * Hopefully this set is pretty sparse.
2147 * @ms: current migration state
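 *
 * The sequence below is: do a final bitmap sync while the source is
 * paused, reset the page-search state, canonicalize each block's bitmap
 * to whole host pages, and finally send the per-block discard ranges via
 * postcopy_each_ram_send_discard().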
2149 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2151 RAMState *rs = ram_state;
2152 RAMBlock *block;
2153 int ret;
2155 RCU_READ_LOCK_GUARD();
2157 /* This should be our last sync, the src is now paused */
2158 migration_bitmap_sync(rs);
2160 /* Easiest way to make sure we don't resume in the middle of a host-page */
2161 rs->last_seen_block = NULL;
2162 rs->last_sent_block = NULL;
2163 rs->last_page = 0;
2165 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2166 /* Deal with TPS != HPS and huge pages */
2167 ret = postcopy_chunk_hostpages(ms, block);
2168 if (ret) {
2169 return ret;
2172 #ifdef DEBUG_POSTCOPY
2173 ram_debug_dump_bitmap(block->bmap, true,
2174 block->used_length >> TARGET_PAGE_BITS);
2175 #endif
2177 trace_ram_postcopy_send_discard_bitmap();
2179 return postcopy_each_ram_send_discard(ms);
2183 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2185 * Returns zero on success
2187 * @rbname: name of the RAMBlock of the request. NULL means the
2188 * same as the last one.
2189 * @start: byte offset within the RAMBlock
2190 * @length: length in bytes to discard
2192 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2194 trace_ram_discard_range(rbname, start, length);
2196 RCU_READ_LOCK_GUARD();
2197 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2199 if (!rb) {
2200 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2201 return -1;
2205 * On the source VM, we don't need to update the received bitmap since
2206 * we don't even have one.
2208 if (rb->receivedmap) {
2209 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2210 length >> qemu_target_page_bits());
2213 return ram_block_discard_range(rb, start, length);
2217 * For every allocation, we try not to crash the VM if the
2218 * allocation fails.
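 * The buffers are allocated in order (zero page, cache, encoded_buf,
 * current_buf); on failure the error path below unwinds whatever was
 * already allocated and returns -ENOMEM with the XBZRLE lock released.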
2220 static int xbzrle_init(void)
2222 Error *local_err = NULL;
2224 if (!migrate_use_xbzrle()) {
2225 return 0;
2228 XBZRLE_cache_lock();
2230 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2231 if (!XBZRLE.zero_target_page) {
2232 error_report("%s: Error allocating zero page", __func__);
2233 goto err_out;
2236 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2237 TARGET_PAGE_SIZE, &local_err);
2238 if (!XBZRLE.cache) {
2239 error_report_err(local_err);
2240 goto free_zero_page;
2243 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2244 if (!XBZRLE.encoded_buf) {
2245 error_report("%s: Error allocating encoded_buf", __func__);
2246 goto free_cache;
2249 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2250 if (!XBZRLE.current_buf) {
2251 error_report("%s: Error allocating current_buf", __func__);
2252 goto free_encoded_buf;
2255 /* We are all good */
2256 XBZRLE_cache_unlock();
2257 return 0;
2259 free_encoded_buf:
2260 g_free(XBZRLE.encoded_buf);
2261 XBZRLE.encoded_buf = NULL;
2262 free_cache:
2263 cache_fini(XBZRLE.cache);
2264 XBZRLE.cache = NULL;
2265 free_zero_page:
2266 g_free(XBZRLE.zero_target_page);
2267 XBZRLE.zero_target_page = NULL;
2268 err_out:
2269 XBZRLE_cache_unlock();
2270 return -ENOMEM;
2273 static int ram_state_init(RAMState **rsp)
2275 *rsp = g_try_new0(RAMState, 1);
2277 if (!*rsp) {
2278 error_report("%s: Init ramstate fail", __func__);
2279 return -1;
2282 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2283 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2284 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2287 * Count the total number of pages used by RAM blocks, not including any
2288 * gaps due to alignment or unplugs.
2289 * This must match the initial value of the dirty bitmap.
2291 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2292 ram_state_reset(*rsp);
2294 return 0;
2297 static void ram_list_init_bitmaps(void)
2299 MigrationState *ms = migrate_get_current();
2300 RAMBlock *block;
2301 unsigned long pages;
2302 uint8_t shift;
2304 /* Skip setting bitmap if there is no RAM */
2305 if (ram_bytes_total()) {
2306 shift = ms->clear_bitmap_shift;
2307 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2308 error_report("clear_bitmap_shift (%u) too big, using "
2309 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2310 shift = CLEAR_BITMAP_SHIFT_MAX;
2311 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2312 error_report("clear_bitmap_shift (%u) too small, using "
2313 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2314 shift = CLEAR_BITMAP_SHIFT_MIN;
2317 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2318 pages = block->max_length >> TARGET_PAGE_BITS;
2320 * The initial dirty bitmap for migration must be set to all
2321 * ones to make sure we'll migrate every guest RAM page to the
2322 * destination.
2323 * Here we set RAMBlock.bmap all to 1 because when restarting a
2324 * new migration after a failed one, ram_list.
2325 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover the whole
2326 * guest memory.
2328 block->bmap = bitmap_new(pages);
2329 bitmap_set(block->bmap, 0, pages);
2330 block->clear_bmap_shift = shift;
2331 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2336 static void ram_init_bitmaps(RAMState *rs)
2338 /* For memory_global_dirty_log_start below. */
2339 qemu_mutex_lock_iothread();
2340 qemu_mutex_lock_ramlist();
2342 WITH_RCU_READ_LOCK_GUARD() {
2343 ram_list_init_bitmaps();
2344 memory_global_dirty_log_start();
2345 migration_bitmap_sync_precopy(rs);
2347 qemu_mutex_unlock_ramlist();
2348 qemu_mutex_unlock_iothread();
2351 static int ram_init_all(RAMState **rsp)
2353 if (ram_state_init(rsp)) {
2354 return -1;
2357 if (xbzrle_init()) {
2358 ram_state_cleanup(rsp);
2359 return -1;
2362 ram_init_bitmaps(*rsp);
2364 return 0;
2367 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2369 RAMBlock *block;
2370 uint64_t pages = 0;
2373 * Postcopy is not using xbzrle/compression, so no need for that.
2374 * Also, since the source is already halted, we don't need to care
2375 * about dirty page logging either.
2378 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2379 pages += bitmap_count_one(block->bmap,
2380 block->used_length >> TARGET_PAGE_BITS);
2383 /* This may not be aligned with current bitmaps. Recalculate. */
2384 rs->migration_dirty_pages = pages;
2386 rs->last_seen_block = NULL;
2387 rs->last_sent_block = NULL;
2388 rs->last_page = 0;
2389 rs->last_version = ram_list.version;
2391 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2392 * matter what we have sent.
2394 rs->ram_bulk_stage = false;
2396 /* Update RAMState cache of output QEMUFile */
2397 rs->f = out;
2399 trace_ram_state_resume_prepare(pages);
2403 * This function clears bits of the free pages reported by the caller from the
2404 * migration dirty bitmap. @addr is the host address corresponding to the
2405 * start of the contiguous guest free pages, and @len is the total size in
2406 * bytes of those pages.
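 *
 * A hint may span more than one RAMBlock; the loop below splits it at
 * RAMBlock boundaries and, for each piece, clears the corresponding bits
 * in block->bmap under bitmap_mutex while adjusting migration_dirty_pages.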
2408 void qemu_guest_free_page_hint(void *addr, size_t len)
2410 RAMBlock *block;
2411 ram_addr_t offset;
2412 size_t used_len, start, npages;
2413 MigrationState *s = migrate_get_current();
2415 /* This function is currently expected to be used during live migration */
2416 if (!migration_is_setup_or_active(s->state)) {
2417 return;
2420 for (; len > 0; len -= used_len, addr += used_len) {
2421 block = qemu_ram_block_from_host(addr, false, &offset);
2422 if (unlikely(!block || offset >= block->used_length)) {
2424 * The implementation might not support RAMBlock resize during
2425 * live migration, but it could happen in theory with future
2426 * updates. So we add a check here to capture that case.
2428 error_report_once("%s unexpected error", __func__);
2429 return;
2432 if (len <= block->used_length - offset) {
2433 used_len = len;
2434 } else {
2435 used_len = block->used_length - offset;
2438 start = offset >> TARGET_PAGE_BITS;
2439 npages = used_len >> TARGET_PAGE_BITS;
2441 qemu_mutex_lock(&ram_state->bitmap_mutex);
2442 ram_state->migration_dirty_pages -=
2443 bitmap_count_one_with_offset(block->bmap, start, npages);
2444 bitmap_clear(block->bmap, start, npages);
2445 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2450 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2451 * long-running RCU critical section. When RCU reclaims in the code
2452 * start to become numerous, it will be necessary to reduce the
2453 * granularity of these critical sections.
2457 * ram_save_setup: Setup RAM for migration
2459 * Returns zero to indicate success and negative for error
2461 * @f: QEMUFile where to send the data
2462 * @opaque: RAMState pointer
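 *
 * The setup section written below is, roughly: a be64 word holding the
 * total RAM size with RAM_SAVE_FLAG_MEM_SIZE set, then for each migratable
 * block its idstr length byte, the idstr itself, its used_length, an
 * optional page_size (postcopy with non-host-sized pages) and, when
 * ignore-shared is enabled, the block's memory region address, all
 * terminated by RAM_SAVE_FLAG_EOS.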
2464 static int ram_save_setup(QEMUFile *f, void *opaque)
2466 RAMState **rsp = opaque;
2467 RAMBlock *block;
2469 if (compress_threads_save_setup()) {
2470 return -1;
2473 /* migration has already setup the bitmap, reuse it. */
2474 if (!migration_in_colo_state()) {
2475 if (ram_init_all(rsp) != 0) {
2476 compress_threads_save_cleanup();
2477 return -1;
2480 (*rsp)->f = f;
2482 WITH_RCU_READ_LOCK_GUARD() {
2483 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2485 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2486 qemu_put_byte(f, strlen(block->idstr));
2487 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2488 qemu_put_be64(f, block->used_length);
2489 if (migrate_postcopy_ram() && block->page_size !=
2490 qemu_host_page_size) {
2491 qemu_put_be64(f, block->page_size);
2493 if (migrate_ignore_shared()) {
2494 qemu_put_be64(f, block->mr->addr);
2499 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2500 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2502 multifd_send_sync_main(f);
2503 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2504 qemu_fflush(f);
2506 return 0;
2510 * ram_save_iterate: iterative stage for migration
2512 * Returns zero to indicate success and negative for error
2514 * @f: QEMUFile where to send the data
2515 * @opaque: RAMState pointer
2517 static int ram_save_iterate(QEMUFile *f, void *opaque)
2519 RAMState **temp = opaque;
2520 RAMState *rs = *temp;
2521 int ret = 0;
2522 int i;
2523 int64_t t0;
2524 int done = 0;
2526 if (blk_mig_bulk_active()) {
2527 /* Avoid transferring ram during bulk phase of block migration as
2528 * the bulk phase will usually take a long time and transferring
2529 * ram updates during that time is pointless. */
2530 goto out;
2533 WITH_RCU_READ_LOCK_GUARD() {
2534 if (ram_list.version != rs->last_version) {
2535 ram_state_reset(rs);
2538 /* Read version before ram_list.blocks */
2539 smp_rmb();
2541 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2543 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2544 i = 0;
2545 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2546 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2547 int pages;
2549 if (qemu_file_get_error(f)) {
2550 break;
2553 pages = ram_find_and_save_block(rs, false);
2554 /* no more pages to send */
2555 if (pages == 0) {
2556 done = 1;
2557 break;
2560 if (pages < 0) {
2561 qemu_file_set_error(f, pages);
2562 break;
2565 rs->target_page_count += pages;
2568 * During postcopy, it is necessary to make sure one whole host
2569 * page is sent in one chunk.
2571 if (migrate_postcopy_ram()) {
2572 flush_compressed_data(rs);
2576 * We want to check in the 1st loop, just in case it was the 1st
2577 * time and we had to sync the dirty bitmap.
2578 * qemu_clock_get_ns() is a bit expensive, so we only check every
2579 * 64 iterations.
2581 if ((i & 63) == 0) {
2582 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2583 1000000;
2584 if (t1 > MAX_WAIT) {
2585 trace_ram_save_iterate_big_wait(t1, i);
2586 break;
2589 i++;
2594 * Must occur before EOS (or any QEMUFile operation)
2595 * because of RDMA protocol.
2597 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2599 out:
2600 if (ret >= 0
2601 && migration_is_setup_or_active(migrate_get_current()->state)) {
2602 multifd_send_sync_main(rs->f);
2603 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2604 qemu_fflush(f);
2605 ram_counters.transferred += 8;
2607 ret = qemu_file_get_error(f);
2609 if (ret < 0) {
2610 return ret;
2613 return done;
2617 * ram_save_complete: function called to send the remaining amount of ram
2619 * Returns zero to indicate success or negative on error
2621 * Called with the iothread lock held
2623 * @f: QEMUFile where to send the data
2624 * @opaque: RAMState pointer
2626 static int ram_save_complete(QEMUFile *f, void *opaque)
2628 RAMState **temp = opaque;
2629 RAMState *rs = *temp;
2630 int ret = 0;
2632 WITH_RCU_READ_LOCK_GUARD() {
2633 if (!migration_in_postcopy()) {
2634 migration_bitmap_sync_precopy(rs);
2637 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2639 /* try transferring iterative blocks of memory */
2641 /* flush all remaining blocks regardless of rate limiting */
2642 while (true) {
2643 int pages;
2645 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2647 /* no more blocks to send */
2647 if (pages == 0) {
2648 break;
2650 if (pages < 0) {
2651 ret = pages;
2652 break;
2656 flush_compressed_data(rs);
2657 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2660 if (ret >= 0) {
2661 multifd_send_sync_main(rs->f);
2662 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2663 qemu_fflush(f);
2666 return ret;
2669 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2670 uint64_t *res_precopy_only,
2671 uint64_t *res_compatible,
2672 uint64_t *res_postcopy_only)
2674 RAMState **temp = opaque;
2675 RAMState *rs = *temp;
2676 uint64_t remaining_size;
2678 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2680 if (!migration_in_postcopy() &&
2681 remaining_size < max_size) {
2682 qemu_mutex_lock_iothread();
2683 WITH_RCU_READ_LOCK_GUARD() {
2684 migration_bitmap_sync_precopy(rs);
2686 qemu_mutex_unlock_iothread();
2687 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2690 if (migrate_postcopy_ram()) {
2691 /* We can do postcopy, and all the data is postcopiable */
2692 *res_compatible += remaining_size;
2693 } else {
2694 *res_precopy_only += remaining_size;
2698 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2700 unsigned int xh_len;
2701 int xh_flags;
2702 uint8_t *loaded_data;
2704 /* extract RLE header */
2705 xh_flags = qemu_get_byte(f);
2706 xh_len = qemu_get_be16(f);
2708 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2709 error_report("Failed to load XBZRLE page - wrong compression!");
2710 return -1;
2713 if (xh_len > TARGET_PAGE_SIZE) {
2714 error_report("Failed to load XBZRLE page - len overflow!");
2715 return -1;
2717 loaded_data = XBZRLE.decoded_buf;
2718 /* load data and decode */
2719 /* it can change loaded_data to point to an internal buffer */
2720 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2722 /* decode RLE */
2723 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2724 TARGET_PAGE_SIZE) == -1) {
2725 error_report("Failed to load XBZRLE page - decode error!");
2726 return -1;
2729 return 0;
2733 * ram_block_from_stream: read a RAMBlock id from the migration stream
2735 * Must be called from within an RCU critical section.
2737 * Returns a pointer from within the RCU-protected ram_list.
2739 * @f: QEMUFile where to read the data from
2740 * @flags: Page flags (mostly to see if it's a continuation of previous block)
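 *
 * When RAM_SAVE_FLAG_CONTINUE is set, no block id is present in the stream
 * and the block cached in the static variable from the previous call is
 * reused; otherwise the id string is read and looked up by name.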
2742 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2744 static RAMBlock *block = NULL;
2745 char id[256];
2746 uint8_t len;
2748 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2749 if (!block) {
2750 error_report("Ack, bad migration stream!");
2751 return NULL;
2753 return block;
2756 len = qemu_get_byte(f);
2757 qemu_get_buffer(f, (uint8_t *)id, len);
2758 id[len] = 0;
2760 block = qemu_ram_block_by_name(id);
2761 if (!block) {
2762 error_report("Can't find block %s", id);
2763 return NULL;
2766 if (ramblock_is_ignored(block)) {
2767 error_report("block %s should not be migrated !", id);
2768 return NULL;
2771 return block;
2774 static inline void *host_from_ram_block_offset(RAMBlock *block,
2775 ram_addr_t offset)
2777 if (!offset_in_ramblock(block, offset)) {
2778 return NULL;
2781 return block->host + offset;
2784 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2785 ram_addr_t offset, bool record_bitmap)
2787 if (!offset_in_ramblock(block, offset)) {
2788 return NULL;
2790 if (!block->colo_cache) {
2791 error_report("%s: colo_cache is NULL in block :%s",
2792 __func__, block->idstr);
2793 return NULL;
2797 * During a COLO checkpoint, we need a bitmap of these migrated pages.
2798 * It helps us decide which pages in the RAM cache should be flushed
2799 * into the VM's RAM later.
2801 if (record_bitmap &&
2802 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2803 ram_state->migration_dirty_pages++;
2805 return block->colo_cache + offset;
2809 * ram_handle_compressed: handle the zero page case
2811 * If a page (or a whole RDMA chunk) has been
2812 * determined to be zero, then zap it.
2814 * @host: host address for the zero page
2815 * @ch: the byte the page is filled with; we only support zero
2816 * @size: size of the zero page
2818 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2820 if (ch != 0 || !is_zero_range(host, size)) {
2821 memset(host, ch, size);
2825 /* return the size after decompression, or negative value on error */
2826 static int
2827 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2828 const uint8_t *source, size_t source_len)
2830 int err;
2832 err = inflateReset(stream);
2833 if (err != Z_OK) {
2834 return -1;
2837 stream->avail_in = source_len;
2838 stream->next_in = (uint8_t *)source;
2839 stream->avail_out = dest_len;
2840 stream->next_out = dest;
2842 err = inflate(stream, Z_NO_FLUSH);
2843 if (err != Z_STREAM_END) {
2844 return -1;
2847 return stream->total_out;
2850 static void *do_data_decompress(void *opaque)
2852 DecompressParam *param = opaque;
2853 unsigned long pagesize;
2854 uint8_t *des;
2855 int len, ret;
2857 qemu_mutex_lock(&param->mutex);
2858 while (!param->quit) {
2859 if (param->des) {
2860 des = param->des;
2861 len = param->len;
2862 param->des = 0;
2863 qemu_mutex_unlock(&param->mutex);
2865 pagesize = TARGET_PAGE_SIZE;
2867 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2868 param->compbuf, len);
2869 if (ret < 0 && migrate_get_current()->decompress_error_check) {
2870 error_report("decompress data failed");
2871 qemu_file_set_error(decomp_file, ret);
2874 qemu_mutex_lock(&decomp_done_lock);
2875 param->done = true;
2876 qemu_cond_signal(&decomp_done_cond);
2877 qemu_mutex_unlock(&decomp_done_lock);
2879 qemu_mutex_lock(&param->mutex);
2880 } else {
2881 qemu_cond_wait(&param->cond, &param->mutex);
2884 qemu_mutex_unlock(&param->mutex);
2886 return NULL;
2889 static int wait_for_decompress_done(void)
2891 int idx, thread_count;
2893 if (!migrate_use_compression()) {
2894 return 0;
2897 thread_count = migrate_decompress_threads();
2898 qemu_mutex_lock(&decomp_done_lock);
2899 for (idx = 0; idx < thread_count; idx++) {
2900 while (!decomp_param[idx].done) {
2901 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2904 qemu_mutex_unlock(&decomp_done_lock);
2905 return qemu_file_get_error(decomp_file);
2908 static void compress_threads_load_cleanup(void)
2910 int i, thread_count;
2912 if (!migrate_use_compression()) {
2913 return;
2915 thread_count = migrate_decompress_threads();
2916 for (i = 0; i < thread_count; i++) {
2918 * we use it as an indicator of whether the thread is
2919 * properly initialized or not
2921 if (!decomp_param[i].compbuf) {
2922 break;
2925 qemu_mutex_lock(&decomp_param[i].mutex);
2926 decomp_param[i].quit = true;
2927 qemu_cond_signal(&decomp_param[i].cond);
2928 qemu_mutex_unlock(&decomp_param[i].mutex);
2930 for (i = 0; i < thread_count; i++) {
2931 if (!decomp_param[i].compbuf) {
2932 break;
2935 qemu_thread_join(decompress_threads + i);
2936 qemu_mutex_destroy(&decomp_param[i].mutex);
2937 qemu_cond_destroy(&decomp_param[i].cond);
2938 inflateEnd(&decomp_param[i].stream);
2939 g_free(decomp_param[i].compbuf);
2940 decomp_param[i].compbuf = NULL;
2942 g_free(decompress_threads);
2943 g_free(decomp_param);
2944 decompress_threads = NULL;
2945 decomp_param = NULL;
2946 decomp_file = NULL;
2949 static int compress_threads_load_setup(QEMUFile *f)
2951 int i, thread_count;
2953 if (!migrate_use_compression()) {
2954 return 0;
2957 thread_count = migrate_decompress_threads();
2958 decompress_threads = g_new0(QemuThread, thread_count);
2959 decomp_param = g_new0(DecompressParam, thread_count);
2960 qemu_mutex_init(&decomp_done_lock);
2961 qemu_cond_init(&decomp_done_cond);
2962 decomp_file = f;
2963 for (i = 0; i < thread_count; i++) {
2964 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2965 goto exit;
2968 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2969 qemu_mutex_init(&decomp_param[i].mutex);
2970 qemu_cond_init(&decomp_param[i].cond);
2971 decomp_param[i].done = true;
2972 decomp_param[i].quit = false;
2973 qemu_thread_create(decompress_threads + i, "decompress",
2974 do_data_decompress, decomp_param + i,
2975 QEMU_THREAD_JOINABLE);
2977 return 0;
2978 exit:
2979 compress_threads_load_cleanup();
2980 return -1;
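/*
 * Hand one compressed page to an idle decompression thread: pick the first
 * worker whose "done" flag is set, copy the compressed bytes into its
 * compbuf and wake it; if every worker is busy, wait on decomp_done_cond
 * until one finishes.
 */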
2983 static void decompress_data_with_multi_threads(QEMUFile *f,
2984 void *host, int len)
2986 int idx, thread_count;
2988 thread_count = migrate_decompress_threads();
2989 qemu_mutex_lock(&decomp_done_lock);
2990 while (true) {
2991 for (idx = 0; idx < thread_count; idx++) {
2992 if (decomp_param[idx].done) {
2993 decomp_param[idx].done = false;
2994 qemu_mutex_lock(&decomp_param[idx].mutex);
2995 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2996 decomp_param[idx].des = host;
2997 decomp_param[idx].len = len;
2998 qemu_cond_signal(&decomp_param[idx].cond);
2999 qemu_mutex_unlock(&decomp_param[idx].mutex);
3000 break;
3003 if (idx < thread_count) {
3004 break;
3005 } else {
3006 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3009 qemu_mutex_unlock(&decomp_done_lock);
3013 * COLO cache: this is for the secondary VM; we cache the whole
3014 * memory of the secondary VM. The global lock must be held
3015 * when calling this helper.
3017 int colo_init_ram_cache(void)
3019 RAMBlock *block;
3021 WITH_RCU_READ_LOCK_GUARD() {
3022 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3023 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3024 NULL,
3025 false);
3026 if (!block->colo_cache) {
3027 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3028 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3029 block->used_length);
3030 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3031 if (block->colo_cache) {
3032 qemu_anon_ram_free(block->colo_cache, block->used_length);
3033 block->colo_cache = NULL;
3036 return -errno;
3042 * Record the dirty pages sent by the PVM; this dirty bitmap is used
3043 * to decide which pages in the cache should be flushed into the SVM's RAM.
3044 * Here we use the same name 'ram_bitmap' as for migration.
3046 if (ram_bytes_total()) {
3047 RAMBlock *block;
3049 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3050 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3051 block->bmap = bitmap_new(pages);
3055 ram_state_init(&ram_state);
3056 return 0;
3059 /* TODO: duplicated with ram_init_bitmaps */
3060 void colo_incoming_start_dirty_log(void)
3062 RAMBlock *block = NULL;
3063 /* For memory_global_dirty_log_start below. */
3064 qemu_mutex_lock_iothread();
3065 qemu_mutex_lock_ramlist();
3067 memory_global_dirty_log_sync();
3068 WITH_RCU_READ_LOCK_GUARD() {
3069 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3070 ramblock_sync_dirty_bitmap(ram_state, block);
3071 /* Discard this dirty bitmap record */
3072 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3074 memory_global_dirty_log_start();
3076 ram_state->migration_dirty_pages = 0;
3077 qemu_mutex_unlock_ramlist();
3078 qemu_mutex_unlock_iothread();
3082 /* The global lock must be held when calling this helper */
3082 void colo_release_ram_cache(void)
3084 RAMBlock *block;
3086 memory_global_dirty_log_stop();
3087 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3088 g_free(block->bmap);
3089 block->bmap = NULL;
3092 WITH_RCU_READ_LOCK_GUARD() {
3093 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3094 if (block->colo_cache) {
3095 qemu_anon_ram_free(block->colo_cache, block->used_length);
3096 block->colo_cache = NULL;
3100 ram_state_cleanup(&ram_state);
3104 * ram_load_setup: Setup RAM for migration incoming side
3106 * Returns zero to indicate success and negative for error
3108 * @f: QEMUFile where to receive the data
3109 * @opaque: RAMState pointer
3111 static int ram_load_setup(QEMUFile *f, void *opaque)
3113 if (compress_threads_load_setup(f)) {
3114 return -1;
3117 xbzrle_load_setup();
3118 ramblock_recv_map_init();
3120 return 0;
3123 static int ram_load_cleanup(void *opaque)
3125 RAMBlock *rb;
3127 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3128 qemu_ram_block_writeback(rb);
3131 xbzrle_load_cleanup();
3132 compress_threads_load_cleanup();
3134 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3135 g_free(rb->receivedmap);
3136 rb->receivedmap = NULL;
3139 return 0;
3143 * ram_postcopy_incoming_init: allocate postcopy data structures
3145 * Returns 0 for success and negative if there was one error
3147 * @mis: current migration incoming state
3149 * Allocate the data structures etc. needed by incoming migration with
3150 * postcopy-ram. postcopy-ram's similarly named
3151 * postcopy_ram_incoming_init does the work.
3153 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3155 return postcopy_ram_incoming_init(mis);
3159 * ram_load_postcopy: load a page in postcopy case
3161 * Returns 0 for success or -errno in case of error
3163 * Called in postcopy mode by ram_load().
3164 * rcu_read_lock is taken prior to this being called.
3166 * @f: QEMUFile to receive the data from
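 *
 * Target pages are accumulated in mis->postcopy_tmp_page until a whole
 * host page has been received; only then is the page placed atomically
 * into the guest via postcopy_place_page() (or postcopy_place_page_zero()
 * when every byte was zero).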
3168 static int ram_load_postcopy(QEMUFile *f)
3170 int flags = 0, ret = 0;
3171 bool place_needed = false;
3172 bool matches_target_page_size = false;
3173 MigrationIncomingState *mis = migration_incoming_get_current();
3174 /* Temporary page that is later 'placed' */
3175 void *postcopy_host_page = mis->postcopy_tmp_page;
3176 void *this_host = NULL;
3177 bool all_zero = true;
3178 int target_pages = 0;
3180 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3181 ram_addr_t addr;
3182 void *host = NULL;
3183 void *page_buffer = NULL;
3184 void *place_source = NULL;
3185 RAMBlock *block = NULL;
3186 uint8_t ch;
3187 int len;
3189 addr = qemu_get_be64(f);
3192 * If there is a QEMU file error, we should stop here; "addr"
3193 * may be invalid.
3195 ret = qemu_file_get_error(f);
3196 if (ret) {
3197 break;
3200 flags = addr & ~TARGET_PAGE_MASK;
3201 addr &= TARGET_PAGE_MASK;
3203 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3204 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3205 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3206 block = ram_block_from_stream(f, flags);
3208 host = host_from_ram_block_offset(block, addr);
3209 if (!host) {
3210 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3211 ret = -EINVAL;
3212 break;
3214 target_pages++;
3215 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3217 * Postcopy requires that we place whole host pages atomically;
3218 * these may be huge pages for RAMBlocks that are backed by
3219 * hugetlbfs.
3220 * To make it atomic, the data is read into a temporary page
3221 * that's moved into place later.
3222 * The migration protocol uses, possibly smaller, target pages;
3223 * however, the source ensures it always sends all the components
3224 * of a host page in one chunk.
3226 page_buffer = postcopy_host_page +
3227 ((uintptr_t)host & (block->page_size - 1));
3228 if (target_pages == 1) {
3229 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3230 block->page_size);
3231 } else {
3232 /* not the 1st TP within the HP */
3233 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3234 (uintptr_t)this_host) {
3235 error_report("Non-same host page %p/%p",
3236 host, this_host);
3237 ret = -EINVAL;
3238 break;
3243 * If it's the last part of a host page then we place the host
3244 * page
3246 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3247 place_needed = true;
3249 place_source = postcopy_host_page;
3252 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3253 case RAM_SAVE_FLAG_ZERO:
3254 ch = qemu_get_byte(f);
3256 * We can skip setting page_buffer when
3257 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3259 if (ch || !matches_target_page_size) {
3260 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3262 if (ch) {
3263 all_zero = false;
3265 break;
3267 case RAM_SAVE_FLAG_PAGE:
3268 all_zero = false;
3269 if (!matches_target_page_size) {
3270 /* For huge pages, we always use temporary buffer */
3271 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3272 } else {
3274 * For small pages that match the target page size, we
3275 * avoid the qemu_file copy. Instead we directly use
3276 * the buffer of QEMUFile to place the page. Note: we
3277 * cannot do any QEMUFile operation before using that
3278 * buffer to make sure the buffer is valid when
3279 * placing the page.
3281 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3282 TARGET_PAGE_SIZE);
3284 break;
3285 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3286 all_zero = false;
3287 len = qemu_get_be32(f);
3288 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3289 error_report("Invalid compressed data length: %d", len);
3290 ret = -EINVAL;
3291 break;
3293 decompress_data_with_multi_threads(f, page_buffer, len);
3294 break;
3296 case RAM_SAVE_FLAG_EOS:
3297 /* normal exit */
3298 multifd_recv_sync_main();
3299 break;
3300 default:
3301 error_report("Unknown combination of migration flags: %#x"
3302 " (postcopy mode)", flags);
3303 ret = -EINVAL;
3304 break;
3307 /* Got the whole host page, wait for decompress before placing. */
3308 if (place_needed) {
3309 ret |= wait_for_decompress_done();
3312 /* Detect for any possible file errors */
3313 if (!ret && qemu_file_get_error(f)) {
3314 ret = qemu_file_get_error(f);
3317 if (!ret && place_needed) {
3318 /* This gets called at the last target page in the host page */
3319 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3320 block->page_size);
3322 if (all_zero) {
3323 ret = postcopy_place_page_zero(mis, place_dest,
3324 block);
3325 } else {
3326 ret = postcopy_place_page(mis, place_dest,
3327 place_source, block);
3329 place_needed = false;
3330 target_pages = 0;
3331 /* Assume we have a zero page until we detect something different */
3332 all_zero = true;
3336 return ret;
3339 static bool postcopy_is_advised(void)
3341 PostcopyState ps = postcopy_state_get();
3342 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3345 static bool postcopy_is_running(void)
3347 PostcopyState ps = postcopy_state_get();
3348 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3352 * Flush the contents of the RAM cache into the SVM's memory.
3353 * Only flush the pages that have been dirtied by the PVM or SVM or both.
3355 void colo_flush_ram_cache(void)
3357 RAMBlock *block = NULL;
3358 void *dst_host;
3359 void *src_host;
3360 unsigned long offset = 0;
3362 memory_global_dirty_log_sync();
3363 WITH_RCU_READ_LOCK_GUARD() {
3364 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3365 ramblock_sync_dirty_bitmap(ram_state, block);
3369 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3370 WITH_RCU_READ_LOCK_GUARD() {
3371 block = QLIST_FIRST_RCU(&ram_list.blocks);
3373 while (block) {
3374 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3376 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3377 >= block->used_length) {
3378 offset = 0;
3379 block = QLIST_NEXT_RCU(block, next);
3380 } else {
3381 migration_bitmap_clear_dirty(ram_state, block, offset);
3382 dst_host = block->host
3383 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3384 src_host = block->colo_cache
3385 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3386 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3390 trace_colo_flush_ram_cache_end();
3394 * ram_load_precopy: load pages in precopy case
3396 * Returns 0 for success or -errno in case of error
3398 * Called in precopy mode by ram_load().
3399 * rcu_read_lock is taken prior to this being called.
3401 * @f: QEMUFile to receive the data from
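 *
 * Each record in the stream starts with a be64 word whose low bits are the
 * RAM_SAVE_FLAG_* flags and whose remaining bits are the page address; the
 * flags then select zero-page, raw page, compressed page, XBZRLE or
 * block-list handling, until RAM_SAVE_FLAG_EOS ends the section.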
3403 static int ram_load_precopy(QEMUFile *f)
3405 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3406 /* ADVISE is earlier; it shows that the source has the postcopy capability enabled */
3407 bool postcopy_advised = postcopy_is_advised();
3408 if (!migrate_use_compression()) {
3409 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3412 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3413 ram_addr_t addr, total_ram_bytes;
3414 void *host = NULL, *host_bak = NULL;
3415 uint8_t ch;
3418 * Yield periodically to let the main loop run, but an iteration of
3419 * the main loop is expensive, so only do it once every 32768 iterations.
3421 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3422 aio_co_schedule(qemu_get_current_aio_context(),
3423 qemu_coroutine_self());
3424 qemu_coroutine_yield();
3426 i++;
3428 addr = qemu_get_be64(f);
3429 flags = addr & ~TARGET_PAGE_MASK;
3430 addr &= TARGET_PAGE_MASK;
3432 if (flags & invalid_flags) {
3433 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3434 error_report("Received an unexpected compressed page");
3437 ret = -EINVAL;
3438 break;
3441 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3442 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3443 RAMBlock *block = ram_block_from_stream(f, flags);
3445 host = host_from_ram_block_offset(block, addr);
3447 * After entering the COLO stage, we should not load pages
3448 * into the SVM's memory directly; we put them into colo_cache first.
3449 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3450 * Previously, we copied all this memory in the COLO preparing stage
3451 * while the VM had to be stopped, which is a time-consuming process.
3452 * Here we optimize it with a trick: back up every page during the
3453 * migration process while COLO is enabled. Although this affects the
3454 * speed of the migration, it clearly reduces the downtime of
3455 * backing up all the SVM's memory in the COLO preparing stage.
3457 if (migration_incoming_colo_enabled()) {
3458 if (migration_incoming_in_colo_state()) {
3459 /* In COLO stage, put all pages into cache temporarily */
3460 host = colo_cache_from_block_offset(block, addr, true);
3461 } else {
3463 * In the migration stage but before the COLO stage,
3464 * put all pages into both the cache and the SVM's memory.
3466 host_bak = colo_cache_from_block_offset(block, addr, false);
3469 if (!host) {
3470 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3471 ret = -EINVAL;
3472 break;
3474 if (!migration_incoming_in_colo_state()) {
3475 ramblock_recv_bitmap_set(block, host);
3478 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3481 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3482 case RAM_SAVE_FLAG_MEM_SIZE:
3483 /* Synchronize RAM block list */
3484 total_ram_bytes = addr;
3485 while (!ret && total_ram_bytes) {
3486 RAMBlock *block;
3487 char id[256];
3488 ram_addr_t length;
3490 len = qemu_get_byte(f);
3491 qemu_get_buffer(f, (uint8_t *)id, len);
3492 id[len] = 0;
3493 length = qemu_get_be64(f);
3495 block = qemu_ram_block_by_name(id);
3496 if (block && !qemu_ram_is_migratable(block)) {
3497 error_report("block %s should not be migrated !", id);
3498 ret = -EINVAL;
3499 } else if (block) {
3500 if (length != block->used_length) {
3501 Error *local_err = NULL;
3503 ret = qemu_ram_resize(block, length,
3504 &local_err);
3505 if (local_err) {
3506 error_report_err(local_err);
3509 /* For postcopy we need to check hugepage sizes match */
3510 if (postcopy_advised &&
3511 block->page_size != qemu_host_page_size) {
3512 uint64_t remote_page_size = qemu_get_be64(f);
3513 if (remote_page_size != block->page_size) {
3514 error_report("Mismatched RAM page size %s "
3515 "(local) %zd != %" PRId64,
3516 id, block->page_size,
3517 remote_page_size);
3518 ret = -EINVAL;
3521 if (migrate_ignore_shared()) {
3522 hwaddr addr = qemu_get_be64(f);
3523 if (ramblock_is_ignored(block) &&
3524 block->mr->addr != addr) {
3525 error_report("Mismatched GPAs for block %s "
3526 "%" PRId64 "!= %" PRId64,
3527 id, (uint64_t)addr,
3528 (uint64_t)block->mr->addr);
3529 ret = -EINVAL;
3532 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3533 block->idstr);
3534 } else {
3535 error_report("Unknown ramblock \"%s\", cannot "
3536 "accept migration", id);
3537 ret = -EINVAL;
3540 total_ram_bytes -= length;
3542 break;
3544 case RAM_SAVE_FLAG_ZERO:
3545 ch = qemu_get_byte(f);
3546 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3547 break;
3549 case RAM_SAVE_FLAG_PAGE:
3550 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3551 break;
3553 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3554 len = qemu_get_be32(f);
3555 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3556 error_report("Invalid compressed data length: %d", len);
3557 ret = -EINVAL;
3558 break;
3560 decompress_data_with_multi_threads(f, host, len);
3561 break;
3563 case RAM_SAVE_FLAG_XBZRLE:
3564 if (load_xbzrle(f, addr, host) < 0) {
3565 error_report("Failed to decompress XBZRLE page at "
3566 RAM_ADDR_FMT, addr);
3567 ret = -EINVAL;
3568 break;
3570 break;
3571 case RAM_SAVE_FLAG_EOS:
3572 /* normal exit */
3573 multifd_recv_sync_main();
3574 break;
3575 default:
3576 if (flags & RAM_SAVE_FLAG_HOOK) {
3577 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3578 } else {
3579 error_report("Unknown combination of migration flags: %#x",
3580 flags);
3581 ret = -EINVAL;
3584 if (!ret) {
3585 ret = qemu_file_get_error(f);
3587 if (!ret && host_bak) {
3588 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3592 ret |= wait_for_decompress_done();
3593 return ret;
3596 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3598 int ret = 0;
3599 static uint64_t seq_iter;
3601 * If the system is running in postcopy mode, page inserts into host memory
3602 * must be atomic.
3604 bool postcopy_running = postcopy_is_running();
3606 seq_iter++;
3608 if (version_id != 4) {
3609 return -EINVAL;
3613 * This RCU critical section can be very long running.
3614 * When RCU reclaims in the code start to become numerous,
3615 * it will be necessary to reduce the granularity of this
3616 * critical section.
3618 WITH_RCU_READ_LOCK_GUARD() {
3619 if (postcopy_running) {
3620 ret = ram_load_postcopy(f);
3621 } else {
3622 ret = ram_load_precopy(f);
3625 trace_ram_load_complete(ret, seq_iter);
3627 return ret;
3630 static bool ram_has_postcopy(void *opaque)
3632 RAMBlock *rb;
3633 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3634 if (ramblock_is_pmem(rb)) {
3635 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
3636 "is not supported now!", rb->idstr, rb->host);
3637 return false;
3641 return migrate_postcopy_ram();
3644 /* Sync all the dirty bitmaps with the destination VM. */
3645 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3647 RAMBlock *block;
3648 QEMUFile *file = s->to_dst_file;
3649 int ramblock_count = 0;
3651 trace_ram_dirty_bitmap_sync_start();
3653 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3654 qemu_savevm_send_recv_bitmap(file, block->idstr);
3655 trace_ram_dirty_bitmap_request(block->idstr);
3656 ramblock_count++;
3659 trace_ram_dirty_bitmap_sync_wait();
3661 /* Wait until all the ramblocks' dirty bitmaps are synced */
3662 while (ramblock_count--) {
3663 qemu_sem_wait(&s->rp_state.rp_sem);
3666 trace_ram_dirty_bitmap_sync_complete();
3668 return 0;
3671 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3673 qemu_sem_post(&s->rp_state.rp_sem);
3677 * Read the received bitmap and invert it to use as the initial dirty bitmap.
3678 * This is only used when a postcopy migration is paused and then
3679 * resumed from an intermediate point.
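 *
 * The expected on-wire layout (see ramblock_recv_bitmap_send()) is: a be64
 * bitmap size, the little-endian bitmap itself padded to 8 bytes, and a
 * be64 end mark that must equal RAMBLOCK_RECV_BITMAP_ENDING.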
3681 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3683 int ret = -EINVAL;
3684 QEMUFile *file = s->rp_state.from_dst_file;
3685 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3686 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3687 uint64_t size, end_mark;
3689 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3691 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3692 error_report("%s: incorrect state %s", __func__,
3693 MigrationStatus_str(s->state));
3694 return -EINVAL;
3698 * Note: see comments in ramblock_recv_bitmap_send() on why we
3699 * need the endianness conversion, and the paddings.
3701 local_size = ROUND_UP(local_size, 8);
3703 /* Add paddings */
3704 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3706 size = qemu_get_be64(file);
3708 /* The size of the bitmap should match with our ramblock */
3709 if (size != local_size) {
3710 error_report("%s: ramblock '%s' bitmap size mismatch "
3711 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3712 block->idstr, size, local_size);
3713 ret = -EINVAL;
3714 goto out;
3717 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3718 end_mark = qemu_get_be64(file);
3720 ret = qemu_file_get_error(file);
3721 if (ret || size != local_size) {
3722 error_report("%s: read bitmap failed for ramblock '%s': %d"
3723 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3724 __func__, block->idstr, ret, local_size, size);
3725 ret = -EIO;
3726 goto out;
3729 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3730 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3731 __func__, block->idstr, end_mark);
3732 ret = -EINVAL;
3733 goto out;
3737 * Endianness conversion. We are during postcopy (though paused).
3738 * The dirty bitmap won't change. We can directly modify it.
3740 bitmap_from_le(block->bmap, le_bitmap, nbits);
3743 * What we received is the "received bitmap". Invert it to form the initial
3744 * dirty bitmap for this ramblock.
3746 bitmap_complement(block->bmap, block->bmap, nbits);
3748 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3751 * We succeeded in syncing the bitmap for the current ramblock. If this is
3752 * the last one to sync, we need to notify the main send thread.
3754 ram_dirty_bitmap_reload_notify(s);
3756 ret = 0;
3757 out:
3758 g_free(le_bitmap);
3759 return ret;
3762 static int ram_resume_prepare(MigrationState *s, void *opaque)
3764 RAMState *rs = *(RAMState **)opaque;
3765 int ret;
3767 ret = ram_dirty_bitmap_sync_all(s, rs);
3768 if (ret) {
3769 return ret;
3772 ram_state_resume_prepare(rs, s->to_dst_file);
3774 return 0;
3777 static SaveVMHandlers savevm_ram_handlers = {
3778 .save_setup = ram_save_setup,
3779 .save_live_iterate = ram_save_iterate,
3780 .save_live_complete_postcopy = ram_save_complete,
3781 .save_live_complete_precopy = ram_save_complete,
3782 .has_postcopy = ram_has_postcopy,
3783 .save_live_pending = ram_save_pending,
3784 .load_state = ram_load,
3785 .save_cleanup = ram_save_cleanup,
3786 .load_setup = ram_load_setup,
3787 .load_cleanup = ram_load_cleanup,
3788 .resume_prepare = ram_resume_prepare,
3791 void ram_mig_init(void)
3793 qemu_mutex_init(&XBZRLE.lock);
3794 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);