migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include "qemu/cutils.h"
  32 #include "qemu/bitops.h"
  33 #include "qemu/bitmap.h"
  34 #include "qemu/main-loop.h"
  35 #include "xbzrle.h"
  36 #include "ram.h"
  37 #include "migration.h"
  38 #include "migration/register.h"
  39 #include "migration/misc.h"
  40 #include "qemu-file.h"
  41 #include "postcopy-ram.h"
  42 #include "page_cache.h"
  43 #include "qemu/error-report.h"
  44 #include "qapi/error.h"
  45 #include "qapi/qapi-types-migration.h"
  46 #include "qapi/qapi-events-migration.h"
  47 #include "qapi/qmp/qerror.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "exec/target_page.h"
  51 #include "qemu/rcu_queue.h"
  52 #include "migration/colo.h"
  53 #include "block.h"
  54 #include "sysemu/sysemu.h"
  55 #include "savevm.h"
  56 #include "qemu/iov.h"
  57 #include "multifd.h"
  58
  59 /***********************************************************/
  60 /* ram save/restore */
  61
/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE it was simply renamed.
 */

/*
 * Flag bits for the RAM migration stream.  save_page_header() below
 * receives them OR'ed into the low bits of the page offset.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  77
  78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  79 {
  80     return buffer_is_zero(p, size);
  81 }
  82
/*
 * XBZRLE statistics; this file updates the cache_miss, overflow, pages,
 * bytes and cache_miss_rate members.
 */
XBZRLECacheStats xbzrle_counters;

/*
 * File-scope XBZRLE state: the page cache plus the scratch buffers
 * used while encoding/decoding pages.
 */
static struct {
    /* buffer used for XBZRLE encoding (output of xbzrle_encode_buffer()) */
    uint8_t *encoded_buf;
    /* buffer for storing page content (private copy of the page being sent) */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken via XBZRLE_cache_lock() / released via XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 100
 101 static void XBZRLE_cache_lock(void)
 102 {
 103     if (migrate_use_xbzrle())
 104         qemu_mutex_lock(&XBZRLE.lock);
 105 }
 106
 107 static void XBZRLE_cache_unlock(void)
 108 {
 109     if (migrate_use_xbzrle())
 110         qemu_mutex_unlock(&XBZRLE.lock);
 111 }
 112
 113 /**
 114  * xbzrle_cache_resize: resize the xbzrle cache
 115  *
 116  * This function is called from qmp_migrate_set_cache_size in main
 117  * thread, possibly while a migration is in progress.  A running
 118  * migration may be using the cache and might finish during this call,
 119  * hence changes to the cache are protected by XBZRLE.lock().
 120  *
 121  * Returns 0 for success or -1 for error
 122  *
 123  * @new_size: new cache size
 124  * @errp: set *errp if the check failed, with reason
 125  */
 126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 127 {
 128     PageCache *new_cache;
 129     int64_t ret = 0;
 130
 131     /* Check for truncation */
 132     if (new_size != (size_t)new_size) {
 133         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 134                    "exceeding address space");
 135         return -1;
 136     }
 137
 138     if (new_size == migrate_xbzrle_cache_size()) {
 139         /* nothing to do */
 140         return 0;
 141     }
 142
 143     XBZRLE_cache_lock();
 144
 145     if (XBZRLE.cache != NULL) {
 146         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 147         if (!new_cache) {
 148             ret = -1;
 149             goto out;
 150         }
 151
 152         cache_fini(XBZRLE.cache);
 153         XBZRLE.cache = new_cache;
 154     }
 155 out:
 156     XBZRLE_cache_unlock();
 157     return ret;
 158 }
 159
 160 static bool ramblock_is_ignored(RAMBlock *block)
 161 {
 162     return !qemu_ram_is_migratable(block) ||
 163            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 164 }
 165
/*
 * Filtered RAMBlock iterators.
 *
 * Should be holding either ram_list.mutex, or the RCU lock.
 *
 * Both macros wrap INTERNAL_RAMBLOCK_FOREACH() and append an
 * "if (skip) {} else", so the statement the user writes after the macro
 * becomes the else branch and only runs for blocks that pass the filter.
 */

/* Visit every block for which ramblock_is_ignored() is false. */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

/* Visit every block for which qemu_ram_is_migratable() is true. */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/*
 * NOTE(review): presumably removed so that code below has to pick one
 * of the filtered iterators above -- confirm.
 */
#undef RAMBLOCK_FOREACH
 176
 177 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 178 {
 179     RAMBlock *block;
 180     int ret = 0;
 181
 182     RCU_READ_LOCK_GUARD();
 183
 184     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 185         ret = func(block, opaque);
 186         if (ret) {
 187             break;
 188         }
 189     }
 190     return ret;
 191 }
 192
 193 static void ramblock_recv_map_init(void)
 194 {
 195     RAMBlock *rb;
 196
 197     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 198         assert(!rb->receivedmap);
 199         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 200     }
 201 }
 202
 203 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 204 {
 205     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 206                     rb->receivedmap);
 207 }
 208
 209 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 210 {
 211     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 212 }
 213
 214 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 215 {
 216     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 217 }
 218
 219 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 220                                     size_t nr)
 221 {
 222     bitmap_set_atomic(rb->receivedmap,
 223                       ramblock_recv_bitmap_offset(host_addr, rb),
 224                       nr);
 225 }
 226
 227 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 228
 229 /*
 230  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 231  *
 232  * Returns >0 if success with sent bytes, or <0 if error.
 233  */
 234 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 235                                   const char *block_name)
 236 {
 237     RAMBlock *block = qemu_ram_block_by_name(block_name);
 238     unsigned long *le_bitmap, nbits;
 239     uint64_t size;
 240
 241     if (!block) {
 242         error_report("%s: invalid block name: %s", __func__, block_name);
 243         return -1;
 244     }
 245
 246     nbits = block->used_length >> TARGET_PAGE_BITS;
 247
 248     /*
 249      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 250      * machines we may need 4 more bytes for padding (see below
 251      * comment). So extend it a bit before hand.
 252      */
 253     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 254
 255     /*
 256      * Always use little endian when sending the bitmap. This is
 257      * required that when source and destination VMs are not using the
 258      * same endianess. (Note: big endian won't work.)
 259      */
 260     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 261
 262     /* Size of the bitmap, in bytes */
 263     size = DIV_ROUND_UP(nbits, 8);
 264
 265     /*
 266      * size is always aligned to 8 bytes for 64bit machines, but it
 267      * may not be true for 32bit machines. We need this padding to
 268      * make sure the migration can survive even between 32bit and
 269      * 64bit machines.
 270      */
 271     size = ROUND_UP(size, 8);
 272
 273     qemu_put_be64(file, size);
 274     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 275     /*
 276      * Mark as an end, in case the middle part is screwed up due to
 277      * some "misterious" reason.
 278      */
 279     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 280     qemu_fflush(file);
 281
 282     g_free(le_bitmap);
 283
 284     if (qemu_file_get_error(file)) {
 285         return qemu_file_get_error(file);
 286     }
 287
 288     return size + sizeof(size);
 289 }
 290
/*
 * An outstanding page request, on the source, having been received
 * and queued.
 *
 * Instances are linked through next_req; RAMState.src_page_requests is
 * the queue head declared for this type.
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably the start of the range within rb -- confirm */
    hwaddr    offset;
    /* NOTE(review): presumably the length of the range in bytes -- confirm */
    hwaddr    len;

    /* Queue linkage */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 302
 303 /* State of RAM for migration */
 304 struct RAMState {
 305     /* QEMUFile used for this migration */
 306     QEMUFile *f;
 307     /* Last block that we have visited searching for dirty pages */
 308     RAMBlock *last_seen_block;
 309     /* Last block from where we have sent data */
 310     RAMBlock *last_sent_block;
 311     /* Last dirty target page we have sent */
 312     ram_addr_t last_page;
 313     /* last ram version we have seen */
 314     uint32_t last_version;
 315     /* We are in the first round */
 316     bool ram_bulk_stage;
 317     /* The free page optimization is enabled */
 318     bool fpo_enabled;
 319     /* How many times we have dirty too many pages */
 320     int dirty_rate_high_cnt;
 321     /* these variables are used for bitmap sync */
 322     /* last time we did a full bitmap_sync */
 323     int64_t time_last_bitmap_sync;
 324     /* bytes transferred at start_time */
 325     uint64_t bytes_xfer_prev;
 326     /* number of dirty pages since start_time */
 327     uint64_t num_dirty_pages_period;
 328     /* xbzrle misses since the beginning of the period */
 329     uint64_t xbzrle_cache_miss_prev;
 330
 331     /* compression statistics since the beginning of the period */
 332     /* amount of count that no free thread to compress data */
 333     uint64_t compress_thread_busy_prev;
 334     /* amount bytes after compression */
 335     uint64_t compressed_size_prev;
 336     /* amount of compressed pages */
 337     uint64_t compress_pages_prev;
 338
 339     /* total handled target pages at the beginning of period */
 340     uint64_t target_page_count_prev;
 341     /* total handled target pages since start */
 342     uint64_t target_page_count;
 343     /* number of dirty bits in the bitmap */
 344     uint64_t migration_dirty_pages;
 345     /* Protects modification of the bitmap and migration dirty pages */
 346     QemuMutex bitmap_mutex;
 347     /* The RAMBlock used in the last src_page_requests */
 348     RAMBlock *last_req_rb;
 349     /* Queue of outstanding page requests from the destination */
 350     QemuMutex src_page_req_mutex;
 351     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 352 };
 353 typedef struct RAMState RAMState;
 354
 355 static RAMState *ram_state;
 356
 357 static NotifierWithReturnList precopy_notifier_list;
 358
 359 void precopy_infrastructure_init(void)
 360 {
 361     notifier_with_return_list_init(&precopy_notifier_list);
 362 }
 363
 364 void precopy_add_notifier(NotifierWithReturn *n)
 365 {
 366     notifier_with_return_list_add(&precopy_notifier_list, n);
 367 }
 368
 369 void precopy_remove_notifier(NotifierWithReturn *n)
 370 {
 371     notifier_with_return_remove(n);
 372 }
 373
 374 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 375 {
 376     PrecopyNotifyData pnd;
 377     pnd.reason = reason;
 378     pnd.errp = errp;
 379
 380     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 381 }
 382
 383 void precopy_enable_free_page_optimization(void)
 384 {
 385     if (!ram_state) {
 386         return;
 387     }
 388
 389     ram_state->fpo_enabled = true;
 390 }
 391
 392 uint64_t ram_bytes_remaining(void)
 393 {
 394     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 395                        0;
 396 }
 397
 398 MigrationStats ram_counters;
 399
 400 /* used by the search for pages to send */
 401 struct PageSearchStatus {
 402     /* Current block being searched */
 403     RAMBlock    *block;
 404     /* Current page to search from */
 405     unsigned long page;
 406     /* Set once we wrap around */
 407     bool         complete_round;
 408 };
 409 typedef struct PageSearchStatus PageSearchStatus;
 410
 411 CompressionStats compression_counters;
 412
 413 struct CompressParam {
 414     bool done;
 415     bool quit;
 416     bool zero_page;
 417     QEMUFile *file;
 418     QemuMutex mutex;
 419     QemuCond cond;
 420     RAMBlock *block;
 421     ram_addr_t offset;
 422
 423     /* internally used fields */
 424     z_stream stream;
 425     uint8_t *originbuf;
 426 };
 427 typedef struct CompressParam CompressParam;
 428
 429 struct DecompressParam {
 430     bool done;
 431     bool quit;
 432     QemuMutex mutex;
 433     QemuCond cond;
 434     void *des;
 435     uint8_t *compbuf;
 436     int len;
 437     z_stream stream;
 438 };
 439 typedef struct DecompressParam DecompressParam;
 440
/*
 * Compression worker state: comp_param[i] belongs to compress_threads[i].
 * Both arrays are allocated in compress_threads_save_setup() and released
 * in compress_threads_save_cleanup().
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/*
 * Decompression counterparts of the variables above.
 * NOTE(review): their users are outside this part of the file.
 */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/*
 * Forward declaration, needed by do_data_compress() below.  The returned
 * bool is stored by the caller in CompressParam.zero_page.
 */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 460
 461 static void *do_data_compress(void *opaque)
 462 {
 463     CompressParam *param = opaque;
 464     RAMBlock *block;
 465     ram_addr_t offset;
 466     bool zero_page;
 467
 468     qemu_mutex_lock(&param->mutex);
 469     while (!param->quit) {
 470         if (param->block) {
 471             block = param->block;
 472             offset = param->offset;
 473             param->block = NULL;
 474             qemu_mutex_unlock(&param->mutex);
 475
 476             zero_page = do_compress_ram_page(param->file, &param->stream,
 477                                              block, offset, param->originbuf);
 478
 479             qemu_mutex_lock(&comp_done_lock);
 480             param->done = true;
 481             param->zero_page = zero_page;
 482             qemu_cond_signal(&comp_done_cond);
 483             qemu_mutex_unlock(&comp_done_lock);
 484
 485             qemu_mutex_lock(&param->mutex);
 486         } else {
 487             qemu_cond_wait(&param->cond, &param->mutex);
 488         }
 489     }
 490     qemu_mutex_unlock(&param->mutex);
 491
 492     return NULL;
 493 }
 494
 495 static void compress_threads_save_cleanup(void)
 496 {
 497     int i, thread_count;
 498
 499     if (!migrate_use_compression() || !comp_param) {
 500         return;
 501     }
 502
 503     thread_count = migrate_compress_threads();
 504     for (i = 0; i < thread_count; i++) {
 505         /*
 506          * we use it as a indicator which shows if the thread is
 507          * properly init'd or not
 508          */
 509         if (!comp_param[i].file) {
 510             break;
 511         }
 512
 513         qemu_mutex_lock(&comp_param[i].mutex);
 514         comp_param[i].quit = true;
 515         qemu_cond_signal(&comp_param[i].cond);
 516         qemu_mutex_unlock(&comp_param[i].mutex);
 517
 518         qemu_thread_join(compress_threads + i);
 519         qemu_mutex_destroy(&comp_param[i].mutex);
 520         qemu_cond_destroy(&comp_param[i].cond);
 521         deflateEnd(&comp_param[i].stream);
 522         g_free(comp_param[i].originbuf);
 523         qemu_fclose(comp_param[i].file);
 524         comp_param[i].file = NULL;
 525     }
 526     qemu_mutex_destroy(&comp_done_lock);
 527     qemu_cond_destroy(&comp_done_cond);
 528     g_free(compress_threads);
 529     g_free(comp_param);
 530     compress_threads = NULL;
 531     comp_param = NULL;
 532 }
 533
 534 static int compress_threads_save_setup(void)
 535 {
 536     int i, thread_count;
 537
 538     if (!migrate_use_compression()) {
 539         return 0;
 540     }
 541     thread_count = migrate_compress_threads();
 542     compress_threads = g_new0(QemuThread, thread_count);
 543     comp_param = g_new0(CompressParam, thread_count);
 544     qemu_cond_init(&comp_done_cond);
 545     qemu_mutex_init(&comp_done_lock);
 546     for (i = 0; i < thread_count; i++) {
 547         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 548         if (!comp_param[i].originbuf) {
 549             goto exit;
 550         }
 551
 552         if (deflateInit(&comp_param[i].stream,
 553                         migrate_compress_level()) != Z_OK) {
 554             g_free(comp_param[i].originbuf);
 555             goto exit;
 556         }
 557
 558         /* comp_param[i].file is just used as a dummy buffer to save data,
 559          * set its ops to empty.
 560          */
 561         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 562         comp_param[i].done = true;
 563         comp_param[i].quit = false;
 564         qemu_mutex_init(&comp_param[i].mutex);
 565         qemu_cond_init(&comp_param[i].cond);
 566         qemu_thread_create(compress_threads + i, "compress",
 567                            do_data_compress, comp_param + i,
 568                            QEMU_THREAD_JOINABLE);
 569     }
 570     return 0;
 571
 572 exit:
 573     compress_threads_save_cleanup();
 574     return -1;
 575 }
 576
 577 /**
 578  * save_page_header: write page header to wire
 579  *
 580  * If this is the 1st block, it also writes the block identification
 581  *
 582  * Returns the number of bytes written
 583  *
 584  * @f: QEMUFile where to send the data
 585  * @block: block that contains the page we want to send
 586  * @offset: offset inside the block for the page
 587  *          in the lower bits, it contains flags
 588  */
 589 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 590                                ram_addr_t offset)
 591 {
 592     size_t size, len;
 593
 594     if (block == rs->last_sent_block) {
 595         offset |= RAM_SAVE_FLAG_CONTINUE;
 596     }
 597     qemu_put_be64(f, offset);
 598     size = 8;
 599
 600     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 601         len = strlen(block->idstr);
 602         qemu_put_byte(f, len);
 603         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 604         size += 1 + len;
 605         rs->last_sent_block = block;
 606     }
 607     return size;
 608 }
 609
 610 /**
 611  * mig_throttle_guest_down: throotle down the guest
 612  *
 613  * Reduce amount of guest cpu execution to hopefully slow down memory
 614  * writes. If guest dirty memory rate is reduced below the rate at
 615  * which we can transfer pages to the destination then we should be
 616  * able to complete migration. Some workloads dirty memory way too
 617  * fast and will not effectively converge, even with auto-converge.
 618  */
 619 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 620                                     uint64_t bytes_dirty_threshold)
 621 {
 622     MigrationState *s = migrate_get_current();
 623     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 624     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 625     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 626     int pct_max = s->parameters.max_cpu_throttle;
 627
 628     uint64_t throttle_now = cpu_throttle_get_percentage();
 629     uint64_t cpu_now, cpu_ideal, throttle_inc;
 630
 631     /* We have not started throttling yet. Let's start it. */
 632     if (!cpu_throttle_active()) {
 633         cpu_throttle_set(pct_initial);
 634     } else {
 635         /* Throttling already on, just increase the rate */
 636         if (!pct_tailslow) {
 637             throttle_inc = pct_increment;
 638         } else {
 639             /* Compute the ideal CPU percentage used by Guest, which may
 640              * make the dirty rate match the dirty rate threshold. */
 641             cpu_now = 100 - throttle_now;
 642             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 643                         bytes_dirty_period);
 644             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 645         }
 646         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 647     }
 648 }
 649
 650 /**
 651  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 652  *
 653  * @rs: current RAM state
 654  * @current_addr: address for the zero page
 655  *
 656  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 657  * The important thing is that a stale (not-yet-0'd) page be replaced
 658  * by the new data.
 659  * As a bonus, if the page wasn't in the cache it gets added so that
 660  * when a small write is made into the 0'd page it gets XBZRLE sent.
 661  */
 662 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 663 {
 664     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 665         return;
 666     }
 667
 668     /* We don't care if this fails to allocate a new cache page
 669      * as long as it updated an old one */
 670     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 671                  ram_counters.dirty_sync_count);
 672 }
 673
 674 #define ENCODING_FLAG_XBZRLE 0x1
 675
 676 /**
 677  * save_xbzrle_page: compress and send current page
 678  *
 679  * Returns: 1 means that we wrote the page
 680  *          0 means that page is identical to the one already sent
 681  *          -1 means that xbzrle would be longer than normal
 682  *
 683  * @rs: current RAM state
 684  * @current_data: pointer to the address of the page contents
 685  * @current_addr: addr of the page
 686  * @block: block that contains the page we want to send
 687  * @offset: offset inside the block for the page
 688  * @last_stage: if we are at the completion stage
 689  */
 690 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 691                             ram_addr_t current_addr, RAMBlock *block,
 692                             ram_addr_t offset, bool last_stage)
 693 {
 694     int encoded_len = 0, bytes_xbzrle;
 695     uint8_t *prev_cached_page;
 696
 697     if (!cache_is_cached(XBZRLE.cache, current_addr,
 698                          ram_counters.dirty_sync_count)) {
 699         xbzrle_counters.cache_miss++;
 700         if (!last_stage) {
 701             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 702                              ram_counters.dirty_sync_count) == -1) {
 703                 return -1;
 704             } else {
 705                 /* update *current_data when the page has been
 706                    inserted into cache */
 707                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 708             }
 709         }
 710         return -1;
 711     }
 712
 713     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 714
 715     /* save current buffer into memory */
 716     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 717
 718     /* XBZRLE encoding (if there is no overflow) */
 719     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 720                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 721                                        TARGET_PAGE_SIZE);
 722
 723     /*
 724      * Update the cache contents, so that it corresponds to the data
 725      * sent, in all cases except where we skip the page.
 726      */
 727     if (!last_stage && encoded_len != 0) {
 728         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 729         /*
 730          * In the case where we couldn't compress, ensure that the caller
 731          * sends the data from the cache, since the guest might have
 732          * changed the RAM since we copied it.
 733          */
 734         *current_data = prev_cached_page;
 735     }
 736
 737     if (encoded_len == 0) {
 738         trace_save_xbzrle_page_skipping();
 739         return 0;
 740     } else if (encoded_len == -1) {
 741         trace_save_xbzrle_page_overflow();
 742         xbzrle_counters.overflow++;
 743         return -1;
 744     }
 745
 746     /* Send XBZRLE based compressed page */
 747     bytes_xbzrle = save_page_header(rs, rs->f, block,
 748                                     offset | RAM_SAVE_FLAG_XBZRLE);
 749     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 750     qemu_put_be16(rs->f, encoded_len);
 751     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 752     bytes_xbzrle += encoded_len + 1 + 2;
 753     xbzrle_counters.pages++;
 754     xbzrle_counters.bytes += bytes_xbzrle;
 755     ram_counters.transferred += bytes_xbzrle;
 756
 757     return 1;
 758 }
 759
 760 /**
 761  * migration_bitmap_find_dirty: find the next dirty page from start
 762  *
 763  * Returns the page offset within memory region of the start of a dirty page
 764  *
 765  * @rs: current RAM state
 766  * @rb: RAMBlock where to search for dirty pages
 767  * @start: page where we start the search
 768  */
 769 static inline
 770 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 771                                           unsigned long start)
 772 {
 773     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 774     unsigned long *bitmap = rb->bmap;
 775     unsigned long next;
 776
 777     if (ramblock_is_ignored(rb)) {
 778         return size;
 779     }
 780
 781     /*
 782      * When the free page optimization is enabled, we need to check the bitmap
 783      * to send the non-free pages rather than all the pages in the bulk stage.
 784      */
 785     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
 786         next = start + 1;
 787     } else {
 788         next = find_next_bit(bitmap, size, start);
 789     }
 790
 791     return next;
 792 }
 793
 794 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 795                                                 RAMBlock *rb,
 796                                                 unsigned long page)
 797 {
 798     bool ret;
 799
 800     qemu_mutex_lock(&rs->bitmap_mutex);
 801
 802     /*
 803      * Clear dirty bitmap if needed.  This _must_ be called before we
 804      * send any of the page in the chunk because we need to make sure
 805      * we can capture further page content changes when we sync dirty
 806      * log the next time.  So as long as we are going to send any of
 807      * the page in the chunk we clear the remote dirty bitmap for all.
 808      * Clearing it earlier won't be a problem, but too late will.
 809      */
 810     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
 811         uint8_t shift = rb->clear_bmap_shift;
 812         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
 813         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
 814
 815         /*
 816          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 817          * can make things easier sometimes since then start address
 818          * of the small chunk will always be 64 pages aligned so the
 819          * bitmap will always be aligned to unsigned long.  We should
 820          * even be able to remove this restriction but I'm simply
 821          * keeping it.
 822          */
 823         assert(shift >= 6);
 824         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 825         memory_region_clear_dirty_bitmap(rb->mr, start, size);
 826     }
 827
 828     ret = test_and_clear_bit(page, rb->bmap);
 829
 830     if (ret) {
 831         rs->migration_dirty_pages--;
 832     }
 833     qemu_mutex_unlock(&rs->bitmap_mutex);
 834
 835     return ret;
 836 }
 837
 838 /* Called with RCU critical section */
 839 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 840 {
 841     rs->migration_dirty_pages +=
 842         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
 843                                               &rs->num_dirty_pages_period);
 844 }
 845
 846 /**
 847  * ram_pagesize_summary: calculate all the pagesizes of a VM
 848  *
 849  * Returns a summary bitmap of the page sizes of all RAMBlocks
 850  *
 851  * For VMs with just normal pages this is equivalent to the host page
 852  * size. If it's got some huge pages then it's the OR of all the
 853  * different page sizes.
 854  */
 855 uint64_t ram_pagesize_summary(void)
 856 {
 857     RAMBlock *block;
 858     uint64_t summary = 0;
 859
 860     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 861         summary |= block->page_size;
 862     }
 863
 864     return summary;
 865 }
 866
 867 uint64_t ram_get_total_transferred_pages(void)
 868 {
 869     return  ram_counters.normal + ram_counters.duplicate +
 870                 compression_counters.pages + xbzrle_counters.pages;
 871 }
 872
 873 static void migration_update_rates(RAMState *rs, int64_t end_time)
 874 {
 875     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 876     double compressed_size;
 877
 878     /* calculate period counters */
 879     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 880                 / (end_time - rs->time_last_bitmap_sync);
 881
 882     if (!page_count) {
 883         return;
 884     }
 885
 886     if (migrate_use_xbzrle()) {
 887         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 888             rs->xbzrle_cache_miss_prev) / page_count;
 889         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 890     }
 891
 892     if (migrate_use_compression()) {
 893         compression_counters.busy_rate = (double)(compression_counters.busy -
 894             rs->compress_thread_busy_prev) / page_count;
 895         rs->compress_thread_busy_prev = compression_counters.busy;
 896
 897         compressed_size = compression_counters.compressed_size -
 898                           rs->compressed_size_prev;
 899         if (compressed_size) {
 900             double uncompressed_size = (compression_counters.pages -
 901                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 902
 903             /* Compression-Ratio = Uncompressed-size / Compressed-size */
 904             compression_counters.compression_rate =
 905                                         uncompressed_size / compressed_size;
 906
 907             rs->compress_pages_prev = compression_counters.pages;
 908             rs->compressed_size_prev = compression_counters.compressed_size;
 909         }
 910     }
 911 }
 912
 913 static void migration_trigger_throttle(RAMState *rs)
 914 {
 915     MigrationState *s = migrate_get_current();
 916     uint64_t threshold = s->parameters.throttle_trigger_threshold;
 917
 918     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
 919     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 920     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 921
 922     /* During block migration the auto-converge logic incorrectly detects
 923      * that ram migration makes no progress. Avoid this by disabling the
 924      * throttling logic during the bulk phase of block migration. */
 925     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 926         /* The following detection logic can be refined later. For now:
 927            Check to see if the ratio between dirtied bytes and the approx.
 928            amount of bytes that just got transferred since the last time
 929            we were in this routine reaches the threshold. If that happens
 930            twice, start or increase throttling. */
 931
 932         if ((bytes_dirty_period > bytes_dirty_threshold) &&
 933             (++rs->dirty_rate_high_cnt >= 2)) {
 934             trace_migration_throttle();
 935             rs->dirty_rate_high_cnt = 0;
 936             mig_throttle_guest_down(bytes_dirty_period,
 937                                     bytes_dirty_threshold);
 938         }
 939     }
 940 }
 941
/*
 * Run one dirty bitmap synchronisation pass.
 *
 * Bumps the sync counter, syncs the dirty log into every non-ignored
 * RAMBlock's migration bitmap, refreshes ram_counters.remaining and, when
 * more than one second has elapsed since the current accounting period
 * started, evaluates throttling, updates the rate statistics and starts
 * a new period.  A MIGRATION_PASS event is emitted if events are enabled.
 *
 * @rs: current RAM state
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    /* First sync: there is no period start yet, so begin one now */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    /*
     * The per-block sync and the "remaining" computation both run with
     * the bitmap mutex held and inside an RCU read-side section.
     */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        /* Both helpers read the period counters, so call them before reset */
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = ram_counters.transferred;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}
 987
 988 static void migration_bitmap_sync_precopy(RAMState *rs)
 989 {
 990     Error *local_err = NULL;
 991
 992     /*
 993      * The current notifier usage is just an optimization to migration, so we
 994      * don't stop the normal migration process in the error case.
 995      */
 996     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
 997         error_report_err(local_err);
 998         local_err = NULL;
 999     }
1000
1001     migration_bitmap_sync(rs);
1002
1003     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1004         error_report_err(local_err);
1005     }
1006 }
1007
1008 /**
1009  * save_zero_page_to_file: send the zero page to the file
1010  *
1011  * Returns the size of data written to the file, 0 means the page is not
1012  * a zero page
1013  *
1014  * @rs: current RAM state
1015  * @file: the file where the data is saved
1016  * @block: block that contains the page we want to send
1017  * @offset: offset inside the block for the page
1018  */
1019 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1020                                   RAMBlock *block, ram_addr_t offset)
1021 {
1022     uint8_t *p = block->host + offset;
1023     int len = 0;
1024
1025     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1026         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1027         qemu_put_byte(file, 0);
1028         len += 1;
1029     }
1030     return len;
1031 }
1032
1033 /**
1034  * save_zero_page: send the zero page to the stream
1035  *
1036  * Returns the number of pages written.
1037  *
1038  * @rs: current RAM state
1039  * @block: block that contains the page we want to send
1040  * @offset: offset inside the block for the page
1041  */
1042 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1043 {
1044     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1045
1046     if (len) {
1047         ram_counters.duplicate++;
1048         ram_counters.transferred += len;
1049         return 1;
1050     }
1051     return -1;
1052 }
1053
1054 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1055 {
1056     if (!migrate_release_ram() || !migration_in_postcopy()) {
1057         return;
1058     }
1059
1060     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1061 }
1062
1063 /*
1064  * @pages: the number of pages written by the control path,
1065  *        < 0 - error
1066  *        > 0 - number of pages written
1067  *
1068  * Return true if the pages has been saved, otherwise false is returned.
1069  */
1070 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1071                               int *pages)
1072 {
1073     uint64_t bytes_xmit = 0;
1074     int ret;
1075
1076     *pages = -1;
1077     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1078                                 &bytes_xmit);
1079     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1080         return false;
1081     }
1082
1083     if (bytes_xmit) {
1084         ram_counters.transferred += bytes_xmit;
1085         *pages = 1;
1086     }
1087
1088     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1089         return true;
1090     }
1091
1092     if (bytes_xmit > 0) {
1093         ram_counters.normal++;
1094     } else if (bytes_xmit == 0) {
1095         ram_counters.duplicate++;
1096     }
1097
1098     return true;
1099 }
1100
1101 /*
1102  * directly send the page to the stream
1103  *
1104  * Returns the number of pages written.
1105  *
1106  * @rs: current RAM state
1107  * @block: block that contains the page we want to send
1108  * @offset: offset inside the block for the page
1109  * @buf: the page to be sent
1110  * @async: send to page asyncly
1111  */
1112 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1113                             uint8_t *buf, bool async)
1114 {
1115     ram_counters.transferred += save_page_header(rs, rs->f, block,
1116                                                  offset | RAM_SAVE_FLAG_PAGE);
1117     if (async) {
1118         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1119                               migrate_release_ram() &
1120                               migration_in_postcopy());
1121     } else {
1122         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1123     }
1124     ram_counters.transferred += TARGET_PAGE_SIZE;
1125     ram_counters.normal++;
1126     return 1;
1127 }
1128
1129 /**
1130  * ram_save_page: send the given page to the stream
1131  *
1132  * Returns the number of pages written.
1133  *          < 0 - error
1134  *          >=0 - Number of pages written - this might legally be 0
1135  *                if xbzrle noticed the page was the same.
1136  *
1137  * @rs: current RAM state
1138  * @block: block that contains the page we want to send
1139  * @offset: offset inside the block for the page
1140  * @last_stage: if we are at the completion stage
1141  */
1142 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1143 {
1144     int pages = -1;
1145     uint8_t *p;
1146     bool send_async = true;
1147     RAMBlock *block = pss->block;
1148     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1149     ram_addr_t current_addr = block->offset + offset;
1150
1151     p = block->host + offset;
1152     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1153
1154     XBZRLE_cache_lock();
1155     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1156         migrate_use_xbzrle()) {
1157         pages = save_xbzrle_page(rs, &p, current_addr, block,
1158                                  offset, last_stage);
1159         if (!last_stage) {
1160             /* Can't send this cached data async, since the cache page
1161              * might get updated before it gets to the wire
1162              */
1163             send_async = false;
1164         }
1165     }
1166
1167     /* XBZRLE overflow or normal page */
1168     if (pages == -1) {
1169         pages = save_normal_page(rs, block, offset, p, send_async);
1170     }
1171
1172     XBZRLE_cache_unlock();
1173
1174     return pages;
1175 }
1176
1177 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1178                                  ram_addr_t offset)
1179 {
1180     if (multifd_queue_page(rs->f, block, offset) < 0) {
1181         return -1;
1182     }
1183     ram_counters.normal++;
1184
1185     return 1;
1186 }
1187
1188 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1189                                  ram_addr_t offset, uint8_t *source_buf)
1190 {
1191     RAMState *rs = ram_state;
1192     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1193     bool zero_page = false;
1194     int ret;
1195
1196     if (save_zero_page_to_file(rs, f, block, offset)) {
1197         zero_page = true;
1198         goto exit;
1199     }
1200
1201     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1202
1203     /*
1204      * copy it to a internal buffer to avoid it being modified by VM
1205      * so that we can catch up the error during compression and
1206      * decompression
1207      */
1208     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1209     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1210     if (ret < 0) {
1211         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1212         error_report("compressed data failed!");
1213         return false;
1214     }
1215
1216 exit:
1217     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1218     return zero_page;
1219 }
1220
1221 static void
1222 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1223 {
1224     ram_counters.transferred += bytes_xmit;
1225
1226     if (param->zero_page) {
1227         ram_counters.duplicate++;
1228         return;
1229     }
1230
1231     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1232     compression_counters.compressed_size += bytes_xmit - 8;
1233     compression_counters.pages++;
1234 }
1235
1236 static bool save_page_use_compression(RAMState *rs);
1237
1238 static void flush_compressed_data(RAMState *rs)
1239 {
1240     int idx, len, thread_count;
1241
1242     if (!save_page_use_compression(rs)) {
1243         return;
1244     }
1245     thread_count = migrate_compress_threads();
1246
1247     qemu_mutex_lock(&comp_done_lock);
1248     for (idx = 0; idx < thread_count; idx++) {
1249         while (!comp_param[idx].done) {
1250             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1251         }
1252     }
1253     qemu_mutex_unlock(&comp_done_lock);
1254
1255     for (idx = 0; idx < thread_count; idx++) {
1256         qemu_mutex_lock(&comp_param[idx].mutex);
1257         if (!comp_param[idx].quit) {
1258             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1259             /*
1260              * it's safe to fetch zero_page without holding comp_done_lock
1261              * as there is no further request submitted to the thread,
1262              * i.e, the thread should be waiting for a request at this point.
1263              */
1264             update_compress_thread_counts(&comp_param[idx], len);
1265         }
1266         qemu_mutex_unlock(&comp_param[idx].mutex);
1267     }
1268 }
1269
1270 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1271                                        ram_addr_t offset)
1272 {
1273     param->block = block;
1274     param->offset = offset;
1275 }
1276
1277 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1278                                            ram_addr_t offset)
1279 {
1280     int idx, thread_count, bytes_xmit = -1, pages = -1;
1281     bool wait = migrate_compress_wait_thread();
1282
1283     thread_count = migrate_compress_threads();
1284     qemu_mutex_lock(&comp_done_lock);
1285 retry:
1286     for (idx = 0; idx < thread_count; idx++) {
1287         if (comp_param[idx].done) {
1288             comp_param[idx].done = false;
1289             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1290             qemu_mutex_lock(&comp_param[idx].mutex);
1291             set_compress_params(&comp_param[idx], block, offset);
1292             qemu_cond_signal(&comp_param[idx].cond);
1293             qemu_mutex_unlock(&comp_param[idx].mutex);
1294             pages = 1;
1295             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1296             break;
1297         }
1298     }
1299
1300     /*
1301      * wait for the free thread if the user specifies 'compress-wait-thread',
1302      * otherwise we will post the page out in the main thread as normal page.
1303      */
1304     if (pages < 0 && wait) {
1305         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1306         goto retry;
1307     }
1308     qemu_mutex_unlock(&comp_done_lock);
1309
1310     return pages;
1311 }
1312
1313 /**
1314  * find_dirty_block: find the next dirty page and update any state
1315  * associated with the search process.
1316  *
1317  * Returns true if a page is found
1318  *
1319  * @rs: current RAM state
1320  * @pss: data about the state of the current dirty page scan
1321  * @again: set to false if the search has scanned the whole of RAM
1322  */
1323 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1324 {
1325     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1326     if (pss->complete_round && pss->block == rs->last_seen_block &&
1327         pss->page >= rs->last_page) {
1328         /*
1329          * We've been once around the RAM and haven't found anything.
1330          * Give up.
1331          */
1332         *again = false;
1333         return false;
1334     }
1335     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1336         >= pss->block->used_length) {
1337         /* Didn't find anything in this RAM Block */
1338         pss->page = 0;
1339         pss->block = QLIST_NEXT_RCU(pss->block, next);
1340         if (!pss->block) {
1341             /*
1342              * If memory migration starts over, we will meet a dirtied page
1343              * which may still exists in compression threads's ring, so we
1344              * should flush the compressed data to make sure the new page
1345              * is not overwritten by the old one in the destination.
1346              *
1347              * Also If xbzrle is on, stop using the data compression at this
1348              * point. In theory, xbzrle can do better than compression.
1349              */
1350             flush_compressed_data(rs);
1351
1352             /* Hit the end of the list */
1353             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1354             /* Flag that we've looped */
1355             pss->complete_round = true;
1356             rs->ram_bulk_stage = false;
1357         }
1358         /* Didn't find anything this time, but try again on the new block */
1359         *again = true;
1360         return false;
1361     } else {
1362         /* Can go around again, but... */
1363         *again = true;
1364         /* We've found something so probably don't need to */
1365         return true;
1366     }
1367 }
1368
1369 /**
1370  * unqueue_page: gets a page of the queue
1371  *
1372  * Helper for 'get_queued_page' - gets a page off the queue
1373  *
1374  * Returns the block of the page (or NULL if none available)
1375  *
1376  * @rs: current RAM state
1377  * @offset: used to return the offset within the RAMBlock
1378  */
1379 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1380 {
1381     RAMBlock *block = NULL;
1382
1383     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1384         return NULL;
1385     }
1386
1387     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1388     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1389         struct RAMSrcPageRequest *entry =
1390                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1391         block = entry->rb;
1392         *offset = entry->offset;
1393
1394         if (entry->len > TARGET_PAGE_SIZE) {
1395             entry->len -= TARGET_PAGE_SIZE;
1396             entry->offset += TARGET_PAGE_SIZE;
1397         } else {
1398             memory_region_unref(block->mr);
1399             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1400             g_free(entry);
1401             migration_consume_urgent_request();
1402         }
1403     }
1404
1405     return block;
1406 }
1407
1408 /**
1409  * get_queued_page: unqueue a page from the postcopy requests
1410  *
1411  * Skips pages that are already sent (!dirty)
1412  *
1413  * Returns true if a queued page is found
1414  *
1415  * @rs: current RAM state
1416  * @pss: data about the state of the current dirty page scan
1417  */
1418 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1419 {
1420     RAMBlock  *block;
1421     ram_addr_t offset;
1422     bool dirty;
1423
1424     do {
1425         block = unqueue_page(rs, &offset);
1426         /*
1427          * We're sending this page, and since it's postcopy nothing else
1428          * will dirty it, and we must make sure it doesn't get sent again
1429          * even if this queue request was received after the background
1430          * search already sent it.
1431          */
1432         if (block) {
1433             unsigned long page;
1434
1435             page = offset >> TARGET_PAGE_BITS;
1436             dirty = test_bit(page, block->bmap);
1437             if (!dirty) {
1438                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1439                                                 page);
1440             } else {
1441                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1442             }
1443         }
1444
1445     } while (block && !dirty);
1446
1447     if (block) {
1448         /*
1449          * As soon as we start servicing pages out of order, then we have
1450          * to kill the bulk stage, since the bulk stage assumes
1451          * in (migration_bitmap_find_and_reset_dirty) that every page is
1452          * dirty, that's no longer true.
1453          */
1454         rs->ram_bulk_stage = false;
1455
1456         /*
1457          * We want the background search to continue from the queued page
1458          * since the guest is likely to want other pages near to the page
1459          * it just requested.
1460          */
1461         pss->block = block;
1462         pss->page = offset >> TARGET_PAGE_BITS;
1463
1464         /*
1465          * This unqueued page would break the "one round" check, even is
1466          * really rare.
1467          */
1468         pss->complete_round = false;
1469     }
1470
1471     return !!block;
1472 }
1473
1474 /**
1475  * migration_page_queue_free: drop any remaining pages in the ram
1476  * request queue
1477  *
1478  * It should be empty at the end anyway, but in error cases there may
1479  * be some left.  in case that there is any page left, we drop it.
1480  *
1481  */
1482 static void migration_page_queue_free(RAMState *rs)
1483 {
1484     struct RAMSrcPageRequest *mspr, *next_mspr;
1485     /* This queue generally should be empty - but in the case of a failed
1486      * migration might have some droppings in.
1487      */
1488     RCU_READ_LOCK_GUARD();
1489     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1490         memory_region_unref(mspr->rb->mr);
1491         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1492         g_free(mspr);
1493     }
1494 }
1495
1496 /**
1497  * ram_save_queue_pages: queue the page for transmission
1498  *
1499  * A request from postcopy destination for example.
1500  *
1501  * Returns zero on success or negative on error
1502  *
1503  * @rbname: Name of the RAMBLock of the request. NULL means the
1504  *          same that last one.
1505  * @start: starting address from the start of the RAMBlock
1506  * @len: length (in bytes) to send
1507  */
1508 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1509 {
1510     RAMBlock *ramblock;
1511     RAMState *rs = ram_state;
1512
1513     ram_counters.postcopy_requests++;
1514     RCU_READ_LOCK_GUARD();
1515
1516     if (!rbname) {
1517         /* Reuse last RAMBlock */
1518         ramblock = rs->last_req_rb;
1519
1520         if (!ramblock) {
1521             /*
1522              * Shouldn't happen, we can't reuse the last RAMBlock if
1523              * it's the 1st request.
1524              */
1525             error_report("ram_save_queue_pages no previous block");
1526             return -1;
1527         }
1528     } else {
1529         ramblock = qemu_ram_block_by_name(rbname);
1530
1531         if (!ramblock) {
1532             /* We shouldn't be asked for a non-existent RAMBlock */
1533             error_report("ram_save_queue_pages no block '%s'", rbname);
1534             return -1;
1535         }
1536         rs->last_req_rb = ramblock;
1537     }
1538     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1539     if (start+len > ramblock->used_length) {
1540         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1541                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1542                      __func__, start, len, ramblock->used_length);
1543         return -1;
1544     }
1545
1546     struct RAMSrcPageRequest *new_entry =
1547         g_malloc0(sizeof(struct RAMSrcPageRequest));
1548     new_entry->rb = ramblock;
1549     new_entry->offset = start;
1550     new_entry->len = len;
1551
1552     memory_region_ref(ramblock->mr);
1553     qemu_mutex_lock(&rs->src_page_req_mutex);
1554     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1555     migration_make_urgent_request();
1556     qemu_mutex_unlock(&rs->src_page_req_mutex);
1557
1558     return 0;
1559 }
1560
1561 static bool save_page_use_compression(RAMState *rs)
1562 {
1563     if (!migrate_use_compression()) {
1564         return false;
1565     }
1566
1567     /*
1568      * If xbzrle is on, stop using the data compression after first
1569      * round of migration even if compression is enabled. In theory,
1570      * xbzrle can do better than compression.
1571      */
1572     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1573         return true;
1574     }
1575
1576     return false;
1577 }
1578
1579 /*
1580  * try to compress the page before posting it out, return true if the page
1581  * has been properly handled by compression, otherwise needs other
1582  * paths to handle it
1583  */
1584 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1585 {
1586     if (!save_page_use_compression(rs)) {
1587         return false;
1588     }
1589
1590     /*
1591      * When starting the process of a new block, the first page of
1592      * the block should be sent out before other pages in the same
1593      * block, and all the pages in last block should have been sent
1594      * out, keeping this order is important, because the 'cont' flag
1595      * is used to avoid resending the block name.
1596      *
1597      * We post the fist page as normal page as compression will take
1598      * much CPU resource.
1599      */
1600     if (block != rs->last_sent_block) {
1601         flush_compressed_data(rs);
1602         return false;
1603     }
1604
1605     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1606         return true;
1607     }
1608
1609     compression_counters.busy++;
1610     return false;
1611 }
1612
1613 /**
1614  * ram_save_target_page: save one target page
1615  *
1616  * Returns the number of pages written
1617  *
1618  * @rs: current RAM state
1619  * @pss: data about the page we want to send
1620  * @last_stage: if we are at the completion stage
1621  */
1622 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1623                                 bool last_stage)
1624 {
1625     RAMBlock *block = pss->block;
1626     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1627     int res;
1628
1629     if (control_save_page(rs, block, offset, &res)) {
1630         return res;
1631     }
1632
1633     if (save_compress_page(rs, block, offset)) {
1634         return 1;
1635     }
1636
1637     res = save_zero_page(rs, block, offset);
1638     if (res > 0) {
1639         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1640          * page would be stale
1641          */
1642         if (!save_page_use_compression(rs)) {
1643             XBZRLE_cache_lock();
1644             xbzrle_cache_zero_page(rs, block->offset + offset);
1645             XBZRLE_cache_unlock();
1646         }
1647         ram_release_pages(block->idstr, offset, res);
1648         return res;
1649     }
1650
1651     /*
1652      * Do not use multifd for:
1653      * 1. Compression as the first page in the new block should be posted out
1654      *    before sending the compressed page
1655      * 2. In postcopy as one whole host page should be placed
1656      */
1657     if (!save_page_use_compression(rs) && migrate_use_multifd()
1658         && !migration_in_postcopy()) {
1659         return ram_save_multifd_page(rs, block, offset);
1660     }
1661
1662     return ram_save_page(rs, pss, last_stage);
1663 }
1664
1665 /**
1666  * ram_save_host_page: save a whole host page
1667  *
1668  * Starting at *offset send pages up to the end of the current host
1669  * page. It's valid for the initial offset to point into the middle of
1670  * a host page in which case the remainder of the hostpage is sent.
1671  * Only dirty target pages are sent. Note that the host page size may
1672  * be a huge page for this block.
1673  * The saving stops at the boundary of the used_length of the block
1674  * if the RAMBlock isn't a multiple of the host page size.
1675  *
1676  * Returns the number of pages written or negative on error
1677  *
1678  * @rs: current RAM state
1679  * @ms: current migration state
1680  * @pss: data about the page we want to send
1681  * @last_stage: if we are at the completion stage
1682  */
1683 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1684                               bool last_stage)
1685 {
1686     int tmppages, pages = 0;
1687     size_t pagesize_bits =
1688         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1689
1690     if (ramblock_is_ignored(pss->block)) {
1691         error_report("block %s should not be migrated !", pss->block->idstr);
1692         return 0;
1693     }
1694
1695     do {
1696         /* Check the pages is dirty and if it is send it */
1697         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1698             pss->page++;
1699             continue;
1700         }
1701
1702         tmppages = ram_save_target_page(rs, pss, last_stage);
1703         if (tmppages < 0) {
1704             return tmppages;
1705         }
1706
1707         pages += tmppages;
1708         pss->page++;
1709         /* Allow rate limiting to happen in the middle of huge pages */
1710         migration_rate_limit();
1711     } while ((pss->page & (pagesize_bits - 1)) &&
1712              offset_in_ramblock(pss->block,
1713                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1714
1715     /* The offset we leave with is the last one we looked at */
1716     pss->page--;
1717     return pages;
1718 }
1719
1720 /**
1721  * ram_find_and_save_block: finds a dirty page and sends it to f
1722  *
1723  * Called within an RCU critical section.
1724  *
1725  * Returns the number of pages written where zero means no dirty pages,
1726  * or negative on error
1727  *
1728  * @rs: current RAM state
1729  * @last_stage: if we are at the completion stage
1730  *
1731  * On systems where host-page-size > target-page-size it will send all the
1732  * pages in a host page that are dirty.
1733  */
1734
1735 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1736 {
1737     PageSearchStatus pss;
1738     int pages = 0;
1739     bool again, found;
1740
1741     /* No dirty page as there is zero RAM */
1742     if (!ram_bytes_total()) {
1743         return pages;
1744     }
1745
1746     pss.block = rs->last_seen_block;
1747     pss.page = rs->last_page;
1748     pss.complete_round = false;
1749
1750     if (!pss.block) {
1751         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1752     }
1753
1754     do {
1755         again = true;
1756         found = get_queued_page(rs, &pss);
1757
1758         if (!found) {
1759             /* priority queue empty, so just search for something dirty */
1760             found = find_dirty_block(rs, &pss, &again);
1761         }
1762
1763         if (found) {
1764             pages = ram_save_host_page(rs, &pss, last_stage);
1765         }
1766     } while (!pages && again);
1767
1768     rs->last_seen_block = pss.block;
1769     rs->last_page = pss.page;
1770
1771     return pages;
1772 }
1773
1774 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1775 {
1776     uint64_t pages = size / TARGET_PAGE_SIZE;
1777
1778     if (zero) {
1779         ram_counters.duplicate += pages;
1780     } else {
1781         ram_counters.normal += pages;
1782         ram_counters.transferred += size;
1783         qemu_update_position(f, size);
1784     }
1785 }
1786
1787 static uint64_t ram_bytes_total_common(bool count_ignored)
1788 {
1789     RAMBlock *block;
1790     uint64_t total = 0;
1791
1792     RCU_READ_LOCK_GUARD();
1793
1794     if (count_ignored) {
1795         RAMBLOCK_FOREACH_MIGRATABLE(block) {
1796             total += block->used_length;
1797         }
1798     } else {
1799         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1800             total += block->used_length;
1801         }
1802     }
1803     return total;
1804 }
1805
/* Total used_length of all RAMBlocks that are not ignored */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
1810
1811 static void xbzrle_load_setup(void)
1812 {
1813     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1814 }
1815
1816 static void xbzrle_load_cleanup(void)
1817 {
1818     g_free(XBZRLE.decoded_buf);
1819     XBZRLE.decoded_buf = NULL;
1820 }
1821
1822 static void ram_state_cleanup(RAMState **rsp)
1823 {
1824     if (*rsp) {
1825         migration_page_queue_free(*rsp);
1826         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1827         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1828         g_free(*rsp);
1829         *rsp = NULL;
1830     }
1831 }
1832
1833 static void xbzrle_cleanup(void)
1834 {
1835     XBZRLE_cache_lock();
1836     if (XBZRLE.cache) {
1837         cache_fini(XBZRLE.cache);
1838         g_free(XBZRLE.encoded_buf);
1839         g_free(XBZRLE.current_buf);
1840         g_free(XBZRLE.zero_target_page);
1841         XBZRLE.cache = NULL;
1842         XBZRLE.encoded_buf = NULL;
1843         XBZRLE.current_buf = NULL;
1844         XBZRLE.zero_target_page = NULL;
1845     }
1846     XBZRLE_cache_unlock();
1847 }
1848
1849 static void ram_save_cleanup(void *opaque)
1850 {
1851     RAMState **rsp = opaque;
1852     RAMBlock *block;
1853
1854     /* caller have hold iothread lock or is in a bh, so there is
1855      * no writing race against the migration bitmap
1856      */
1857     memory_global_dirty_log_stop();
1858
1859     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1860         g_free(block->clear_bmap);
1861         block->clear_bmap = NULL;
1862         g_free(block->bmap);
1863         block->bmap = NULL;
1864     }
1865
1866     xbzrle_cleanup();
1867     compress_threads_save_cleanup();
1868     ram_state_cleanup(rsp);
1869 }
1870
1871 static void ram_state_reset(RAMState *rs)
1872 {
1873     rs->last_seen_block = NULL;
1874     rs->last_sent_block = NULL;
1875     rs->last_page = 0;
1876     rs->last_version = ram_list.version;
1877     rs->ram_bulk_stage = true;
1878     rs->fpo_enabled = false;
1879 }
1880
1881 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1882
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr for debugging
 *
 * The bitmap is printed up to 128 pages per line, '1' for a set bit and
 * '.' for a clear one, each line prefixed by the hexadecimal index of its
 * first page.  Lines whose bits all equal @expected are not printed.
 *
 * @todump: bitmap to dump.  There is no default bitmap to fall back to,
 *          so nothing is printed when this is NULL.
 * @expected: the value the bitmap is mostly expected to be full of
 * @pages: number of bits of @todump to examine
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    /* A NULL bitmap used to be handed straight to test_bit() */
    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;

        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            if (thisbit != expected) {
                found = true;
            }
        }
        if (found) {
            /* curb == linelen here, which is at most 128 */
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
1916
1917 /* **** functions for postcopy ***** */
1918
1919 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1920 {
1921     struct RAMBlock *block;
1922
1923     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1924         unsigned long *bitmap = block->bmap;
1925         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1926         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1927
1928         while (run_start < range) {
1929             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1930             ram_discard_range(block->idstr,
1931                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1932                               ((ram_addr_t)(run_end - run_start))
1933                                 << TARGET_PAGE_BITS);
1934             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1935         }
1936     }
1937 }
1938
1939 /**
1940  * postcopy_send_discard_bm_ram: discard a RAMBlock
1941  *
1942  * Returns zero on success
1943  *
1944  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1945  *
1946  * @ms: current migration state
1947  * @block: RAMBlock to discard
1948  */
1949 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1950 {
1951     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1952     unsigned long current;
1953     unsigned long *bitmap = block->bmap;
1954
1955     for (current = 0; current < end; ) {
1956         unsigned long one = find_next_bit(bitmap, end, current);
1957         unsigned long zero, discard_length;
1958
1959         if (one >= end) {
1960             break;
1961         }
1962
1963         zero = find_next_zero_bit(bitmap, end, one + 1);
1964
1965         if (zero >= end) {
1966             discard_length = end - one;
1967         } else {
1968             discard_length = zero - one;
1969         }
1970         postcopy_discard_send_range(ms, one, discard_length);
1971         current = one + discard_length;
1972     }
1973
1974     return 0;
1975 }
1976
1977 /**
1978  * postcopy_each_ram_send_discard: discard all RAMBlocks
1979  *
1980  * Returns 0 for success or negative for error
1981  *
1982  * Utility for the outgoing postcopy code.
1983  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1984  *   passing it bitmap indexes and name.
1985  * (qemu_ram_foreach_block ends up passing unscaled lengths
1986  *  which would mean postcopy code would have to deal with target page)
1987  *
1988  * @ms: current migration state
1989  */
1990 static int postcopy_each_ram_send_discard(MigrationState *ms)
1991 {
1992     struct RAMBlock *block;
1993     int ret;
1994
1995     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1996         postcopy_discard_send_init(ms, block->idstr);
1997
1998         /*
1999          * Postcopy sends chunks of bitmap over the wire, but it
2000          * just needs indexes at this point, avoids it having
2001          * target page specific code.
2002          */
2003         ret = postcopy_send_discard_bm_ram(ms, block);
2004         postcopy_discard_send_finish(ms);
2005         if (ret) {
2006             return ret;
2007         }
2008     }
2009
2010     return 0;
2011 }
2012
2013 /**
2014  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2015  *
2016  * Helper for postcopy_chunk_hostpages; it's called twice to
2017  * canonicalize the two bitmaps, that are similar, but one is
2018  * inverted.
2019  *
2020  * Postcopy requires that all target pages in a hostpage are dirty or
2021  * clean, not a mix.  This function canonicalizes the bitmaps.
2022  *
2023  * @ms: current migration state
2024  * @block: block that contains the page we want to canonicalize
2025  */
2026 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2027 {
2028     RAMState *rs = ram_state;
2029     unsigned long *bitmap = block->bmap;
2030     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2031     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2032     unsigned long run_start;
2033
2034     if (block->page_size == TARGET_PAGE_SIZE) {
2035         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2036         return;
2037     }
2038
2039     /* Find a dirty page */
2040     run_start = find_next_bit(bitmap, pages, 0);
2041
2042     while (run_start < pages) {
2043
2044         /*
2045          * If the start of this run of pages is in the middle of a host
2046          * page, then we need to fixup this host page.
2047          */
2048         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2049             /* Find the end of this run */
2050             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2051             /*
2052              * If the end isn't at the start of a host page, then the
2053              * run doesn't finish at the end of a host page
2054              * and we need to discard.
2055              */
2056         }
2057
2058         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2059             unsigned long page;
2060             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2061                                                              host_ratio);
2062             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2063
2064             /* Clean up the bitmap */
2065             for (page = fixup_start_addr;
2066                  page < fixup_start_addr + host_ratio; page++) {
2067                 /*
2068                  * Remark them as dirty, updating the count for any pages
2069                  * that weren't previously dirty.
2070                  */
2071                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2072             }
2073         }
2074
2075         /* Find the next dirty page for the next iteration */
2076         run_start = find_next_bit(bitmap, pages, run_start);
2077     }
2078 }
2079
2080 /**
2081  * postcopy_chunk_hostpages: discard any partially sent host page
2082  *
2083  * Utility for the outgoing postcopy code.
2084  *
2085  * Discard any partially sent host-page size chunks, mark any partially
2086  * dirty host-page size chunks as all dirty.  In this case the host-page
2087  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2088  *
2089  * Returns zero on success
2090  *
2091  * @ms: current migration state
2092  * @block: block we want to work with
2093  */
2094 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2095 {
2096     postcopy_discard_send_init(ms, block->idstr);
2097
2098     /*
2099      * Ensure that all partially dirty host pages are made fully dirty.
2100      */
2101     postcopy_chunk_hostpages_pass(ms, block);
2102
2103     postcopy_discard_send_finish(ms);
2104     return 0;
2105 }
2106
2107 /**
2108  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2109  *
2110  * Returns zero on success
2111  *
2112  * Transmit the set of pages to be discarded after precopy to the target
2113  * these are pages that:
2114  *     a) Have been previously transmitted but are now dirty again
2115  *     b) Pages that have never been transmitted, this ensures that
2116  *        any pages on the destination that have been mapped by background
2117  *        tasks get discarded (transparent huge pages is the specific concern)
2118  * Hopefully this is pretty sparse
2119  *
2120  * @ms: current migration state
2121  */
2122 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2123 {
2124     RAMState *rs = ram_state;
2125     RAMBlock *block;
2126     int ret;
2127
2128     RCU_READ_LOCK_GUARD();
2129
2130     /* This should be our last sync, the src is now paused */
2131     migration_bitmap_sync(rs);
2132
2133     /* Easiest way to make sure we don't resume in the middle of a host-page */
2134     rs->last_seen_block = NULL;
2135     rs->last_sent_block = NULL;
2136     rs->last_page = 0;
2137
2138     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2139         /* Deal with TPS != HPS and huge pages */
2140         ret = postcopy_chunk_hostpages(ms, block);
2141         if (ret) {
2142             return ret;
2143         }
2144
2145 #ifdef DEBUG_POSTCOPY
2146         ram_debug_dump_bitmap(block->bmap, true,
2147                               block->used_length >> TARGET_PAGE_BITS);
2148 #endif
2149     }
2150     trace_ram_postcopy_send_discard_bitmap();
2151
2152     return postcopy_each_ram_send_discard(ms);
2153 }
2154
2155 /**
2156  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2157  *
2158  * Returns zero on success
2159  *
2160  * @rbname: name of the RAMBlock of the request. NULL means the
2161  *          same that last one.
2162  * @start: RAMBlock starting page
2163  * @length: RAMBlock size
2164  */
2165 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2166 {
2167     trace_ram_discard_range(rbname, start, length);
2168
2169     RCU_READ_LOCK_GUARD();
2170     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2171
2172     if (!rb) {
2173         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2174         return -1;
2175     }
2176
2177     /*
2178      * On source VM, we don't need to update the received bitmap since
2179      * we don't even have one.
2180      */
2181     if (rb->receivedmap) {
2182         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2183                      length >> qemu_target_page_bits());
2184     }
2185
2186     return ram_block_discard_range(rb, start, length);
2187 }
2188
2189 /*
2190  * For every allocation, we will try not to crash the VM if the
2191  * allocation failed.
2192  */
2193 static int xbzrle_init(void)
2194 {
2195     Error *local_err = NULL;
2196
2197     if (!migrate_use_xbzrle()) {
2198         return 0;
2199     }
2200
2201     XBZRLE_cache_lock();
2202
2203     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2204     if (!XBZRLE.zero_target_page) {
2205         error_report("%s: Error allocating zero page", __func__);
2206         goto err_out;
2207     }
2208
2209     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2210                               TARGET_PAGE_SIZE, &local_err);
2211     if (!XBZRLE.cache) {
2212         error_report_err(local_err);
2213         goto free_zero_page;
2214     }
2215
2216     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2217     if (!XBZRLE.encoded_buf) {
2218         error_report("%s: Error allocating encoded_buf", __func__);
2219         goto free_cache;
2220     }
2221
2222     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2223     if (!XBZRLE.current_buf) {
2224         error_report("%s: Error allocating current_buf", __func__);
2225         goto free_encoded_buf;
2226     }
2227
2228     /* We are all good */
2229     XBZRLE_cache_unlock();
2230     return 0;
2231
2232 free_encoded_buf:
2233     g_free(XBZRLE.encoded_buf);
2234     XBZRLE.encoded_buf = NULL;
2235 free_cache:
2236     cache_fini(XBZRLE.cache);
2237     XBZRLE.cache = NULL;
2238 free_zero_page:
2239     g_free(XBZRLE.zero_target_page);
2240     XBZRLE.zero_target_page = NULL;
2241 err_out:
2242     XBZRLE_cache_unlock();
2243     return -ENOMEM;
2244 }
2245
2246 static int ram_state_init(RAMState **rsp)
2247 {
2248     *rsp = g_try_new0(RAMState, 1);
2249
2250     if (!*rsp) {
2251         error_report("%s: Init ramstate fail", __func__);
2252         return -1;
2253     }
2254
2255     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2256     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2257     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2258
2259     /*
2260      * Count the total number of pages used by ram blocks not including any
2261      * gaps due to alignment or unplugs.
2262      * This must match with the initial values of dirty bitmap.
2263      */
2264     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2265     ram_state_reset(*rsp);
2266
2267     return 0;
2268 }
2269
2270 static void ram_list_init_bitmaps(void)
2271 {
2272     MigrationState *ms = migrate_get_current();
2273     RAMBlock *block;
2274     unsigned long pages;
2275     uint8_t shift;
2276
2277     /* Skip setting bitmap if there is no RAM */
2278     if (ram_bytes_total()) {
2279         shift = ms->clear_bitmap_shift;
2280         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2281             error_report("clear_bitmap_shift (%u) too big, using "
2282                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2283             shift = CLEAR_BITMAP_SHIFT_MAX;
2284         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2285             error_report("clear_bitmap_shift (%u) too small, using "
2286                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2287             shift = CLEAR_BITMAP_SHIFT_MIN;
2288         }
2289
2290         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2291             pages = block->max_length >> TARGET_PAGE_BITS;
2292             /*
2293              * The initial dirty bitmap for migration must be set with all
2294              * ones to make sure we'll migrate every guest RAM page to
2295              * destination.
2296              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2297              * new migration after a failed migration, ram_list.
2298              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2299              * guest memory.
2300              */
2301             block->bmap = bitmap_new(pages);
2302             bitmap_set(block->bmap, 0, pages);
2303             block->clear_bmap_shift = shift;
2304             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2305         }
2306     }
2307 }
2308
/*
 * ram_init_bitmaps: set up the migration bitmaps and start dirty logging
 *
 * Calls ram_list_init_bitmaps, starts global dirty logging and does a
 * first precopy bitmap sync, all inside one RCU read-side section.  The
 * iothread lock is taken first and the ramlist lock second; they are
 * released in the reverse order once the section is over.
 *
 * @rs: RAMState handed to the bitmap sync
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        ram_list_init_bitmaps();
        memory_global_dirty_log_start();
        migration_bitmap_sync_precopy(rs);
    }
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
2323
2324 static int ram_init_all(RAMState **rsp)
2325 {
2326     if (ram_state_init(rsp)) {
2327         return -1;
2328     }
2329
2330     if (xbzrle_init()) {
2331         ram_state_cleanup(rsp);
2332         return -1;
2333     }
2334
2335     ram_init_bitmaps(*rsp);
2336
2337     return 0;
2338 }
2339
2340 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2341 {
2342     RAMBlock *block;
2343     uint64_t pages = 0;
2344
2345     /*
2346      * Postcopy is not using xbzrle/compression, so no need for that.
2347      * Also, since source are already halted, we don't need to care
2348      * about dirty page logging as well.
2349      */
2350
2351     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2352         pages += bitmap_count_one(block->bmap,
2353                                   block->used_length >> TARGET_PAGE_BITS);
2354     }
2355
2356     /* This may not be aligned with current bitmaps. Recalculate. */
2357     rs->migration_dirty_pages = pages;
2358
2359     rs->last_seen_block = NULL;
2360     rs->last_sent_block = NULL;
2361     rs->last_page = 0;
2362     rs->last_version = ram_list.version;
2363     /*
2364      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2365      * matter what we have sent.
2366      */
2367     rs->ram_bulk_stage = false;
2368
2369     /* Update RAMState cache of output QEMUFile */
2370     rs->f = out;
2371
2372     trace_ram_state_resume_prepare(pages);
2373 }
2374
2375 /*
2376  * This function clears bits of the free pages reported by the caller from the
2377  * migration dirty bitmap. @addr is the host address corresponding to the
2378  * start of the continuous guest free pages, and @len is the total bytes of
2379  * those pages.
2380  */
2381 void qemu_guest_free_page_hint(void *addr, size_t len)
2382 {
2383     RAMBlock *block;
2384     ram_addr_t offset;
2385     size_t used_len, start, npages;
2386     MigrationState *s = migrate_get_current();
2387
2388     /* This function is currently expected to be used during live migration */
2389     if (!migration_is_setup_or_active(s->state)) {
2390         return;
2391     }
2392
2393     for (; len > 0; len -= used_len, addr += used_len) {
2394         block = qemu_ram_block_from_host(addr, false, &offset);
2395         if (unlikely(!block || offset >= block->used_length)) {
2396             /*
2397              * The implementation might not support RAMBlock resize during
2398              * live migration, but it could happen in theory with future
2399              * updates. So we add a check here to capture that case.
2400              */
2401             error_report_once("%s unexpected error", __func__);
2402             return;
2403         }
2404
2405         if (len <= block->used_length - offset) {
2406             used_len = len;
2407         } else {
2408             used_len = block->used_length - offset;
2409         }
2410
2411         start = offset >> TARGET_PAGE_BITS;
2412         npages = used_len >> TARGET_PAGE_BITS;
2413
2414         qemu_mutex_lock(&ram_state->bitmap_mutex);
2415         ram_state->migration_dirty_pages -=
2416                       bitmap_count_one_with_offset(block->bmap, start, npages);
2417         bitmap_clear(block->bmap, start, npages);
2418         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2419     }
2420 }
2421
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
2428
2429 /**
2430  * ram_save_setup: Setup RAM for migration
2431  *
2432  * Returns zero to indicate success and negative for error
2433  *
2434  * @f: QEMUFile where to send the data
2435  * @opaque: RAMState pointer
2436  */
2437 static int ram_save_setup(QEMUFile *f, void *opaque)
2438 {
2439     RAMState **rsp = opaque;
2440     RAMBlock *block;
2441
2442     if (compress_threads_save_setup()) {
2443         return -1;
2444     }
2445
2446     /* migration has already setup the bitmap, reuse it. */
2447     if (!migration_in_colo_state()) {
2448         if (ram_init_all(rsp) != 0) {
2449             compress_threads_save_cleanup();
2450             return -1;
2451         }
2452     }
2453     (*rsp)->f = f;
2454
2455     WITH_RCU_READ_LOCK_GUARD() {
2456         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2457
2458         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2459             qemu_put_byte(f, strlen(block->idstr));
2460             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2461             qemu_put_be64(f, block->used_length);
2462             if (migrate_postcopy_ram() && block->page_size !=
2463                                           qemu_host_page_size) {
2464                 qemu_put_be64(f, block->page_size);
2465             }
2466             if (migrate_ignore_shared()) {
2467                 qemu_put_be64(f, block->mr->addr);
2468             }
2469         }
2470     }
2471
2472     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2473     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2474
2475     multifd_send_sync_main(f);
2476     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2477     qemu_fflush(f);
2478
2479     return 0;
2480 }
2481
2482 /**
2483  * ram_save_iterate: iterative stage for migration
2484  *
2485  * Returns zero to indicate success and negative for error
2486  *
2487  * @f: QEMUFile where to send the data
2488  * @opaque: RAMState pointer
2489  */
2490 static int ram_save_iterate(QEMUFile *f, void *opaque)
2491 {
2492     RAMState **temp = opaque;
2493     RAMState *rs = *temp;
2494     int ret = 0;
2495     int i;
2496     int64_t t0;
2497     int done = 0;
2498
2499     if (blk_mig_bulk_active()) {
2500         /* Avoid transferring ram during bulk phase of block migration as
2501          * the bulk phase will usually take a long time and transferring
2502          * ram updates during that time is pointless. */
2503         goto out;
2504     }
2505
2506     WITH_RCU_READ_LOCK_GUARD() {
2507         if (ram_list.version != rs->last_version) {
2508             ram_state_reset(rs);
2509         }
2510
2511         /* Read version before ram_list.blocks */
2512         smp_rmb();
2513
2514         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2515
2516         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2517         i = 0;
2518         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2519                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2520             int pages;
2521
2522             if (qemu_file_get_error(f)) {
2523                 break;
2524             }
2525
2526             pages = ram_find_and_save_block(rs, false);
2527             /* no more pages to sent */
2528             if (pages == 0) {
2529                 done = 1;
2530                 break;
2531             }
2532
2533             if (pages < 0) {
2534                 qemu_file_set_error(f, pages);
2535                 break;
2536             }
2537
2538             rs->target_page_count += pages;
2539
2540             /*
2541              * During postcopy, it is necessary to make sure one whole host
2542              * page is sent in one chunk.
2543              */
2544             if (migrate_postcopy_ram()) {
2545                 flush_compressed_data(rs);
2546             }
2547
2548             /*
2549              * we want to check in the 1st loop, just in case it was the 1st
2550              * time and we had to sync the dirty bitmap.
2551              * qemu_clock_get_ns() is a bit expensive, so we only check each
2552              * some iterations
2553              */
2554             if ((i & 63) == 0) {
2555                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2556                               1000000;
2557                 if (t1 > MAX_WAIT) {
2558                     trace_ram_save_iterate_big_wait(t1, i);
2559                     break;
2560                 }
2561             }
2562             i++;
2563         }
2564     }
2565
2566     /*
2567      * Must occur before EOS (or any QEMUFile operation)
2568      * because of RDMA protocol.
2569      */
2570     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2571
2572 out:
2573     if (ret >= 0
2574         && migration_is_setup_or_active(migrate_get_current()->state)) {
2575         multifd_send_sync_main(rs->f);
2576         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2577         qemu_fflush(f);
2578         ram_counters.transferred += 8;
2579
2580         ret = qemu_file_get_error(f);
2581     }
2582     if (ret < 0) {
2583         return ret;
2584     }
2585
2586     return done;
2587 }
2588
2589 /**
2590  * ram_save_complete: function called to send the remaining amount of ram
2591  *
2592  * Returns zero to indicate success or negative on error
2593  *
2594  * Called with iothread lock
2595  *
2596  * @f: QEMUFile where to send the data
2597  * @opaque: RAMState pointer
2598  */
2599 static int ram_save_complete(QEMUFile *f, void *opaque)
2600 {
2601     RAMState **temp = opaque;
2602     RAMState *rs = *temp;
2603     int ret = 0;
2604
2605     WITH_RCU_READ_LOCK_GUARD() {
2606         if (!migration_in_postcopy()) {
2607             migration_bitmap_sync_precopy(rs);
2608         }
2609
2610         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2611
2612         /* try transferring iterative blocks of memory */
2613
2614         /* flush all remaining blocks regardless of rate limiting */
2615         while (true) {
2616             int pages;
2617
2618             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2619             /* no more blocks to sent */
2620             if (pages == 0) {
2621                 break;
2622             }
2623             if (pages < 0) {
2624                 ret = pages;
2625                 break;
2626             }
2627         }
2628
2629         flush_compressed_data(rs);
2630         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2631     }
2632
2633     if (ret >= 0) {
2634         multifd_send_sync_main(rs->f);
2635         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2636         qemu_fflush(f);
2637     }
2638
2639     return ret;
2640 }
2641
2642 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2643                              uint64_t *res_precopy_only,
2644                              uint64_t *res_compatible,
2645                              uint64_t *res_postcopy_only)
2646 {
2647     RAMState **temp = opaque;
2648     RAMState *rs = *temp;
2649     uint64_t remaining_size;
2650
2651     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2652
2653     if (!migration_in_postcopy() &&
2654         remaining_size < max_size) {
2655         qemu_mutex_lock_iothread();
2656         WITH_RCU_READ_LOCK_GUARD() {
2657             migration_bitmap_sync_precopy(rs);
2658         }
2659         qemu_mutex_unlock_iothread();
2660         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2661     }
2662
2663     if (migrate_postcopy_ram()) {
2664         /* We can do postcopy, and all the data is postcopiable */
2665         *res_compatible += remaining_size;
2666     } else {
2667         *res_precopy_only += remaining_size;
2668     }
2669 }
2670
2671 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2672 {
2673     unsigned int xh_len;
2674     int xh_flags;
2675     uint8_t *loaded_data;
2676
2677     /* extract RLE header */
2678     xh_flags = qemu_get_byte(f);
2679     xh_len = qemu_get_be16(f);
2680
2681     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2682         error_report("Failed to load XBZRLE page - wrong compression!");
2683         return -1;
2684     }
2685
2686     if (xh_len > TARGET_PAGE_SIZE) {
2687         error_report("Failed to load XBZRLE page - len overflow!");
2688         return -1;
2689     }
2690     loaded_data = XBZRLE.decoded_buf;
2691     /* load data and decode */
2692     /* it can change loaded_data to point to an internal buffer */
2693     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2694
2695     /* decode RLE */
2696     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2697                              TARGET_PAGE_SIZE) == -1) {
2698         error_report("Failed to load XBZRLE page - decode error!");
2699         return -1;
2700     }
2701
2702     return 0;
2703 }
2704
2705 /**
2706  * ram_block_from_stream: read a RAMBlock id from the migration stream
2707  *
2708  * Must be called from within a rcu critical section.
2709  *
2710  * Returns a pointer from within the RCU-protected ram_list.
2711  *
2712  * @f: QEMUFile where to read the data from
2713  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2714  */
2715 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2716 {
2717     static RAMBlock *block = NULL;
2718     char id[256];
2719     uint8_t len;
2720
2721     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2722         if (!block) {
2723             error_report("Ack, bad migration stream!");
2724             return NULL;
2725         }
2726         return block;
2727     }
2728
2729     len = qemu_get_byte(f);
2730     qemu_get_buffer(f, (uint8_t *)id, len);
2731     id[len] = 0;
2732
2733     block = qemu_ram_block_by_name(id);
2734     if (!block) {
2735         error_report("Can't find block %s", id);
2736         return NULL;
2737     }
2738
2739     if (ramblock_is_ignored(block)) {
2740         error_report("block %s should not be migrated !", id);
2741         return NULL;
2742     }
2743
2744     return block;
2745 }
2746
2747 static inline void *host_from_ram_block_offset(RAMBlock *block,
2748                                                ram_addr_t offset)
2749 {
2750     if (!offset_in_ramblock(block, offset)) {
2751         return NULL;
2752     }
2753
2754     return block->host + offset;
2755 }
2756
2757 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2758                              ram_addr_t offset, bool record_bitmap)
2759 {
2760     if (!offset_in_ramblock(block, offset)) {
2761         return NULL;
2762     }
2763     if (!block->colo_cache) {
2764         error_report("%s: colo_cache is NULL in block :%s",
2765                      __func__, block->idstr);
2766         return NULL;
2767     }
2768
2769     /*
2770     * During colo checkpoint, we need bitmap of these migrated pages.
2771     * It help us to decide which pages in ram cache should be flushed
2772     * into VM's RAM later.
2773     */
2774     if (record_bitmap &&
2775         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2776         ram_state->migration_dirty_pages++;
2777     }
2778     return block->colo_cache + offset;
2779 }
2780
/**
 * ram_handle_compressed: fill a page (or larger range) with a single byte
 *
 * Used for the zero page case: when the fill byte is zero and the range
 * already reads as all zeroes, the memory is left untouched; otherwise the
 * whole range is overwritten with @ch.
 *
 * @host: host address of the range
 * @ch: byte value the range is filled with
 * @size: size of the range in bytes
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Zero fill over memory that is already zero: skip the write. */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }
    memset(host, ch, size);
}
2797
2798 /* return the size after decompression, or negative value on error */
2799 static int
2800 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2801                      const uint8_t *source, size_t source_len)
2802 {
2803     int err;
2804
2805     err = inflateReset(stream);
2806     if (err != Z_OK) {
2807         return -1;
2808     }
2809
2810     stream->avail_in = source_len;
2811     stream->next_in = (uint8_t *)source;
2812     stream->avail_out = dest_len;
2813     stream->next_out = dest;
2814
2815     err = inflate(stream, Z_NO_FLUSH);
2816     if (err != Z_STREAM_END) {
2817         return -1;
2818     }
2819
2820     return stream->total_out;
2821 }
2822
2823 static void *do_data_decompress(void *opaque)
2824 {
2825     DecompressParam *param = opaque;
2826     unsigned long pagesize;
2827     uint8_t *des;
2828     int len, ret;
2829
2830     qemu_mutex_lock(&param->mutex);
2831     while (!param->quit) {
2832         if (param->des) {
2833             des = param->des;
2834             len = param->len;
2835             param->des = 0;
2836             qemu_mutex_unlock(&param->mutex);
2837
2838             pagesize = TARGET_PAGE_SIZE;
2839
2840             ret = qemu_uncompress_data(&param->stream, des, pagesize,
2841                                        param->compbuf, len);
2842             if (ret < 0 && migrate_get_current()->decompress_error_check) {
2843                 error_report("decompress data failed");
2844                 qemu_file_set_error(decomp_file, ret);
2845             }
2846
2847             qemu_mutex_lock(&decomp_done_lock);
2848             param->done = true;
2849             qemu_cond_signal(&decomp_done_cond);
2850             qemu_mutex_unlock(&decomp_done_lock);
2851
2852             qemu_mutex_lock(&param->mutex);
2853         } else {
2854             qemu_cond_wait(&param->cond, &param->mutex);
2855         }
2856     }
2857     qemu_mutex_unlock(&param->mutex);
2858
2859     return NULL;
2860 }
2861
2862 static int wait_for_decompress_done(void)
2863 {
2864     int idx, thread_count;
2865
2866     if (!migrate_use_compression()) {
2867         return 0;
2868     }
2869
2870     thread_count = migrate_decompress_threads();
2871     qemu_mutex_lock(&decomp_done_lock);
2872     for (idx = 0; idx < thread_count; idx++) {
2873         while (!decomp_param[idx].done) {
2874             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2875         }
2876     }
2877     qemu_mutex_unlock(&decomp_done_lock);
2878     return qemu_file_get_error(decomp_file);
2879 }
2880
2881 static void compress_threads_load_cleanup(void)
2882 {
2883     int i, thread_count;
2884
2885     if (!migrate_use_compression()) {
2886         return;
2887     }
2888     thread_count = migrate_decompress_threads();
2889     for (i = 0; i < thread_count; i++) {
2890         /*
2891          * we use it as a indicator which shows if the thread is
2892          * properly init'd or not
2893          */
2894         if (!decomp_param[i].compbuf) {
2895             break;
2896         }
2897
2898         qemu_mutex_lock(&decomp_param[i].mutex);
2899         decomp_param[i].quit = true;
2900         qemu_cond_signal(&decomp_param[i].cond);
2901         qemu_mutex_unlock(&decomp_param[i].mutex);
2902     }
2903     for (i = 0; i < thread_count; i++) {
2904         if (!decomp_param[i].compbuf) {
2905             break;
2906         }
2907
2908         qemu_thread_join(decompress_threads + i);
2909         qemu_mutex_destroy(&decomp_param[i].mutex);
2910         qemu_cond_destroy(&decomp_param[i].cond);
2911         inflateEnd(&decomp_param[i].stream);
2912         g_free(decomp_param[i].compbuf);
2913         decomp_param[i].compbuf = NULL;
2914     }
2915     g_free(decompress_threads);
2916     g_free(decomp_param);
2917     decompress_threads = NULL;
2918     decomp_param = NULL;
2919     decomp_file = NULL;
2920 }
2921
2922 static int compress_threads_load_setup(QEMUFile *f)
2923 {
2924     int i, thread_count;
2925
2926     if (!migrate_use_compression()) {
2927         return 0;
2928     }
2929
2930     thread_count = migrate_decompress_threads();
2931     decompress_threads = g_new0(QemuThread, thread_count);
2932     decomp_param = g_new0(DecompressParam, thread_count);
2933     qemu_mutex_init(&decomp_done_lock);
2934     qemu_cond_init(&decomp_done_cond);
2935     decomp_file = f;
2936     for (i = 0; i < thread_count; i++) {
2937         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2938             goto exit;
2939         }
2940
2941         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2942         qemu_mutex_init(&decomp_param[i].mutex);
2943         qemu_cond_init(&decomp_param[i].cond);
2944         decomp_param[i].done = true;
2945         decomp_param[i].quit = false;
2946         qemu_thread_create(decompress_threads + i, "decompress",
2947                            do_data_decompress, decomp_param + i,
2948                            QEMU_THREAD_JOINABLE);
2949     }
2950     return 0;
2951 exit:
2952     compress_threads_load_cleanup();
2953     return -1;
2954 }
2955
2956 static void decompress_data_with_multi_threads(QEMUFile *f,
2957                                                void *host, int len)
2958 {
2959     int idx, thread_count;
2960
2961     thread_count = migrate_decompress_threads();
2962     qemu_mutex_lock(&decomp_done_lock);
2963     while (true) {
2964         for (idx = 0; idx < thread_count; idx++) {
2965             if (decomp_param[idx].done) {
2966                 decomp_param[idx].done = false;
2967                 qemu_mutex_lock(&decomp_param[idx].mutex);
2968                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2969                 decomp_param[idx].des = host;
2970                 decomp_param[idx].len = len;
2971                 qemu_cond_signal(&decomp_param[idx].cond);
2972                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2973                 break;
2974             }
2975         }
2976         if (idx < thread_count) {
2977             break;
2978         } else {
2979             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2980         }
2981     }
2982     qemu_mutex_unlock(&decomp_done_lock);
2983 }
2984
2985 /*
2986  * colo cache: this is for secondary VM, we cache the whole
2987  * memory of the secondary VM, it is need to hold the global lock
2988  * to call this helper.
2989  */
2990 int colo_init_ram_cache(void)
2991 {
2992     RAMBlock *block;
2993
2994     WITH_RCU_READ_LOCK_GUARD() {
2995         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2996             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
2997                                                     NULL,
2998                                                     false);
2999             if (!block->colo_cache) {
3000                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3001                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3002                              block->used_length);
3003                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3004                     if (block->colo_cache) {
3005                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3006                         block->colo_cache = NULL;
3007                     }
3008                 }
3009                 return -errno;
3010             }
3011         }
3012     }
3013
3014     /*
3015     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3016     * with to decide which page in cache should be flushed into SVM's RAM. Here
3017     * we use the same name 'ram_bitmap' as for migration.
3018     */
3019     if (ram_bytes_total()) {
3020         RAMBlock *block;
3021
3022         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3023             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3024             block->bmap = bitmap_new(pages);
3025         }
3026     }
3027
3028     ram_state_init(&ram_state);
3029     return 0;
3030 }
3031
3032 /* TODO: duplicated with ram_init_bitmaps */
3033 void colo_incoming_start_dirty_log(void)
3034 {
3035     RAMBlock *block = NULL;
3036     /* For memory_global_dirty_log_start below. */
3037     qemu_mutex_lock_iothread();
3038     qemu_mutex_lock_ramlist();
3039
3040     memory_global_dirty_log_sync();
3041     WITH_RCU_READ_LOCK_GUARD() {
3042         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3043             ramblock_sync_dirty_bitmap(ram_state, block);
3044             /* Discard this dirty bitmap record */
3045             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3046         }
3047         memory_global_dirty_log_start();
3048     }
3049     ram_state->migration_dirty_pages = 0;
3050     qemu_mutex_unlock_ramlist();
3051     qemu_mutex_unlock_iothread();
3052 }
3053
3054 /* It is need to hold the global lock to call this helper */
3055 void colo_release_ram_cache(void)
3056 {
3057     RAMBlock *block;
3058
3059     memory_global_dirty_log_stop();
3060     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3061         g_free(block->bmap);
3062         block->bmap = NULL;
3063     }
3064
3065     WITH_RCU_READ_LOCK_GUARD() {
3066         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3067             if (block->colo_cache) {
3068                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3069                 block->colo_cache = NULL;
3070             }
3071         }
3072     }
3073     ram_state_cleanup(&ram_state);
3074 }
3075
3076 /**
3077  * ram_load_setup: Setup RAM for migration incoming side
3078  *
3079  * Returns zero to indicate success and negative for error
3080  *
3081  * @f: QEMUFile where to receive the data
3082  * @opaque: RAMState pointer
3083  */
3084 static int ram_load_setup(QEMUFile *f, void *opaque)
3085 {
3086     if (compress_threads_load_setup(f)) {
3087         return -1;
3088     }
3089
3090     xbzrle_load_setup();
3091     ramblock_recv_map_init();
3092
3093     return 0;
3094 }
3095
3096 static int ram_load_cleanup(void *opaque)
3097 {
3098     RAMBlock *rb;
3099
3100     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3101         qemu_ram_block_writeback(rb);
3102     }
3103
3104     xbzrle_load_cleanup();
3105     compress_threads_load_cleanup();
3106
3107     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3108         g_free(rb->receivedmap);
3109         rb->receivedmap = NULL;
3110     }
3111
3112     return 0;
3113 }
3114
3115 /**
3116  * ram_postcopy_incoming_init: allocate postcopy data structures
3117  *
3118  * Returns 0 for success and negative if there was one error
3119  *
3120  * @mis: current migration incoming state
3121  *
3122  * Allocate data structures etc needed by incoming migration with
3123  * postcopy-ram. postcopy-ram's similarly names
3124  * postcopy_ram_incoming_init does the work.
3125  */
3126 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3127 {
3128     return postcopy_ram_incoming_init(mis);
3129 }
3130
3131 /**
3132  * ram_load_postcopy: load a page in postcopy case
3133  *
3134  * Returns 0 for success or -errno in case of error
3135  *
3136  * Called in postcopy mode by ram_load().
3137  * rcu_read_lock is taken prior to this being called.
3138  *
3139  * @f: QEMUFile where to send the data
3140  */
3141 static int ram_load_postcopy(QEMUFile *f)
3142 {
3143     int flags = 0, ret = 0;
3144     bool place_needed = false;
3145     bool matches_target_page_size = false;
3146     MigrationIncomingState *mis = migration_incoming_get_current();
3147     /* Temporary page that is later 'placed' */
3148     void *postcopy_host_page = mis->postcopy_tmp_page;
3149     void *this_host = NULL;
3150     bool all_zero = false;
3151     int target_pages = 0;
3152
3153     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3154         ram_addr_t addr;
3155         void *host = NULL;
3156         void *page_buffer = NULL;
3157         void *place_source = NULL;
3158         RAMBlock *block = NULL;
3159         uint8_t ch;
3160         int len;
3161
3162         addr = qemu_get_be64(f);
3163
3164         /*
3165          * If qemu file error, we should stop here, and then "addr"
3166          * may be invalid
3167          */
3168         ret = qemu_file_get_error(f);
3169         if (ret) {
3170             break;
3171         }
3172
3173         flags = addr & ~TARGET_PAGE_MASK;
3174         addr &= TARGET_PAGE_MASK;
3175
3176         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3177         place_needed = false;
3178         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3179                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3180             block = ram_block_from_stream(f, flags);
3181
3182             host = host_from_ram_block_offset(block, addr);
3183             if (!host) {
3184                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3185                 ret = -EINVAL;
3186                 break;
3187             }
3188             target_pages++;
3189             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3190             /*
3191              * Postcopy requires that we place whole host pages atomically;
3192              * these may be huge pages for RAMBlocks that are backed by
3193              * hugetlbfs.
3194              * To make it atomic, the data is read into a temporary page
3195              * that's moved into place later.
3196              * The migration protocol uses,  possibly smaller, target-pages
3197              * however the source ensures it always sends all the components
3198              * of a host page in one chunk.
3199              */
3200             page_buffer = postcopy_host_page +
3201                           ((uintptr_t)host & (block->page_size - 1));
3202             /* If all TP are zero then we can optimise the place */
3203             if (target_pages == 1) {
3204                 all_zero = true;
3205                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3206                                                     block->page_size);
3207             } else {
3208                 /* not the 1st TP within the HP */
3209                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3210                     (uintptr_t)this_host) {
3211                     error_report("Non-same host page %p/%p",
3212                                   host, this_host);
3213                     ret = -EINVAL;
3214                     break;
3215                 }
3216             }
3217
3218             /*
3219              * If it's the last part of a host page then we place the host
3220              * page
3221              */
3222             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3223                 place_needed = true;
3224                 target_pages = 0;
3225             }
3226             place_source = postcopy_host_page;
3227         }
3228
3229         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3230         case RAM_SAVE_FLAG_ZERO:
3231             ch = qemu_get_byte(f);
3232             /*
3233              * Can skip to set page_buffer when
3234              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3235              */
3236             if (ch || !matches_target_page_size) {
3237                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3238             }
3239             if (ch) {
3240                 all_zero = false;
3241             }
3242             break;
3243
3244         case RAM_SAVE_FLAG_PAGE:
3245             all_zero = false;
3246             if (!matches_target_page_size) {
3247                 /* For huge pages, we always use temporary buffer */
3248                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3249             } else {
3250                 /*
3251                  * For small pages that matches target page size, we
3252                  * avoid the qemu_file copy.  Instead we directly use
3253                  * the buffer of QEMUFile to place the page.  Note: we
3254                  * cannot do any QEMUFile operation before using that
3255                  * buffer to make sure the buffer is valid when
3256                  * placing the page.
3257                  */
3258                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3259                                          TARGET_PAGE_SIZE);
3260             }
3261             break;
3262         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3263             all_zero = false;
3264             len = qemu_get_be32(f);
3265             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3266                 error_report("Invalid compressed data length: %d", len);
3267                 ret = -EINVAL;
3268                 break;
3269             }
3270             decompress_data_with_multi_threads(f, page_buffer, len);
3271             break;
3272
3273         case RAM_SAVE_FLAG_EOS:
3274             /* normal exit */
3275             multifd_recv_sync_main();
3276             break;
3277         default:
3278             error_report("Unknown combination of migration flags: %#x"
3279                          " (postcopy mode)", flags);
3280             ret = -EINVAL;
3281             break;
3282         }
3283
3284         /* Got the whole host page, wait for decompress before placing. */
3285         if (place_needed) {
3286             ret |= wait_for_decompress_done();
3287         }
3288
3289         /* Detect for any possible file errors */
3290         if (!ret && qemu_file_get_error(f)) {
3291             ret = qemu_file_get_error(f);
3292         }
3293
3294         if (!ret && place_needed) {
3295             /* This gets called at the last target page in the host page */
3296             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3297                                                        block->page_size);
3298
3299             if (all_zero) {
3300                 ret = postcopy_place_page_zero(mis, place_dest,
3301                                                block);
3302             } else {
3303                 ret = postcopy_place_page(mis, place_dest,
3304                                           place_source, block);
3305             }
3306         }
3307     }
3308
3309     return ret;
3310 }
3311
3312 static bool postcopy_is_advised(void)
3313 {
3314     PostcopyState ps = postcopy_state_get();
3315     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3316 }
3317
3318 static bool postcopy_is_running(void)
3319 {
3320     PostcopyState ps = postcopy_state_get();
3321     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3322 }
3323
3324 /*
3325  * Flush content of RAM cache into SVM's memory.
3326  * Only flush the pages that be dirtied by PVM or SVM or both.
3327  */
3328 static void colo_flush_ram_cache(void)
3329 {
3330     RAMBlock *block = NULL;
3331     void *dst_host;
3332     void *src_host;
3333     unsigned long offset = 0;
3334
3335     memory_global_dirty_log_sync();
3336     WITH_RCU_READ_LOCK_GUARD() {
3337         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3338             ramblock_sync_dirty_bitmap(ram_state, block);
3339         }
3340     }
3341
3342     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3343     WITH_RCU_READ_LOCK_GUARD() {
3344         block = QLIST_FIRST_RCU(&ram_list.blocks);
3345
3346         while (block) {
3347             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3348
3349             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3350                 >= block->used_length) {
3351                 offset = 0;
3352                 block = QLIST_NEXT_RCU(block, next);
3353             } else {
3354                 migration_bitmap_clear_dirty(ram_state, block, offset);
3355                 dst_host = block->host
3356                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3357                 src_host = block->colo_cache
3358                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3359                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3360             }
3361         }
3362     }
3363     trace_colo_flush_ram_cache_end();
3364 }
3365
3366 /**
3367  * ram_load_precopy: load pages in precopy case
3368  *
3369  * Returns 0 for success or -errno in case of error
3370  *
3371  * Called in precopy mode by ram_load().
3372  * rcu_read_lock is taken prior to this being called.
3373  *
3374  * @f: QEMUFile where to send the data
3375  */
3376 static int ram_load_precopy(QEMUFile *f)
3377 {
3378     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3379     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3380     bool postcopy_advised = postcopy_is_advised();
3381     if (!migrate_use_compression()) {
3382         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3383     }
3384
3385     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3386         ram_addr_t addr, total_ram_bytes;
3387         void *host = NULL, *host_bak = NULL;
3388         uint8_t ch;
3389
3390         /*
3391          * Yield periodically to let main loop run, but an iteration of
3392          * the main loop is expensive, so do it each some iterations
3393          */
3394         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3395             aio_co_schedule(qemu_get_current_aio_context(),
3396                             qemu_coroutine_self());
3397             qemu_coroutine_yield();
3398         }
3399         i++;
3400
3401         addr = qemu_get_be64(f);
3402         flags = addr & ~TARGET_PAGE_MASK;
3403         addr &= TARGET_PAGE_MASK;
3404
3405         if (flags & invalid_flags) {
3406             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3407                 error_report("Received an unexpected compressed page");
3408             }
3409
3410             ret = -EINVAL;
3411             break;
3412         }
3413
3414         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3415                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3416             RAMBlock *block = ram_block_from_stream(f, flags);
3417
3418             host = host_from_ram_block_offset(block, addr);
3419             /*
3420              * After going into COLO stage, we should not load the page
3421              * into SVM's memory directly, we put them into colo_cache firstly.
3422              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3423              * Previously, we copied all these memory in preparing stage of COLO
3424              * while we need to stop VM, which is a time-consuming process.
3425              * Here we optimize it by a trick, back-up every page while in
3426              * migration process while COLO is enabled, though it affects the
3427              * speed of the migration, but it obviously reduce the downtime of
3428              * back-up all SVM'S memory in COLO preparing stage.
3429              */
3430             if (migration_incoming_colo_enabled()) {
3431                 if (migration_incoming_in_colo_state()) {
3432                     /* In COLO stage, put all pages into cache temporarily */
3433                     host = colo_cache_from_block_offset(block, addr, true);
3434                 } else {
3435                    /*
3436                     * In migration stage but before COLO stage,
3437                     * Put all pages into both cache and SVM's memory.
3438                     */
3439                     host_bak = colo_cache_from_block_offset(block, addr, false);
3440                 }
3441             }
3442             if (!host) {
3443                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3444                 ret = -EINVAL;
3445                 break;
3446             }
3447             if (!migration_incoming_in_colo_state()) {
3448                 ramblock_recv_bitmap_set(block, host);
3449             }
3450
3451             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3452         }
3453
3454         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3455         case RAM_SAVE_FLAG_MEM_SIZE:
3456             /* Synchronize RAM block list */
3457             total_ram_bytes = addr;
3458             while (!ret && total_ram_bytes) {
3459                 RAMBlock *block;
3460                 char id[256];
3461                 ram_addr_t length;
3462
3463                 len = qemu_get_byte(f);
3464                 qemu_get_buffer(f, (uint8_t *)id, len);
3465                 id[len] = 0;
3466                 length = qemu_get_be64(f);
3467
3468                 block = qemu_ram_block_by_name(id);
3469                 if (block && !qemu_ram_is_migratable(block)) {
3470                     error_report("block %s should not be migrated !", id);
3471                     ret = -EINVAL;
3472                 } else if (block) {
3473                     if (length != block->used_length) {
3474                         Error *local_err = NULL;
3475
3476                         ret = qemu_ram_resize(block, length,
3477                                               &local_err);
3478                         if (local_err) {
3479                             error_report_err(local_err);
3480                         }
3481                     }
3482                     /* For postcopy we need to check hugepage sizes match */
3483                     if (postcopy_advised &&
3484                         block->page_size != qemu_host_page_size) {
3485                         uint64_t remote_page_size = qemu_get_be64(f);
3486                         if (remote_page_size != block->page_size) {
3487                             error_report("Mismatched RAM page size %s "
3488                                          "(local) %zd != %" PRId64,
3489                                          id, block->page_size,
3490                                          remote_page_size);
3491                             ret = -EINVAL;
3492                         }
3493                     }
3494                     if (migrate_ignore_shared()) {
3495                         hwaddr addr = qemu_get_be64(f);
3496                         if (ramblock_is_ignored(block) &&
3497                             block->mr->addr != addr) {
3498                             error_report("Mismatched GPAs for block %s "
3499                                          "%" PRId64 "!= %" PRId64,
3500                                          id, (uint64_t)addr,
3501                                          (uint64_t)block->mr->addr);
3502                             ret = -EINVAL;
3503                         }
3504                     }
3505                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3506                                           block->idstr);
3507                 } else {
3508                     error_report("Unknown ramblock \"%s\", cannot "
3509                                  "accept migration", id);
3510                     ret = -EINVAL;
3511                 }
3512
3513                 total_ram_bytes -= length;
3514             }
3515             break;
3516
3517         case RAM_SAVE_FLAG_ZERO:
3518             ch = qemu_get_byte(f);
3519             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3520             break;
3521
3522         case RAM_SAVE_FLAG_PAGE:
3523             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3524             break;
3525
3526         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3527             len = qemu_get_be32(f);
3528             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3529                 error_report("Invalid compressed data length: %d", len);
3530                 ret = -EINVAL;
3531                 break;
3532             }
3533             decompress_data_with_multi_threads(f, host, len);
3534             break;
3535
3536         case RAM_SAVE_FLAG_XBZRLE:
3537             if (load_xbzrle(f, addr, host) < 0) {
3538                 error_report("Failed to decompress XBZRLE page at "
3539                              RAM_ADDR_FMT, addr);
3540                 ret = -EINVAL;
3541                 break;
3542             }
3543             break;
3544         case RAM_SAVE_FLAG_EOS:
3545             /* normal exit */
3546             multifd_recv_sync_main();
3547             break;
3548         default:
3549             if (flags & RAM_SAVE_FLAG_HOOK) {
3550                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3551             } else {
3552                 error_report("Unknown combination of migration flags: %#x",
3553                              flags);
3554                 ret = -EINVAL;
3555             }
3556         }
3557         if (!ret) {
3558             ret = qemu_file_get_error(f);
3559         }
3560         if (!ret && host_bak) {
3561             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3562         }
3563     }
3564
3565     ret |= wait_for_decompress_done();
3566     return ret;
3567 }
3568
3569 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3570 {
3571     int ret = 0;
3572     static uint64_t seq_iter;
3573     /*
3574      * If system is running in postcopy mode, page inserts to host memory must
3575      * be atomic
3576      */
3577     bool postcopy_running = postcopy_is_running();
3578
3579     seq_iter++;
3580
3581     if (version_id != 4) {
3582         return -EINVAL;
3583     }
3584
3585     /*
3586      * This RCU critical section can be very long running.
3587      * When RCU reclaims in the code start to become numerous,
3588      * it will be necessary to reduce the granularity of this
3589      * critical section.
3590      */
3591     WITH_RCU_READ_LOCK_GUARD() {
3592         if (postcopy_running) {
3593             ret = ram_load_postcopy(f);
3594         } else {
3595             ret = ram_load_precopy(f);
3596         }
3597     }
3598     trace_ram_load_complete(ret, seq_iter);
3599
3600     if (!ret  && migration_incoming_in_colo_state()) {
3601         colo_flush_ram_cache();
3602     }
3603     return ret;
3604 }
3605
3606 static bool ram_has_postcopy(void *opaque)
3607 {
3608     RAMBlock *rb;
3609     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3610         if (ramblock_is_pmem(rb)) {
3611             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3612                          "is not supported now!", rb->idstr, rb->host);
3613             return false;
3614         }
3615     }
3616
3617     return migrate_postcopy_ram();
3618 }
3619
3620 /* Sync all the dirty bitmap with destination VM.  */
3621 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3622 {
3623     RAMBlock *block;
3624     QEMUFile *file = s->to_dst_file;
3625     int ramblock_count = 0;
3626
3627     trace_ram_dirty_bitmap_sync_start();
3628
3629     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3630         qemu_savevm_send_recv_bitmap(file, block->idstr);
3631         trace_ram_dirty_bitmap_request(block->idstr);
3632         ramblock_count++;
3633     }
3634
3635     trace_ram_dirty_bitmap_sync_wait();
3636
3637     /* Wait until all the ramblocks' dirty bitmap synced */
3638     while (ramblock_count--) {
3639         qemu_sem_wait(&s->rp_state.rp_sem);
3640     }
3641
3642     trace_ram_dirty_bitmap_sync_complete();
3643
3644     return 0;
3645 }
3646
3647 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3648 {
3649     qemu_sem_post(&s->rp_state.rp_sem);
3650 }
3651
3652 /*
3653  * Read the received bitmap, revert it as the initial dirty bitmap.
3654  * This is only used when the postcopy migration is paused but wants
3655  * to resume from a middle point.
3656  */
3657 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3658 {
3659     int ret = -EINVAL;
3660     QEMUFile *file = s->rp_state.from_dst_file;
3661     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3662     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3663     uint64_t size, end_mark;
3664
3665     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3666
3667     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3668         error_report("%s: incorrect state %s", __func__,
3669                      MigrationStatus_str(s->state));
3670         return -EINVAL;
3671     }
3672
3673     /*
3674      * Note: see comments in ramblock_recv_bitmap_send() on why we
3675      * need the endianess convertion, and the paddings.
3676      */
3677     local_size = ROUND_UP(local_size, 8);
3678
3679     /* Add paddings */
3680     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3681
3682     size = qemu_get_be64(file);
3683
3684     /* The size of the bitmap should match with our ramblock */
3685     if (size != local_size) {
3686         error_report("%s: ramblock '%s' bitmap size mismatch "
3687                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3688                      block->idstr, size, local_size);
3689         ret = -EINVAL;
3690         goto out;
3691     }
3692
3693     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3694     end_mark = qemu_get_be64(file);
3695
3696     ret = qemu_file_get_error(file);
3697     if (ret || size != local_size) {
3698         error_report("%s: read bitmap failed for ramblock '%s': %d"
3699                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3700                      __func__, block->idstr, ret, local_size, size);
3701         ret = -EIO;
3702         goto out;
3703     }
3704
3705     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3706         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3707                      __func__, block->idstr, end_mark);
3708         ret = -EINVAL;
3709         goto out;
3710     }
3711
3712     /*
3713      * Endianess convertion. We are during postcopy (though paused).
3714      * The dirty bitmap won't change. We can directly modify it.
3715      */
3716     bitmap_from_le(block->bmap, le_bitmap, nbits);
3717
3718     /*
3719      * What we received is "received bitmap". Revert it as the initial
3720      * dirty bitmap for this ramblock.
3721      */
3722     bitmap_complement(block->bmap, block->bmap, nbits);
3723
3724     trace_ram_dirty_bitmap_reload_complete(block->idstr);
3725
3726     /*
3727      * We succeeded to sync bitmap for current ramblock. If this is
3728      * the last one to sync, we need to notify the main send thread.
3729      */
3730     ram_dirty_bitmap_reload_notify(s);
3731
3732     ret = 0;
3733 out:
3734     g_free(le_bitmap);
3735     return ret;
3736 }
3737
3738 static int ram_resume_prepare(MigrationState *s, void *opaque)
3739 {
3740     RAMState *rs = *(RAMState **)opaque;
3741     int ret;
3742
3743     ret = ram_dirty_bitmap_sync_all(s, rs);
3744     if (ret) {
3745         return ret;
3746     }
3747
3748     ram_state_resume_prepare(rs, s->to_dst_file);
3749
3750     return 0;
3751 }
3752
3753 static SaveVMHandlers savevm_ram_handlers = {
3754     .save_setup = ram_save_setup,
3755     .save_live_iterate = ram_save_iterate,
3756     .save_live_complete_postcopy = ram_save_complete,
3757     .save_live_complete_precopy = ram_save_complete,
3758     .has_postcopy = ram_has_postcopy,
3759     .save_live_pending = ram_save_pending,
3760     .load_state = ram_load,
3761     .save_cleanup = ram_save_cleanup,
3762     .load_setup = ram_load_setup,
3763     .load_cleanup = ram_load_cleanup,
3764     .resume_prepare = ram_resume_prepare,
3765 };
3766
3767 void ram_mig_init(void)
3768 {
3769     qemu_mutex_init(&XBZRLE.lock);
3770     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
3771 }