migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/madvise.h"
  34 #include "qemu/main-loop.h"
  35 #include "io/channel-null.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "migration/register.h"
  40 #include "migration/misc.h"
  41 #include "qemu-file.h"
  42 #include "postcopy-ram.h"
  43 #include "page_cache.h"
  44 #include "qemu/error-report.h"
  45 #include "qapi/error.h"
  46 #include "qapi/qapi-types-migration.h"
  47 #include "qapi/qapi-events-migration.h"
  48 #include "qapi/qmp/qerror.h"
  49 #include "trace.h"
  50 #include "exec/ram_addr.h"
  51 #include "exec/target_page.h"
  52 #include "qemu/rcu_queue.h"
  53 #include "migration/colo.h"
  54 #include "block.h"
  55 #include "sysemu/cpu-throttle.h"
  56 #include "savevm.h"
  57 #include "qemu/iov.h"
  58 #include "multifd.h"
  59 #include "sysemu/runstate.h"
  60
  61 #include "hw/boards.h" /* for machine_dump_guest_core() */
  62
  63 #if defined(__linux__)
  64 #include "qemu/userfaultfd.h"
  65 #endif /* defined(__linux__) */
  66
  67 /***********************************************************/
  68 /* ram save/restore */
  69
/*
 * Flags carried in the low bits of the 64-bit page offset written by
 * save_page_header().
 *
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
/*
 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, so it can be reused now.
 */
#define RAM_SAVE_FLAG_FULL     0x01
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
/* We can't use any flag that is bigger than 0x200 */
  89
  90 int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
  91      uint8_t *, int) = xbzrle_encode_buffer;
  92 #if defined(CONFIG_AVX512BW_OPT)
  93 #include "qemu/cpuid.h"
  94 static void __attribute__((constructor)) init_cpu_flag(void)
  95 {
  96     unsigned max = __get_cpuid_max(0, NULL);
  97     int a, b, c, d;
  98     if (max >= 1) {
  99         __cpuid(1, a, b, c, d);
 100          /* We must check that AVX is not just available, but usable.  */
 101         if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
 102             int bv;
 103             __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
 104             __cpuid_count(7, 0, a, b, c, d);
 105            /* 0xe6:
 106             *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
 107             *                    and ZMM16-ZMM31 state are enabled by OS)
 108             *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
 109             */
 110             if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
 111                 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
 112             }
 113         }
 114     }
 115 }
 116 #endif
 117
 118 XBZRLECacheStats xbzrle_counters;
 119
 120 /* used by the search for pages to send */
 121 struct PageSearchStatus {
 122     /* The migration channel used for a specific host page */
 123     QEMUFile    *pss_channel;
 124     /* Last block from where we have sent data */
 125     RAMBlock *last_sent_block;
 126     /* Current block being searched */
 127     RAMBlock    *block;
 128     /* Current page to search from */
 129     unsigned long page;
 130     /* Set once we wrap around */
 131     bool         complete_round;
 132     /* Whether we're sending a host page */
 133     bool          host_page_sending;
 134     /* The start/end of current host page.  Invalid if host_page_sending==false */
 135     unsigned long host_page_start;
 136     unsigned long host_page_end;
 137 };
 138 typedef struct PageSearchStatus PageSearchStatus;
 139
/*
 * XBZRLE working state: the page cache plus the scratch buffers used
 * while encoding/decoding, and a static page full of zeros.
 */
static struct {
    /* buffer used for XBZRLE encoding (receives the encoder output) */
    uint8_t *encoded_buf;
    /* buffer for storing page content (private copy of the page to encode) */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken via XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 155
 156 static void XBZRLE_cache_lock(void)
 157 {
 158     if (migrate_use_xbzrle()) {
 159         qemu_mutex_lock(&XBZRLE.lock);
 160     }
 161 }
 162
 163 static void XBZRLE_cache_unlock(void)
 164 {
 165     if (migrate_use_xbzrle()) {
 166         qemu_mutex_unlock(&XBZRLE.lock);
 167     }
 168 }
 169
 170 /**
 171  * xbzrle_cache_resize: resize the xbzrle cache
 172  *
 173  * This function is called from migrate_params_apply in main
 174  * thread, possibly while a migration is in progress.  A running
 175  * migration may be using the cache and might finish during this call,
 176  * hence changes to the cache are protected by XBZRLE.lock().
 177  *
 178  * Returns 0 for success or -1 for error
 179  *
 180  * @new_size: new cache size
 181  * @errp: set *errp if the check failed, with reason
 182  */
 183 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 184 {
 185     PageCache *new_cache;
 186     int64_t ret = 0;
 187
 188     /* Check for truncation */
 189     if (new_size != (size_t)new_size) {
 190         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 191                    "exceeding address space");
 192         return -1;
 193     }
 194
 195     if (new_size == migrate_xbzrle_cache_size()) {
 196         /* nothing to do */
 197         return 0;
 198     }
 199
 200     XBZRLE_cache_lock();
 201
 202     if (XBZRLE.cache != NULL) {
 203         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 204         if (!new_cache) {
 205             ret = -1;
 206             goto out;
 207         }
 208
 209         cache_fini(XBZRLE.cache);
 210         XBZRLE.cache = new_cache;
 211     }
 212 out:
 213     XBZRLE_cache_unlock();
 214     return ret;
 215 }
 216
 217 static bool postcopy_preempt_active(void)
 218 {
 219     return migrate_postcopy_preempt() && migration_in_postcopy();
 220 }
 221
 222 bool ramblock_is_ignored(RAMBlock *block)
 223 {
 224     return !qemu_ram_is_migratable(block) ||
 225            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 226 }
 227
 228 #undef RAMBLOCK_FOREACH
 229
 230 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 231 {
 232     RAMBlock *block;
 233     int ret = 0;
 234
 235     RCU_READ_LOCK_GUARD();
 236
 237     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 238         ret = func(block, opaque);
 239         if (ret) {
 240             break;
 241         }
 242     }
 243     return ret;
 244 }
 245
 246 static void ramblock_recv_map_init(void)
 247 {
 248     RAMBlock *rb;
 249
 250     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 251         assert(!rb->receivedmap);
 252         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 253     }
 254 }
 255
 256 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 257 {
 258     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 259                     rb->receivedmap);
 260 }
 261
 262 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 263 {
 264     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 265 }
 266
 267 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 268 {
 269     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 270 }
 271
 272 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 273                                     size_t nr)
 274 {
 275     bitmap_set_atomic(rb->receivedmap,
 276                       ramblock_recv_bitmap_offset(host_addr, rb),
 277                       nr);
 278 }
 279
 280 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 281
 282 /*
 283  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 284  *
 285  * Returns >0 if success with sent bytes, or <0 if error.
 286  */
 287 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 288                                   const char *block_name)
 289 {
 290     RAMBlock *block = qemu_ram_block_by_name(block_name);
 291     unsigned long *le_bitmap, nbits;
 292     uint64_t size;
 293
 294     if (!block) {
 295         error_report("%s: invalid block name: %s", __func__, block_name);
 296         return -1;
 297     }
 298
 299     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 300
 301     /*
 302      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 303      * machines we may need 4 more bytes for padding (see below
 304      * comment). So extend it a bit before hand.
 305      */
 306     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 307
 308     /*
 309      * Always use little endian when sending the bitmap. This is
 310      * required that when source and destination VMs are not using the
 311      * same endianness. (Note: big endian won't work.)
 312      */
 313     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 314
 315     /* Size of the bitmap, in bytes */
 316     size = DIV_ROUND_UP(nbits, 8);
 317
 318     /*
 319      * size is always aligned to 8 bytes for 64bit machines, but it
 320      * may not be true for 32bit machines. We need this padding to
 321      * make sure the migration can survive even between 32bit and
 322      * 64bit machines.
 323      */
 324     size = ROUND_UP(size, 8);
 325
 326     qemu_put_be64(file, size);
 327     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 328     /*
 329      * Mark as an end, in case the middle part is screwed up due to
 330      * some "mysterious" reason.
 331      */
 332     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 333     qemu_fflush(file);
 334
 335     g_free(le_bitmap);
 336
 337     if (qemu_file_get_error(file)) {
 338         return qemu_file_get_error(file);
 339     }
 340
 341     return size + sizeof(size);
 342 }
 343
/*
 * An outstanding page request, on the source, having been received
 * and queued.  Entries are linked on RAMState.src_page_requests.
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* Start and length of the requested range (presumably relative to rb) */
    hwaddr    offset;
    hwaddr    len;

    /* Queue linkage */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 355
 356 /* State of RAM for migration */
 357 struct RAMState {
 358     /*
 359      * PageSearchStatus structures for the channels when send pages.
 360      * Protected by the bitmap_mutex.
 361      */
 362     PageSearchStatus pss[RAM_CHANNEL_MAX];
 363     /* UFFD file descriptor, used in 'write-tracking' migration */
 364     int uffdio_fd;
 365     /* total ram size in bytes */
 366     uint64_t ram_bytes_total;
 367     /* Last block that we have visited searching for dirty pages */
 368     RAMBlock *last_seen_block;
 369     /* Last dirty target page we have sent */
 370     ram_addr_t last_page;
 371     /* last ram version we have seen */
 372     uint32_t last_version;
 373     /* How many times we have dirty too many pages */
 374     int dirty_rate_high_cnt;
 375     /* these variables are used for bitmap sync */
 376     /* last time we did a full bitmap_sync */
 377     int64_t time_last_bitmap_sync;
 378     /* bytes transferred at start_time */
 379     uint64_t bytes_xfer_prev;
 380     /* number of dirty pages since start_time */
 381     uint64_t num_dirty_pages_period;
 382     /* xbzrle misses since the beginning of the period */
 383     uint64_t xbzrle_cache_miss_prev;
 384     /* Amount of xbzrle pages since the beginning of the period */
 385     uint64_t xbzrle_pages_prev;
 386     /* Amount of xbzrle encoded bytes since the beginning of the period */
 387     uint64_t xbzrle_bytes_prev;
 388     /* Start using XBZRLE (e.g., after the first round). */
 389     bool xbzrle_enabled;
 390     /* Are we on the last stage of migration */
 391     bool last_stage;
 392     /* compression statistics since the beginning of the period */
 393     /* amount of count that no free thread to compress data */
 394     uint64_t compress_thread_busy_prev;
 395     /* amount bytes after compression */
 396     uint64_t compressed_size_prev;
 397     /* amount of compressed pages */
 398     uint64_t compress_pages_prev;
 399
 400     /* total handled target pages at the beginning of period */
 401     uint64_t target_page_count_prev;
 402     /* total handled target pages since start */
 403     uint64_t target_page_count;
 404     /* number of dirty bits in the bitmap */
 405     uint64_t migration_dirty_pages;
 406     /*
 407      * Protects:
 408      * - dirty/clear bitmap
 409      * - migration_dirty_pages
 410      * - pss structures
 411      */
 412     QemuMutex bitmap_mutex;
 413     /* The RAMBlock used in the last src_page_requests */
 414     RAMBlock *last_req_rb;
 415     /* Queue of outstanding page requests from the destination */
 416     QemuMutex src_page_req_mutex;
 417     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 418 };
 419 typedef struct RAMState RAMState;
 420
 421 static RAMState *ram_state;
 422
 423 static NotifierWithReturnList precopy_notifier_list;
 424
 425 /* Whether postcopy has queued requests? */
 426 static bool postcopy_has_request(RAMState *rs)
 427 {
 428     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
 429 }
 430
 431 void precopy_infrastructure_init(void)
 432 {
 433     notifier_with_return_list_init(&precopy_notifier_list);
 434 }
 435
 436 void precopy_add_notifier(NotifierWithReturn *n)
 437 {
 438     notifier_with_return_list_add(&precopy_notifier_list, n);
 439 }
 440
 441 void precopy_remove_notifier(NotifierWithReturn *n)
 442 {
 443     notifier_with_return_remove(n);
 444 }
 445
 446 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 447 {
 448     PrecopyNotifyData pnd;
 449     pnd.reason = reason;
 450     pnd.errp = errp;
 451
 452     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 453 }
 454
 455 uint64_t ram_bytes_remaining(void)
 456 {
 457     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 458                        0;
 459 }
 460
 461 /*
 462  * NOTE: not all stats in ram_counters are used in reality.  See comments
 463  * for struct MigrationAtomicStats.  The ultimate result of ram migration
 464  * counters will be a merged version with both ram_counters and the atomic
 465  * fields in ram_atomic_counters.
 466  */
 467 MigrationStats ram_counters;
 468 MigrationAtomicStats ram_atomic_counters;
 469
 470 void ram_transferred_add(uint64_t bytes)
 471 {
 472     if (runstate_is_running()) {
 473         ram_counters.precopy_bytes += bytes;
 474     } else if (migration_in_postcopy()) {
 475         stat64_add(&ram_atomic_counters.postcopy_bytes, bytes);
 476     } else {
 477         ram_counters.downtime_bytes += bytes;
 478     }
 479     stat64_add(&ram_atomic_counters.transferred, bytes);
 480 }
 481
 482 void dirty_sync_missed_zero_copy(void)
 483 {
 484     ram_counters.dirty_sync_missed_zero_copy++;
 485 }
 486
 487 struct MigrationOps {
 488     int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
 489 };
 490 typedef struct MigrationOps MigrationOps;
 491
 492 MigrationOps *migration_ops;
 493
 494 CompressionStats compression_counters;
 495
 496 struct CompressParam {
 497     bool done;
 498     bool quit;
 499     bool zero_page;
 500     QEMUFile *file;
 501     QemuMutex mutex;
 502     QemuCond cond;
 503     RAMBlock *block;
 504     ram_addr_t offset;
 505
 506     /* internally used fields */
 507     z_stream stream;
 508     uint8_t *originbuf;
 509 };
 510 typedef struct CompressParam CompressParam;
 511
 512 struct DecompressParam {
 513     bool done;
 514     bool quit;
 515     QemuMutex mutex;
 516     QemuCond cond;
 517     void *des;
 518     uint8_t *compbuf;
 519     int len;
 520     z_stream stream;
 521 };
 522 typedef struct DecompressParam DecompressParam;
 523
 524 static CompressParam *comp_param;
 525 static QemuThread *compress_threads;
 526 /* comp_done_cond is used to wake up the migration thread when
 527  * one of the compression threads has finished the compression.
 528  * comp_done_lock is used to co-work with comp_done_cond.
 529  */
 530 static QemuMutex comp_done_lock;
 531 static QemuCond comp_done_cond;
 532
 533 static QEMUFile *decomp_file;
 534 static DecompressParam *decomp_param;
 535 static QemuThread *decompress_threads;
 536 static QemuMutex decomp_done_lock;
 537 static QemuCond decomp_done_cond;
 538
 539 static int ram_save_host_page_urgent(PageSearchStatus *pss);
 540
 541 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 542                                  ram_addr_t offset, uint8_t *source_buf);
 543
 544 /* NOTE: page is the PFN not real ram_addr_t. */
 545 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
 546 {
 547     pss->block = rb;
 548     pss->page = page;
 549     pss->complete_round = false;
 550 }
 551
 552 /*
 553  * Check whether two PSSs are actively sending the same page.  Return true
 554  * if it is, false otherwise.
 555  */
 556 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
 557 {
 558     return pss1->host_page_sending && pss2->host_page_sending &&
 559         (pss1->host_page_start == pss2->host_page_start);
 560 }
 561
 562 static void *do_data_compress(void *opaque)
 563 {
 564     CompressParam *param = opaque;
 565     RAMBlock *block;
 566     ram_addr_t offset;
 567     bool zero_page;
 568
 569     qemu_mutex_lock(&param->mutex);
 570     while (!param->quit) {
 571         if (param->block) {
 572             block = param->block;
 573             offset = param->offset;
 574             param->block = NULL;
 575             qemu_mutex_unlock(&param->mutex);
 576
 577             zero_page = do_compress_ram_page(param->file, &param->stream,
 578                                              block, offset, param->originbuf);
 579
 580             qemu_mutex_lock(&comp_done_lock);
 581             param->done = true;
 582             param->zero_page = zero_page;
 583             qemu_cond_signal(&comp_done_cond);
 584             qemu_mutex_unlock(&comp_done_lock);
 585
 586             qemu_mutex_lock(&param->mutex);
 587         } else {
 588             qemu_cond_wait(&param->cond, &param->mutex);
 589         }
 590     }
 591     qemu_mutex_unlock(&param->mutex);
 592
 593     return NULL;
 594 }
 595
 596 static void compress_threads_save_cleanup(void)
 597 {
 598     int i, thread_count;
 599
 600     if (!migrate_use_compression() || !comp_param) {
 601         return;
 602     }
 603
 604     thread_count = migrate_compress_threads();
 605     for (i = 0; i < thread_count; i++) {
 606         /*
 607          * we use it as a indicator which shows if the thread is
 608          * properly init'd or not
 609          */
 610         if (!comp_param[i].file) {
 611             break;
 612         }
 613
 614         qemu_mutex_lock(&comp_param[i].mutex);
 615         comp_param[i].quit = true;
 616         qemu_cond_signal(&comp_param[i].cond);
 617         qemu_mutex_unlock(&comp_param[i].mutex);
 618
 619         qemu_thread_join(compress_threads + i);
 620         qemu_mutex_destroy(&comp_param[i].mutex);
 621         qemu_cond_destroy(&comp_param[i].cond);
 622         deflateEnd(&comp_param[i].stream);
 623         g_free(comp_param[i].originbuf);
 624         qemu_fclose(comp_param[i].file);
 625         comp_param[i].file = NULL;
 626     }
 627     qemu_mutex_destroy(&comp_done_lock);
 628     qemu_cond_destroy(&comp_done_cond);
 629     g_free(compress_threads);
 630     g_free(comp_param);
 631     compress_threads = NULL;
 632     comp_param = NULL;
 633 }
 634
 635 static int compress_threads_save_setup(void)
 636 {
 637     int i, thread_count;
 638
 639     if (!migrate_use_compression()) {
 640         return 0;
 641     }
 642     thread_count = migrate_compress_threads();
 643     compress_threads = g_new0(QemuThread, thread_count);
 644     comp_param = g_new0(CompressParam, thread_count);
 645     qemu_cond_init(&comp_done_cond);
 646     qemu_mutex_init(&comp_done_lock);
 647     for (i = 0; i < thread_count; i++) {
 648         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 649         if (!comp_param[i].originbuf) {
 650             goto exit;
 651         }
 652
 653         if (deflateInit(&comp_param[i].stream,
 654                         migrate_compress_level()) != Z_OK) {
 655             g_free(comp_param[i].originbuf);
 656             goto exit;
 657         }
 658
 659         /* comp_param[i].file is just used as a dummy buffer to save data,
 660          * set its ops to empty.
 661          */
 662         comp_param[i].file = qemu_file_new_output(
 663             QIO_CHANNEL(qio_channel_null_new()));
 664         comp_param[i].done = true;
 665         comp_param[i].quit = false;
 666         qemu_mutex_init(&comp_param[i].mutex);
 667         qemu_cond_init(&comp_param[i].cond);
 668         qemu_thread_create(compress_threads + i, "compress",
 669                            do_data_compress, comp_param + i,
 670                            QEMU_THREAD_JOINABLE);
 671     }
 672     return 0;
 673
 674 exit:
 675     compress_threads_save_cleanup();
 676     return -1;
 677 }
 678
 679 /**
 680  * save_page_header: write page header to wire
 681  *
 682  * If this is the 1st block, it also writes the block identification
 683  *
 684  * Returns the number of bytes written
 685  *
 686  * @pss: current PSS channel status
 687  * @block: block that contains the page we want to send
 688  * @offset: offset inside the block for the page
 689  *          in the lower bits, it contains flags
 690  */
 691 static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
 692                                ram_addr_t offset)
 693 {
 694     size_t size, len;
 695     bool same_block = (block == pss->last_sent_block);
 696     QEMUFile *f = pss->pss_channel;
 697
 698     if (same_block) {
 699         offset |= RAM_SAVE_FLAG_CONTINUE;
 700     }
 701     qemu_put_be64(f, offset);
 702     size = 8;
 703
 704     if (!same_block) {
 705         len = strlen(block->idstr);
 706         qemu_put_byte(f, len);
 707         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 708         size += 1 + len;
 709         pss->last_sent_block = block;
 710     }
 711     return size;
 712 }
 713
 714 /**
 715  * mig_throttle_guest_down: throttle down the guest
 716  *
 717  * Reduce amount of guest cpu execution to hopefully slow down memory
 718  * writes. If guest dirty memory rate is reduced below the rate at
 719  * which we can transfer pages to the destination then we should be
 720  * able to complete migration. Some workloads dirty memory way too
 721  * fast and will not effectively converge, even with auto-converge.
 722  */
 723 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 724                                     uint64_t bytes_dirty_threshold)
 725 {
 726     MigrationState *s = migrate_get_current();
 727     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 728     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 729     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 730     int pct_max = s->parameters.max_cpu_throttle;
 731
 732     uint64_t throttle_now = cpu_throttle_get_percentage();
 733     uint64_t cpu_now, cpu_ideal, throttle_inc;
 734
 735     /* We have not started throttling yet. Let's start it. */
 736     if (!cpu_throttle_active()) {
 737         cpu_throttle_set(pct_initial);
 738     } else {
 739         /* Throttling already on, just increase the rate */
 740         if (!pct_tailslow) {
 741             throttle_inc = pct_increment;
 742         } else {
 743             /* Compute the ideal CPU percentage used by Guest, which may
 744              * make the dirty rate match the dirty rate threshold. */
 745             cpu_now = 100 - throttle_now;
 746             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 747                         bytes_dirty_period);
 748             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 749         }
 750         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 751     }
 752 }
 753
 754 void mig_throttle_counter_reset(void)
 755 {
 756     RAMState *rs = ram_state;
 757
 758     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 759     rs->num_dirty_pages_period = 0;
 760     rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
 761 }
 762
 763 /**
 764  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 765  *
 766  * @rs: current RAM state
 767  * @current_addr: address for the zero page
 768  *
 769  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 770  * The important thing is that a stale (not-yet-0'd) page be replaced
 771  * by the new data.
 772  * As a bonus, if the page wasn't in the cache it gets added so that
 773  * when a small write is made into the 0'd page it gets XBZRLE sent.
 774  */
 775 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 776 {
 777     /* We don't care if this fails to allocate a new cache page
 778      * as long as it updated an old one */
 779     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 780                  ram_counters.dirty_sync_count);
 781 }
 782
 783 #define ENCODING_FLAG_XBZRLE 0x1
 784
 785 /**
 786  * save_xbzrle_page: compress and send current page
 787  *
 788  * Returns: 1 means that we wrote the page
 789  *          0 means that page is identical to the one already sent
 790  *          -1 means that xbzrle would be longer than normal
 791  *
 792  * @rs: current RAM state
 793  * @pss: current PSS channel
 794  * @current_data: pointer to the address of the page contents
 795  * @current_addr: addr of the page
 796  * @block: block that contains the page we want to send
 797  * @offset: offset inside the block for the page
 798  */
 799 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
 800                             uint8_t **current_data, ram_addr_t current_addr,
 801                             RAMBlock *block, ram_addr_t offset)
 802 {
 803     int encoded_len = 0, bytes_xbzrle;
 804     uint8_t *prev_cached_page;
 805     QEMUFile *file = pss->pss_channel;
 806
 807     if (!cache_is_cached(XBZRLE.cache, current_addr,
 808                          ram_counters.dirty_sync_count)) {
 809         xbzrle_counters.cache_miss++;
 810         if (!rs->last_stage) {
 811             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 812                              ram_counters.dirty_sync_count) == -1) {
 813                 return -1;
 814             } else {
 815                 /* update *current_data when the page has been
 816                    inserted into cache */
 817                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 818             }
 819         }
 820         return -1;
 821     }
 822
 823     /*
 824      * Reaching here means the page has hit the xbzrle cache, no matter what
 825      * encoding result it is (normal encoding, overflow or skipping the page),
 826      * count the page as encoded. This is used to calculate the encoding rate.
 827      *
 828      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 829      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 830      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 831      * skipped page included. In this way, the encoding rate can tell if the
 832      * guest page is good for xbzrle encoding.
 833      */
 834     xbzrle_counters.pages++;
 835     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 836
 837     /* save current buffer into memory */
 838     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 839
 840     /* XBZRLE encoding (if there is no overflow) */
 841     encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
 842                                             TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 843                                             TARGET_PAGE_SIZE);
 844
 845     /*
 846      * Update the cache contents, so that it corresponds to the data
 847      * sent, in all cases except where we skip the page.
 848      */
 849     if (!rs->last_stage && encoded_len != 0) {
 850         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 851         /*
 852          * In the case where we couldn't compress, ensure that the caller
 853          * sends the data from the cache, since the guest might have
 854          * changed the RAM since we copied it.
 855          */
 856         *current_data = prev_cached_page;
 857     }
 858
 859     if (encoded_len == 0) {
 860         trace_save_xbzrle_page_skipping();
 861         return 0;
 862     } else if (encoded_len == -1) {
 863         trace_save_xbzrle_page_overflow();
 864         xbzrle_counters.overflow++;
 865         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 866         return -1;
 867     }
 868
 869     /* Send XBZRLE based compressed page */
 870     bytes_xbzrle = save_page_header(pss, block,
 871                                     offset | RAM_SAVE_FLAG_XBZRLE);
 872     qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
 873     qemu_put_be16(file, encoded_len);
 874     qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
 875     bytes_xbzrle += encoded_len + 1 + 2;
 876     /*
 877      * Like compressed_size (please see update_compress_thread_counts),
 878      * the xbzrle encoded bytes don't count the 8 byte header with
 879      * RAM_SAVE_FLAG_CONTINUE.
 880      */
 881     xbzrle_counters.bytes += bytes_xbzrle - 8;
 882     ram_transferred_add(bytes_xbzrle);
 883
 884     return 1;
 885 }
 886
 887 /**
 888  * pss_find_next_dirty: find the next dirty page of current ramblock
 889  *
 890  * This function updates pss->page to point to the next dirty page index
 891  * within the ramblock to migrate, or the end of ramblock when nothing
 892  * found.  Note that when pss->host_page_sending==true it means we're
 893  * during sending a host page, so we won't look for dirty page that is
 894  * outside the host page boundary.
 895  *
 896  * @pss: the current page search status
 897  */
 898 static void pss_find_next_dirty(PageSearchStatus *pss)
 899 {
 900     RAMBlock *rb = pss->block;
 901     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 902     unsigned long *bitmap = rb->bmap;
 903
 904     if (ramblock_is_ignored(rb)) {
 905         /* Points directly to the end, so we know no dirty page */
 906         pss->page = size;
 907         return;
 908     }
 909
 910     /*
 911      * If during sending a host page, only look for dirty pages within the
 912      * current host page being send.
 913      */
 914     if (pss->host_page_sending) {
 915         assert(pss->host_page_end);
 916         size = MIN(size, pss->host_page_end);
 917     }
 918
 919     pss->page = find_next_bit(bitmap, size, pss->page);
 920 }
 921
 922 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
 923                                                        unsigned long page)
 924 {
 925     uint8_t shift;
 926     hwaddr size, start;
 927
 928     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 929         return;
 930     }
 931
 932     shift = rb->clear_bmap_shift;
 933     /*
 934      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 935      * can make things easier sometimes since then start address
 936      * of the small chunk will always be 64 pages aligned so the
 937      * bitmap will always be aligned to unsigned long. We should
 938      * even be able to remove this restriction but I'm simply
 939      * keeping it.
 940      */
 941     assert(shift >= 6);
 942
 943     size = 1ULL << (TARGET_PAGE_BITS + shift);
 944     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
 945     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 946     memory_region_clear_dirty_bitmap(rb->mr, start, size);
 947 }
 948
 949 static void
 950 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
 951                                                  unsigned long start,
 952                                                  unsigned long npages)
 953 {
 954     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 955     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 956     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 957
 958     /*
 959      * Clear pages from start to start + npages - 1, so the end boundary is
 960      * exclusive.
 961      */
 962     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 963         migration_clear_memory_region_dirty_bitmap(rb, i);
 964     }
 965 }
 966
 967 /*
 968  * colo_bitmap_find_diry:find contiguous dirty pages from start
 969  *
 970  * Returns the page offset within memory region of the start of the contiguout
 971  * dirty page
 972  *
 973  * @rs: current RAM state
 974  * @rb: RAMBlock where to search for dirty pages
 975  * @start: page where we start the search
 976  * @num: the number of contiguous dirty pages
 977  */
 978 static inline
 979 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 980                                      unsigned long start, unsigned long *num)
 981 {
 982     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 983     unsigned long *bitmap = rb->bmap;
 984     unsigned long first, next;
 985
 986     *num = 0;
 987
 988     if (ramblock_is_ignored(rb)) {
 989         return size;
 990     }
 991
 992     first = find_next_bit(bitmap, size, start);
 993     if (first >= size) {
 994         return first;
 995     }
 996     next = find_next_zero_bit(bitmap, size, first + 1);
 997     assert(next >= first);
 998     *num = next - first;
 999     return first;
1000 }
1001
1002 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1003                                                 RAMBlock *rb,
1004                                                 unsigned long page)
1005 {
1006     bool ret;
1007
1008     /*
1009      * Clear dirty bitmap if needed.  This _must_ be called before we
1010      * send any of the page in the chunk because we need to make sure
1011      * we can capture further page content changes when we sync dirty
1012      * log the next time.  So as long as we are going to send any of
1013      * the page in the chunk we clear the remote dirty bitmap for all.
1014      * Clearing it earlier won't be a problem, but too late will.
1015      */
1016     migration_clear_memory_region_dirty_bitmap(rb, page);
1017
1018     ret = test_and_clear_bit(page, rb->bmap);
1019     if (ret) {
1020         rs->migration_dirty_pages--;
1021     }
1022
1023     return ret;
1024 }
1025
1026 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1027                                        void *opaque)
1028 {
1029     const hwaddr offset = section->offset_within_region;
1030     const hwaddr size = int128_get64(section->size);
1031     const unsigned long start = offset >> TARGET_PAGE_BITS;
1032     const unsigned long npages = size >> TARGET_PAGE_BITS;
1033     RAMBlock *rb = section->mr->ram_block;
1034     uint64_t *cleared_bits = opaque;
1035
1036     /*
1037      * We don't grab ram_state->bitmap_mutex because we expect to run
1038      * only when starting migration or during postcopy recovery where
1039      * we don't have concurrent access.
1040      */
1041     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1042         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1043     }
1044     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1045     bitmap_clear(rb->bmap, start, npages);
1046 }
1047
1048 /*
1049  * Exclude all dirty pages from migration that fall into a discarded range as
1050  * managed by a RamDiscardManager responsible for the mapped memory region of
1051  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1052  *
1053  * Discarded pages ("logically unplugged") have undefined content and must
1054  * not get migrated, because even reading these pages for migration might
1055  * result in undesired behavior.
1056  *
1057  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1058  *
1059  * Note: The result is only stable while migrating (precopy/postcopy).
1060  */
1061 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1062 {
1063     uint64_t cleared_bits = 0;
1064
1065     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1066         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1067         MemoryRegionSection section = {
1068             .mr = rb->mr,
1069             .offset_within_region = 0,
1070             .size = int128_make64(qemu_ram_get_used_length(rb)),
1071         };
1072
1073         ram_discard_manager_replay_discarded(rdm, &section,
1074                                              dirty_bitmap_clear_section,
1075                                              &cleared_bits);
1076     }
1077     return cleared_bits;
1078 }
1079
1080 /*
1081  * Check if a host-page aligned page falls into a discarded range as managed by
1082  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1083  *
1084  * Note: The result is only stable while migrating (precopy/postcopy).
1085  */
1086 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1087 {
1088     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1089         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1090         MemoryRegionSection section = {
1091             .mr = rb->mr,
1092             .offset_within_region = start,
1093             .size = int128_make64(qemu_ram_pagesize(rb)),
1094         };
1095
1096         return !ram_discard_manager_is_populated(rdm, &section);
1097     }
1098     return false;
1099 }
1100
1101 /* Called with RCU critical section */
1102 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1103 {
1104     uint64_t new_dirty_pages =
1105         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1106
1107     rs->migration_dirty_pages += new_dirty_pages;
1108     rs->num_dirty_pages_period += new_dirty_pages;
1109 }
1110
1111 /**
1112  * ram_pagesize_summary: calculate all the pagesizes of a VM
1113  *
1114  * Returns a summary bitmap of the page sizes of all RAMBlocks
1115  *
1116  * For VMs with just normal pages this is equivalent to the host page
1117  * size. If it's got some huge pages then it's the OR of all the
1118  * different page sizes.
1119  */
1120 uint64_t ram_pagesize_summary(void)
1121 {
1122     RAMBlock *block;
1123     uint64_t summary = 0;
1124
1125     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1126         summary |= block->page_size;
1127     }
1128
1129     return summary;
1130 }
1131
1132 uint64_t ram_get_total_transferred_pages(void)
1133 {
1134     return  stat64_get(&ram_atomic_counters.normal) +
1135         stat64_get(&ram_atomic_counters.duplicate) +
1136         compression_counters.pages + xbzrle_counters.pages;
1137 }
1138
1139 static void migration_update_rates(RAMState *rs, int64_t end_time)
1140 {
1141     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1142     double compressed_size;
1143
1144     /* calculate period counters */
1145     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1146                 / (end_time - rs->time_last_bitmap_sync);
1147
1148     if (!page_count) {
1149         return;
1150     }
1151
1152     if (migrate_use_xbzrle()) {
1153         double encoded_size, unencoded_size;
1154
1155         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1156             rs->xbzrle_cache_miss_prev) / page_count;
1157         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1158         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1159                          TARGET_PAGE_SIZE;
1160         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1161         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1162             xbzrle_counters.encoding_rate = 0;
1163         } else {
1164             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1165         }
1166         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1167         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1168     }
1169
1170     if (migrate_use_compression()) {
1171         compression_counters.busy_rate = (double)(compression_counters.busy -
1172             rs->compress_thread_busy_prev) / page_count;
1173         rs->compress_thread_busy_prev = compression_counters.busy;
1174
1175         compressed_size = compression_counters.compressed_size -
1176                           rs->compressed_size_prev;
1177         if (compressed_size) {
1178             double uncompressed_size = (compression_counters.pages -
1179                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1180
1181             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1182             compression_counters.compression_rate =
1183                                         uncompressed_size / compressed_size;
1184
1185             rs->compress_pages_prev = compression_counters.pages;
1186             rs->compressed_size_prev = compression_counters.compressed_size;
1187         }
1188     }
1189 }
1190
1191 static void migration_trigger_throttle(RAMState *rs)
1192 {
1193     MigrationState *s = migrate_get_current();
1194     uint64_t threshold = s->parameters.throttle_trigger_threshold;
1195     uint64_t bytes_xfer_period =
1196         stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev;
1197     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1198     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1199
1200     /* During block migration the auto-converge logic incorrectly detects
1201      * that ram migration makes no progress. Avoid this by disabling the
1202      * throttling logic during the bulk phase of block migration. */
1203     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1204         /* The following detection logic can be refined later. For now:
1205            Check to see if the ratio between dirtied bytes and the approx.
1206            amount of bytes that just got transferred since the last time
1207            we were in this routine reaches the threshold. If that happens
1208            twice, start or increase throttling. */
1209
1210         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1211             (++rs->dirty_rate_high_cnt >= 2)) {
1212             trace_migration_throttle();
1213             rs->dirty_rate_high_cnt = 0;
1214             mig_throttle_guest_down(bytes_dirty_period,
1215                                     bytes_dirty_threshold);
1216         }
1217     }
1218 }
1219
/*
 * Run one dirty bitmap synchronization pass.
 *
 * Bumps the sync counter, syncs the global dirty log, then syncs the dirty
 * bitmap of every non-ignored RAMBlock while holding rs->bitmap_mutex and
 * the RCU read lock.  If more than one second has passed since the last
 * period reset, the throttling check and the rate statistics are run and
 * the period counters are reset.
 *
 * @rs: current RAM state
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    /* First pass: start the measurement period now */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}
1265
1266 static void migration_bitmap_sync_precopy(RAMState *rs)
1267 {
1268     Error *local_err = NULL;
1269
1270     /*
1271      * The current notifier usage is just an optimization to migration, so we
1272      * don't stop the normal migration process in the error case.
1273      */
1274     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1275         error_report_err(local_err);
1276         local_err = NULL;
1277     }
1278
1279     migration_bitmap_sync(rs);
1280
1281     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1282         error_report_err(local_err);
1283     }
1284 }
1285
1286 void ram_release_page(const char *rbname, uint64_t offset)
1287 {
1288     if (!migrate_release_ram() || !migration_in_postcopy()) {
1289         return;
1290     }
1291
1292     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1293 }
1294
1295 /**
1296  * save_zero_page_to_file: send the zero page to the file
1297  *
1298  * Returns the size of data written to the file, 0 means the page is not
1299  * a zero page
1300  *
1301  * @pss: current PSS channel
1302  * @block: block that contains the page we want to send
1303  * @offset: offset inside the block for the page
1304  */
1305 static int save_zero_page_to_file(PageSearchStatus *pss,
1306                                   RAMBlock *block, ram_addr_t offset)
1307 {
1308     uint8_t *p = block->host + offset;
1309     QEMUFile *file = pss->pss_channel;
1310     int len = 0;
1311
1312     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1313         len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
1314         qemu_put_byte(file, 0);
1315         len += 1;
1316         ram_release_page(block->idstr, offset);
1317     }
1318     return len;
1319 }
1320
1321 /**
1322  * save_zero_page: send the zero page to the stream
1323  *
1324  * Returns the number of pages written.
1325  *
1326  * @pss: current PSS channel
1327  * @block: block that contains the page we want to send
1328  * @offset: offset inside the block for the page
1329  */
1330 static int save_zero_page(PageSearchStatus *pss, RAMBlock *block,
1331                           ram_addr_t offset)
1332 {
1333     int len = save_zero_page_to_file(pss, block, offset);
1334
1335     if (len) {
1336         stat64_add(&ram_atomic_counters.duplicate, 1);
1337         ram_transferred_add(len);
1338         return 1;
1339     }
1340     return -1;
1341 }
1342
1343 /*
1344  * @pages: the number of pages written by the control path,
1345  *        < 0 - error
1346  *        > 0 - number of pages written
1347  *
1348  * Return true if the pages has been saved, otherwise false is returned.
1349  */
1350 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1351                               ram_addr_t offset, int *pages)
1352 {
1353     uint64_t bytes_xmit = 0;
1354     int ret;
1355
1356     *pages = -1;
1357     ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1358                                 TARGET_PAGE_SIZE, &bytes_xmit);
1359     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1360         return false;
1361     }
1362
1363     if (bytes_xmit) {
1364         ram_transferred_add(bytes_xmit);
1365         *pages = 1;
1366     }
1367
1368     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1369         return true;
1370     }
1371
1372     if (bytes_xmit > 0) {
1373         stat64_add(&ram_atomic_counters.normal, 1);
1374     } else if (bytes_xmit == 0) {
1375         stat64_add(&ram_atomic_counters.duplicate, 1);
1376     }
1377
1378     return true;
1379 }
1380
1381 /*
1382  * directly send the page to the stream
1383  *
1384  * Returns the number of pages written.
1385  *
1386  * @pss: current PSS channel
1387  * @block: block that contains the page we want to send
1388  * @offset: offset inside the block for the page
1389  * @buf: the page to be sent
1390  * @async: send to page asyncly
1391  */
1392 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1393                             ram_addr_t offset, uint8_t *buf, bool async)
1394 {
1395     QEMUFile *file = pss->pss_channel;
1396
1397     ram_transferred_add(save_page_header(pss, block,
1398                                          offset | RAM_SAVE_FLAG_PAGE));
1399     if (async) {
1400         qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1401                               migrate_release_ram() &&
1402                               migration_in_postcopy());
1403     } else {
1404         qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1405     }
1406     ram_transferred_add(TARGET_PAGE_SIZE);
1407     stat64_add(&ram_atomic_counters.normal, 1);
1408     return 1;
1409 }
1410
1411 /**
1412  * ram_save_page: send the given page to the stream
1413  *
1414  * Returns the number of pages written.
1415  *          < 0 - error
1416  *          >=0 - Number of pages written - this might legally be 0
1417  *                if xbzrle noticed the page was the same.
1418  *
1419  * @rs: current RAM state
1420  * @block: block that contains the page we want to send
1421  * @offset: offset inside the block for the page
1422  */
1423 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1424 {
1425     int pages = -1;
1426     uint8_t *p;
1427     bool send_async = true;
1428     RAMBlock *block = pss->block;
1429     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1430     ram_addr_t current_addr = block->offset + offset;
1431
1432     p = block->host + offset;
1433     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1434
1435     XBZRLE_cache_lock();
1436     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1437         pages = save_xbzrle_page(rs, pss, &p, current_addr,
1438                                  block, offset);
1439         if (!rs->last_stage) {
1440             /* Can't send this cached data async, since the cache page
1441              * might get updated before it gets to the wire
1442              */
1443             send_async = false;
1444         }
1445     }
1446
1447     /* XBZRLE overflow or normal page */
1448     if (pages == -1) {
1449         pages = save_normal_page(pss, block, offset, p, send_async);
1450     }
1451
1452     XBZRLE_cache_unlock();
1453
1454     return pages;
1455 }
1456
1457 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1458                                  ram_addr_t offset)
1459 {
1460     if (multifd_queue_page(file, block, offset) < 0) {
1461         return -1;
1462     }
1463     stat64_add(&ram_atomic_counters.normal, 1);
1464
1465     return 1;
1466 }
1467
1468 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1469                                  ram_addr_t offset, uint8_t *source_buf)
1470 {
1471     RAMState *rs = ram_state;
1472     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1473     uint8_t *p = block->host + offset;
1474     int ret;
1475
1476     if (save_zero_page_to_file(pss, block, offset)) {
1477         return true;
1478     }
1479
1480     save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1481
1482     /*
1483      * copy it to a internal buffer to avoid it being modified by VM
1484      * so that we can catch up the error during compression and
1485      * decompression
1486      */
1487     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1488     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1489     if (ret < 0) {
1490         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1491         error_report("compressed data failed!");
1492     }
1493     return false;
1494 }
1495
1496 static void
1497 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1498 {
1499     ram_transferred_add(bytes_xmit);
1500
1501     if (param->zero_page) {
1502         stat64_add(&ram_atomic_counters.duplicate, 1);
1503         return;
1504     }
1505
1506     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1507     compression_counters.compressed_size += bytes_xmit - 8;
1508     compression_counters.pages++;
1509 }
1510
1511 static bool save_page_use_compression(RAMState *rs);
1512
/*
 * Wait until every compression thread is done, then move the data buffered
 * in each thread's file into the migration stream (ms->to_dst_file) and
 * update the compression counters.  Does nothing when
 * save_page_use_compression() returns false.
 *
 * @rs: current RAM state
 */
static void flush_compressed_data(RAMState *rs)
{
    MigrationState *ms = migrate_get_current();
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    /* Phase 1: wait for all threads to report done */
    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    /* Phase 2: drain each thread's buffer, skipping threads told to quit */
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e, the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}
1545
1546 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1547                                        ram_addr_t offset)
1548 {
1549     param->block = block;
1550     param->offset = offset;
1551 }
1552
/*
 * Hand a page over to an idle compression thread.
 *
 * Looks, under comp_done_lock, for a thread whose 'done' flag is set.  The
 * data still buffered by that thread is first moved into the migration
 * stream, then the thread is given the new page and signalled.
 *
 * Returns 1 if the page was handed to a thread, -1 if no thread was idle
 * and migrate_compress_wait_thread() is false.
 *
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();
    MigrationState *ms = migrate_get_current();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            /* Flush what the thread produced for its previous page */
            bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
                                            comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
1589
1590 #define PAGE_ALL_CLEAN 0
1591 #define PAGE_TRY_AGAIN 1
1592 #define PAGE_DIRTY_FOUND 2
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * When the current block has no more dirty pages, the search moves on to
 * the next block; on wrapping around the end of the block list, pending
 * compressed data is flushed, pss->complete_round is set and XBZRLE is
 * enabled if it is in use.
 *
 * Returns:
 *         PAGE_ALL_CLEAN: no dirty page found, give up
 *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
 *         PAGE_DIRTY_FOUND: dirty page found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
{
    /* Update pss->page for the next dirty bit in ramblock */
    pss_find_next_dirty(pss);

    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        return PAGE_ALL_CLEAN;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_use_xbzrle()) {
                rs->xbzrle_enabled = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        return PAGE_TRY_AGAIN;
    } else {
        /* We've found something */
        return PAGE_DIRTY_FOUND;
    }
}
1652
/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Each call hands out one target page of the request at the head of the
 * queue; the request is removed and freed once its last page is handed out.
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    /* Check without the lock first, so the no-request case stays cheap */
    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;

    if (entry->len > TARGET_PAGE_SIZE) {
        /* More pages left in this request: advance it by one target page */
        entry->len -= TARGET_PAGE_SIZE;
        entry->offset += TARGET_PAGE_SIZE;
    } else {
        /* Last page of the request: drop the request from the queue */
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    return block;
}
1696
1697 #if defined(__linux__)
1698 /**
1699  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1700  *   is found, return RAM block pointer and page offset
1701  *
1702  * Returns pointer to the RAMBlock containing faulting page,
1703  *   NULL if no write faults are pending
1704  *
1705  * @rs: current RAM state
1706  * @offset: page offset from the beginning of the block
1707  */
1708 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1709 {
1710     struct uffd_msg uffd_msg;
1711     void *page_address;
1712     RAMBlock *block;
1713     int res;
1714
1715     if (!migrate_background_snapshot()) {
1716         return NULL;
1717     }
1718
1719     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1720     if (res <= 0) {
1721         return NULL;
1722     }
1723
1724     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1725     block = qemu_ram_block_from_host(page_address, false, offset);
1726     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1727     return block;
1728 }
1729
1730 /**
1731  * ram_save_release_protection: release UFFD write protection after
1732  *   a range of pages has been saved
1733  *
1734  * @rs: current RAM state
1735  * @pss: page-search-status structure
1736  * @start_page: index of the first page in the range relative to pss->block
1737  *
1738  * Returns 0 on success, negative value in case of an error
1739 */
1740 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1741         unsigned long start_page)
1742 {
1743     int res = 0;
1744
1745     /* Check if page is from UFFD-managed region. */
1746     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1747         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1748         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1749
1750         /* Flush async buffers before un-protect. */
1751         qemu_fflush(pss->pss_channel);
1752         /* Un-protect memory range. */
1753         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1754                 false, false);
1755     }
1756
1757     return res;
1758 }
1759
1760 /* ram_write_tracking_available: check if kernel supports required UFFD features
1761  *
1762  * Returns true if supports, false otherwise
1763  */
1764 bool ram_write_tracking_available(void)
1765 {
1766     uint64_t uffd_features;
1767     int res;
1768
1769     res = uffd_query_features(&uffd_features);
1770     return (res == 0 &&
1771             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1772 }
1773
1774 /* ram_write_tracking_compatible: check if guest configuration is
1775  *   compatible with 'write-tracking'
1776  *
1777  * Returns true if compatible, false otherwise
1778  */
1779 bool ram_write_tracking_compatible(void)
1780 {
1781     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1782     int uffd_fd;
1783     RAMBlock *block;
1784     bool ret = false;
1785
1786     /* Open UFFD file descriptor */
1787     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1788     if (uffd_fd < 0) {
1789         return false;
1790     }
1791
1792     RCU_READ_LOCK_GUARD();
1793
1794     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1795         uint64_t uffd_ioctls;
1796
1797         /* Nothing to do with read-only and MMIO-writable regions */
1798         if (block->mr->readonly || block->mr->rom_device) {
1799             continue;
1800         }
1801         /* Try to register block memory via UFFD-IO to track writes */
1802         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1803                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1804             goto out;
1805         }
1806         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1807             goto out;
1808         }
1809     }
1810     ret = true;
1811
1812 out:
1813     uffd_close_fd(uffd_fd);
1814     return ret;
1815 }
1816
1817 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1818                                        ram_addr_t size)
1819 {
1820     const ram_addr_t end = offset + size;
1821
1822     /*
1823      * We read one byte of each page; this will preallocate page tables if
1824      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1825      * where no page was populated yet. This might require adaption when
1826      * supporting other mappings, like shmem.
1827      */
1828     for (; offset < end; offset += block->page_size) {
1829         char tmp = *((char *)block->host + offset);
1830
1831         /* Don't optimize the read out */
1832         asm volatile("" : "+r" (tmp));
1833     }
1834 }
1835
1836 static inline int populate_read_section(MemoryRegionSection *section,
1837                                         void *opaque)
1838 {
1839     const hwaddr size = int128_get64(section->size);
1840     hwaddr offset = section->offset_within_region;
1841     RAMBlock *block = section->mr->ram_block;
1842
1843     populate_read_range(block, offset, size);
1844     return 0;
1845 }
1846
1847 /*
1848  * ram_block_populate_read: preallocate page tables and populate pages in the
1849  *   RAM block by reading a byte of each page.
1850  *
1851  * Since it's solely used for userfault_fd WP feature, here we just
1852  *   hardcode page size to qemu_real_host_page_size.
1853  *
1854  * @block: RAM block to populate
1855  */
1856 static void ram_block_populate_read(RAMBlock *rb)
1857 {
1858     /*
1859      * Skip populating all pages that fall into a discarded range as managed by
1860      * a RamDiscardManager responsible for the mapped memory region of the
1861      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1862      * must not get populated automatically. We don't have to track
1863      * modifications via userfaultfd WP reliably, because these pages will
1864      * not be part of the migration stream either way -- see
1865      * ramblock_dirty_bitmap_exclude_discarded_pages().
1866      *
1867      * Note: The result is only stable while migrating (precopy/postcopy).
1868      */
1869     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1870         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1871         MemoryRegionSection section = {
1872             .mr = rb->mr,
1873             .offset_within_region = 0,
1874             .size = rb->mr->size,
1875         };
1876
1877         ram_discard_manager_replay_populated(rdm, &section,
1878                                              populate_read_section, NULL);
1879     } else {
1880         populate_read_range(rb, 0, rb->used_length);
1881     }
1882 }
1883
1884 /*
1885  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1886  */
1887 void ram_write_tracking_prepare(void)
1888 {
1889     RAMBlock *block;
1890
1891     RCU_READ_LOCK_GUARD();
1892
1893     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1894         /* Nothing to do with read-only and MMIO-writable regions */
1895         if (block->mr->readonly || block->mr->rom_device) {
1896             continue;
1897         }
1898
1899         /*
1900          * Populate pages of the RAM block before enabling userfault_fd
1901          * write protection.
1902          *
1903          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1904          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1905          * pages with pte_none() entries in page table.
1906          */
1907         ram_block_populate_read(block);
1908     }
1909 }
1910
1911 static inline int uffd_protect_section(MemoryRegionSection *section,
1912                                        void *opaque)
1913 {
1914     const hwaddr size = int128_get64(section->size);
1915     const hwaddr offset = section->offset_within_region;
1916     RAMBlock *rb = section->mr->ram_block;
1917     int uffd_fd = (uintptr_t)opaque;
1918
1919     return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1920                                   false);
1921 }
1922
1923 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1924 {
1925     assert(rb->flags & RAM_UF_WRITEPROTECT);
1926
1927     /* See ram_block_populate_read() */
1928     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1929         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1930         MemoryRegionSection section = {
1931             .mr = rb->mr,
1932             .offset_within_region = 0,
1933             .size = rb->mr->size,
1934         };
1935
1936         return ram_discard_manager_replay_populated(rdm, &section,
1937                                                     uffd_protect_section,
1938                                                     (void *)(uintptr_t)uffd_fd);
1939     }
1940     return uffd_change_protection(uffd_fd, rb->host,
1941                                   rb->used_length, true, false);
1942 }
1943
1944 /*
1945  * ram_write_tracking_start: start UFFD-WP memory tracking
1946  *
1947  * Returns 0 for success or negative value in case of error
1948  */
1949 int ram_write_tracking_start(void)
1950 {
1951     int uffd_fd;
1952     RAMState *rs = ram_state;
1953     RAMBlock *block;
1954
1955     /* Open UFFD file descriptor */
1956     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1957     if (uffd_fd < 0) {
1958         return uffd_fd;
1959     }
1960     rs->uffdio_fd = uffd_fd;
1961
1962     RCU_READ_LOCK_GUARD();
1963
1964     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1965         /* Nothing to do with read-only and MMIO-writable regions */
1966         if (block->mr->readonly || block->mr->rom_device) {
1967             continue;
1968         }
1969
1970         /* Register block memory with UFFD to track writes */
1971         if (uffd_register_memory(rs->uffdio_fd, block->host,
1972                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1973             goto fail;
1974         }
1975         block->flags |= RAM_UF_WRITEPROTECT;
1976         memory_region_ref(block->mr);
1977
1978         /* Apply UFFD write protection to the block memory range */
1979         if (ram_block_uffd_protect(block, uffd_fd)) {
1980             goto fail;
1981         }
1982
1983         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1984                 block->host, block->max_length);
1985     }
1986
1987     return 0;
1988
1989 fail:
1990     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1991
1992     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1993         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1994             continue;
1995         }
1996         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1997         /* Cleanup flags and remove reference */
1998         block->flags &= ~RAM_UF_WRITEPROTECT;
1999         memory_region_unref(block->mr);
2000     }
2001
2002     uffd_close_fd(uffd_fd);
2003     rs->uffdio_fd = -1;
2004     return -1;
2005 }
2006
2007 /**
2008  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
2009  */
2010 void ram_write_tracking_stop(void)
2011 {
2012     RAMState *rs = ram_state;
2013     RAMBlock *block;
2014
2015     RCU_READ_LOCK_GUARD();
2016
2017     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2018         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
2019             continue;
2020         }
2021         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
2022
2023         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2024                 block->host, block->max_length);
2025
2026         /* Cleanup flags and remove reference */
2027         block->flags &= ~RAM_UF_WRITEPROTECT;
2028         memory_region_unref(block->mr);
2029     }
2030
2031     /* Finally close UFFD file descriptor */
2032     uffd_close_fd(rs->uffdio_fd);
2033     rs->uffdio_fd = -1;
2034 }
2035
2036 #else
2037 /* No target OS support, stubs just fail or ignore */
2038
2039 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2040 {
2041     (void) rs;
2042     (void) offset;
2043
2044     return NULL;
2045 }
2046
2047 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2048         unsigned long start_page)
2049 {
2050     (void) rs;
2051     (void) pss;
2052     (void) start_page;
2053
2054     return 0;
2055 }
2056
/* Stub: write tracking is reported as unavailable on this target OS. */
bool ram_write_tracking_available(void)
{
    return false;
}
2061
/* Stub: must never be called when write tracking is unavailable. */
bool ram_write_tracking_compatible(void)
{
    assert(0);

    /* Only reached if the assertion is compiled out */
    return false;
}
2067
/* Stub: must never be called when write tracking is unavailable. */
int ram_write_tracking_start(void)
{
    assert(0);

    /* Only reached if the assertion is compiled out */
    return -1;
}
2073
/* Stub: must never be called when write tracking is unavailable. */
void ram_write_tracking_stop(void)
{
    assert(0);
}
2078 #endif /* defined(__linux__) */
2079
2080 /**
2081  * get_queued_page: unqueue a page from the postcopy requests
2082  *
2083  * Skips pages that are already sent (!dirty)
2084  *
2085  * Returns true if a queued page is found
2086  *
2087  * @rs: current RAM state
2088  * @pss: data about the state of the current dirty page scan
2089  */
2090 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2091 {
2092     RAMBlock  *block;
2093     ram_addr_t offset;
2094     bool dirty;
2095
2096     do {
2097         block = unqueue_page(rs, &offset);
2098         /*
2099          * We're sending this page, and since it's postcopy nothing else
2100          * will dirty it, and we must make sure it doesn't get sent again
2101          * even if this queue request was received after the background
2102          * search already sent it.
2103          */
2104         if (block) {
2105             unsigned long page;
2106
2107             page = offset >> TARGET_PAGE_BITS;
2108             dirty = test_bit(page, block->bmap);
2109             if (!dirty) {
2110                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2111                                                 page);
2112             } else {
2113                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2114             }
2115         }
2116
2117     } while (block && !dirty);
2118
2119     if (!block) {
2120         /*
2121          * Poll write faults too if background snapshot is enabled; that's
2122          * when we have vcpus got blocked by the write protected pages.
2123          */
2124         block = poll_fault_page(rs, &offset);
2125     }
2126
2127     if (block) {
2128         /*
2129          * We want the background search to continue from the queued page
2130          * since the guest is likely to want other pages near to the page
2131          * it just requested.
2132          */
2133         pss->block = block;
2134         pss->page = offset >> TARGET_PAGE_BITS;
2135
2136         /*
2137          * This unqueued page would break the "one round" check, even is
2138          * really rare.
2139          */
2140         pss->complete_round = false;
2141     }
2142
2143     return !!block;
2144 }
2145
2146 /**
2147  * migration_page_queue_free: drop any remaining pages in the ram
2148  * request queue
2149  *
2150  * It should be empty at the end anyway, but in error cases there may
2151  * be some left.  in case that there is any page left, we drop it.
2152  *
2153  */
2154 static void migration_page_queue_free(RAMState *rs)
2155 {
2156     struct RAMSrcPageRequest *mspr, *next_mspr;
2157     /* This queue generally should be empty - but in the case of a failed
2158      * migration might have some droppings in.
2159      */
2160     RCU_READ_LOCK_GUARD();
2161     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2162         memory_region_unref(mspr->rb->mr);
2163         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2164         g_free(mspr);
2165     }
2166 }
2167
2168 /**
2169  * ram_save_queue_pages: queue the page for transmission
2170  *
2171  * A request from postcopy destination for example.
2172  *
2173  * Returns zero on success or negative on error
2174  *
2175  * @rbname: Name of the RAMBLock of the request. NULL means the
2176  *          same that last one.
2177  * @start: starting address from the start of the RAMBlock
2178  * @len: length (in bytes) to send
2179  */
2180 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2181 {
2182     RAMBlock *ramblock;
2183     RAMState *rs = ram_state;
2184
2185     ram_counters.postcopy_requests++;
2186     RCU_READ_LOCK_GUARD();
2187
2188     if (!rbname) {
2189         /* Reuse last RAMBlock */
2190         ramblock = rs->last_req_rb;
2191
2192         if (!ramblock) {
2193             /*
2194              * Shouldn't happen, we can't reuse the last RAMBlock if
2195              * it's the 1st request.
2196              */
2197             error_report("ram_save_queue_pages no previous block");
2198             return -1;
2199         }
2200     } else {
2201         ramblock = qemu_ram_block_by_name(rbname);
2202
2203         if (!ramblock) {
2204             /* We shouldn't be asked for a non-existent RAMBlock */
2205             error_report("ram_save_queue_pages no block '%s'", rbname);
2206             return -1;
2207         }
2208         rs->last_req_rb = ramblock;
2209     }
2210     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2211     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2212         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2213                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2214                      __func__, start, len, ramblock->used_length);
2215         return -1;
2216     }
2217
2218     /*
2219      * When with postcopy preempt, we send back the page directly in the
2220      * rp-return thread.
2221      */
2222     if (postcopy_preempt_active()) {
2223         ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2224         size_t page_size = qemu_ram_pagesize(ramblock);
2225         PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2226         int ret = 0;
2227
2228         qemu_mutex_lock(&rs->bitmap_mutex);
2229
2230         pss_init(pss, ramblock, page_start);
2231         /*
2232          * Always use the preempt channel, and make sure it's there.  It's
2233          * safe to access without lock, because when rp-thread is running
2234          * we should be the only one who operates on the qemufile
2235          */
2236         pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2237         assert(pss->pss_channel);
2238
2239         /*
2240          * It must be either one or multiple of host page size.  Just
2241          * assert; if something wrong we're mostly split brain anyway.
2242          */
2243         assert(len % page_size == 0);
2244         while (len) {
2245             if (ram_save_host_page_urgent(pss)) {
2246                 error_report("%s: ram_save_host_page_urgent() failed: "
2247                              "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2248                              __func__, ramblock->idstr, start);
2249                 ret = -1;
2250                 break;
2251             }
2252             /*
2253              * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2254              * will automatically be moved and point to the next host page
2255              * we're going to send, so no need to update here.
2256              *
2257              * Normally QEMU never sends >1 host page in requests, so
2258              * logically we don't even need that as the loop should only
2259              * run once, but just to be consistent.
2260              */
2261             len -= page_size;
2262         };
2263         qemu_mutex_unlock(&rs->bitmap_mutex);
2264
2265         return ret;
2266     }
2267
2268     struct RAMSrcPageRequest *new_entry =
2269         g_new0(struct RAMSrcPageRequest, 1);
2270     new_entry->rb = ramblock;
2271     new_entry->offset = start;
2272     new_entry->len = len;
2273
2274     memory_region_ref(ramblock->mr);
2275     qemu_mutex_lock(&rs->src_page_req_mutex);
2276     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2277     migration_make_urgent_request();
2278     qemu_mutex_unlock(&rs->src_page_req_mutex);
2279
2280     return 0;
2281 }
2282
2283 static bool save_page_use_compression(RAMState *rs)
2284 {
2285     if (!migrate_use_compression()) {
2286         return false;
2287     }
2288
2289     /*
2290      * If xbzrle is enabled (e.g., after first round of migration), stop
2291      * using the data compression. In theory, xbzrle can do better than
2292      * compression.
2293      */
2294     if (rs->xbzrle_enabled) {
2295         return false;
2296     }
2297
2298     return true;
2299 }
2300
2301 /*
2302  * try to compress the page before posting it out, return true if the page
2303  * has been properly handled by compression, otherwise needs other
2304  * paths to handle it
2305  */
2306 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2307                                RAMBlock *block, ram_addr_t offset)
2308 {
2309     if (!save_page_use_compression(rs)) {
2310         return false;
2311     }
2312
2313     /*
2314      * When starting the process of a new block, the first page of
2315      * the block should be sent out before other pages in the same
2316      * block, and all the pages in last block should have been sent
2317      * out, keeping this order is important, because the 'cont' flag
2318      * is used to avoid resending the block name.
2319      *
2320      * We post the fist page as normal page as compression will take
2321      * much CPU resource.
2322      */
2323     if (block != pss->last_sent_block) {
2324         flush_compressed_data(rs);
2325         return false;
2326     }
2327
2328     if (compress_page_with_multi_thread(block, offset) > 0) {
2329         return true;
2330     }
2331
2332     compression_counters.busy++;
2333     return false;
2334 }
2335
2336 /**
2337  * ram_save_target_page_legacy: save one target page
2338  *
2339  * Returns the number of pages written
2340  *
2341  * @rs: current RAM state
2342  * @pss: data about the page we want to send
2343  */
2344 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2345 {
2346     RAMBlock *block = pss->block;
2347     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2348     int res;
2349
2350     if (control_save_page(pss, block, offset, &res)) {
2351         return res;
2352     }
2353
2354     if (save_compress_page(rs, pss, block, offset)) {
2355         return 1;
2356     }
2357
2358     res = save_zero_page(pss, block, offset);
2359     if (res > 0) {
2360         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2361          * page would be stale
2362          */
2363         if (rs->xbzrle_enabled) {
2364             XBZRLE_cache_lock();
2365             xbzrle_cache_zero_page(rs, block->offset + offset);
2366             XBZRLE_cache_unlock();
2367         }
2368         return res;
2369     }
2370
2371     /*
2372      * Do not use multifd in postcopy as one whole host page should be
2373      * placed.  Meanwhile postcopy requires atomic update of pages, so even
2374      * if host page size == guest page size the dest guest during run may
2375      * still see partially copied pages which is data corruption.
2376      */
2377     if (migrate_use_multifd() && !migration_in_postcopy()) {
2378         return ram_save_multifd_page(pss->pss_channel, block, offset);
2379     }
2380
2381     return ram_save_page(rs, pss);
2382 }
2383
2384 /* Should be called before sending a host page */
2385 static void pss_host_page_prepare(PageSearchStatus *pss)
2386 {
2387     /* How many guest pages are there in one host page? */
2388     size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2389
2390     pss->host_page_sending = true;
2391     if (guest_pfns <= 1) {
2392         /*
2393          * This covers both when guest psize == host psize, or when guest
2394          * has larger psize than the host (guest_pfns==0).
2395          *
2396          * For the latter, we always send one whole guest page per
2397          * iteration of the host page (example: an Alpha VM on x86 host
2398          * will have guest psize 8K while host psize 4K).
2399          */
2400         pss->host_page_start = pss->page;
2401         pss->host_page_end = pss->page + 1;
2402     } else {
2403         /*
2404          * The host page spans over multiple guest pages, we send them
2405          * within the same host page iteration.
2406          */
2407         pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2408         pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2409     }
2410 }
2411
2412 /*
2413  * Whether the page pointed by PSS is within the host page being sent.
2414  * Must be called after a previous pss_host_page_prepare().
2415  */
2416 static bool pss_within_range(PageSearchStatus *pss)
2417 {
2418     ram_addr_t ram_addr;
2419
2420     assert(pss->host_page_sending);
2421
2422     /* Over host-page boundary? */
2423     if (pss->page >= pss->host_page_end) {
2424         return false;
2425     }
2426
2427     ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2428
2429     return offset_in_ramblock(pss->block, ram_addr);
2430 }
2431
2432 static void pss_host_page_finish(PageSearchStatus *pss)
2433 {
2434     pss->host_page_sending = false;
2435     /* This is not needed, but just to reset it */
2436     pss->host_page_start = pss->host_page_end = 0;
2437 }
2438
2439 /*
2440  * Send an urgent host page specified by `pss'.  Need to be called with
2441  * bitmap_mutex held.
2442  *
2443  * Returns 0 if save host page succeeded, false otherwise.
2444  */
2445 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2446 {
2447     bool page_dirty, sent = false;
2448     RAMState *rs = ram_state;
2449     int ret = 0;
2450
2451     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2452     pss_host_page_prepare(pss);
2453
2454     /*
2455      * If precopy is sending the same page, let it be done in precopy, or
2456      * we could send the same page in two channels and none of them will
2457      * receive the whole page.
2458      */
2459     if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2460         trace_postcopy_preempt_hit(pss->block->idstr,
2461                                    pss->page << TARGET_PAGE_BITS);
2462         return 0;
2463     }
2464
2465     do {
2466         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2467
2468         if (page_dirty) {
2469             /* Be strict to return code; it must be 1, or what else? */
2470             if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2471                 error_report_once("%s: ram_save_target_page failed", __func__);
2472                 ret = -1;
2473                 goto out;
2474             }
2475             sent = true;
2476         }
2477         pss_find_next_dirty(pss);
2478     } while (pss_within_range(pss));
2479 out:
2480     pss_host_page_finish(pss);
2481     /* For urgent requests, flush immediately if sent */
2482     if (sent) {
2483         qemu_fflush(pss->pss_channel);
2484     }
2485     return ret;
2486 }
2487
2488 /**
2489  * ram_save_host_page: save a whole host page
2490  *
2491  * Starting at *offset send pages up to the end of the current host
2492  * page. It's valid for the initial offset to point into the middle of
2493  * a host page in which case the remainder of the hostpage is sent.
2494  * Only dirty target pages are sent. Note that the host page size may
2495  * be a huge page for this block.
2496  *
2497  * The saving stops at the boundary of the used_length of the block
2498  * if the RAMBlock isn't a multiple of the host page size.
2499  *
2500  * The caller must be with ram_state.bitmap_mutex held to call this
2501  * function.  Note that this function can temporarily release the lock, but
2502  * when the function is returned it'll make sure the lock is still held.
2503  *
2504  * Returns the number of pages written or negative on error
2505  *
2506  * @rs: current RAM state
2507  * @pss: data about the page we want to send
2508  */
2509 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2510 {
2511     bool page_dirty, preempt_active = postcopy_preempt_active();
2512     int tmppages, pages = 0;
2513     size_t pagesize_bits =
2514         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2515     unsigned long start_page = pss->page;
2516     int res;
2517
2518     if (ramblock_is_ignored(pss->block)) {
2519         error_report("block %s should not be migrated !", pss->block->idstr);
2520         return 0;
2521     }
2522
2523     /* Update host page boundary information */
2524     pss_host_page_prepare(pss);
2525
2526     do {
2527         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2528
2529         /* Check the pages is dirty and if it is send it */
2530         if (page_dirty) {
2531             /*
2532              * Properly yield the lock only in postcopy preempt mode
2533              * because both migration thread and rp-return thread can
2534              * operate on the bitmaps.
2535              */
2536             if (preempt_active) {
2537                 qemu_mutex_unlock(&rs->bitmap_mutex);
2538             }
2539             tmppages = migration_ops->ram_save_target_page(rs, pss);
2540             if (tmppages >= 0) {
2541                 pages += tmppages;
2542                 /*
2543                  * Allow rate limiting to happen in the middle of huge pages if
2544                  * something is sent in the current iteration.
2545                  */
2546                 if (pagesize_bits > 1 && tmppages > 0) {
2547                     migration_rate_limit();
2548                 }
2549             }
2550             if (preempt_active) {
2551                 qemu_mutex_lock(&rs->bitmap_mutex);
2552             }
2553         } else {
2554             tmppages = 0;
2555         }
2556
2557         if (tmppages < 0) {
2558             pss_host_page_finish(pss);
2559             return tmppages;
2560         }
2561
2562         pss_find_next_dirty(pss);
2563     } while (pss_within_range(pss));
2564
2565     pss_host_page_finish(pss);
2566
2567     res = ram_save_release_protection(rs, pss, start_page);
2568     return (res < 0 ? res : pages);
2569 }
2570
2571 /**
2572  * ram_find_and_save_block: finds a dirty page and sends it to f
2573  *
2574  * Called within an RCU critical section.
2575  *
2576  * Returns the number of pages written where zero means no dirty pages,
2577  * or negative on error
2578  *
2579  * @rs: current RAM state
2580  *
2581  * On systems where host-page-size > target-page-size it will send all the
2582  * pages in a host page that are dirty.
2583  */
2584 static int ram_find_and_save_block(RAMState *rs)
2585 {
2586     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2587     int pages = 0;
2588
2589     /* No dirty page as there is zero RAM */
2590     if (!rs->ram_bytes_total) {
2591         return pages;
2592     }
2593
2594     /*
2595      * Always keep last_seen_block/last_page valid during this procedure,
2596      * because find_dirty_block() relies on these values (e.g., we compare
2597      * last_seen_block with pss.block to see whether we searched all the
2598      * ramblocks) to detect the completion of migration.  Having NULL value
2599      * of last_seen_block can conditionally cause below loop to run forever.
2600      */
2601     if (!rs->last_seen_block) {
2602         rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2603         rs->last_page = 0;
2604     }
2605
2606     pss_init(pss, rs->last_seen_block, rs->last_page);
2607
2608     while (true){
2609         if (!get_queued_page(rs, pss)) {
2610             /* priority queue empty, so just search for something dirty */
2611             int res = find_dirty_block(rs, pss);
2612             if (res != PAGE_DIRTY_FOUND) {
2613                 if (res == PAGE_ALL_CLEAN) {
2614                     break;
2615                 } else if (res == PAGE_TRY_AGAIN) {
2616                     continue;
2617                 }
2618             }
2619         }
2620         pages = ram_save_host_page(rs, pss);
2621         if (pages) {
2622             break;
2623         }
2624     }
2625
2626     rs->last_seen_block = pss->block;
2627     rs->last_page = pss->page;
2628
2629     return pages;
2630 }
2631
2632 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2633 {
2634     uint64_t pages = size / TARGET_PAGE_SIZE;
2635
2636     if (zero) {
2637         stat64_add(&ram_atomic_counters.duplicate, pages);
2638     } else {
2639         stat64_add(&ram_atomic_counters.normal, pages);
2640         ram_transferred_add(size);
2641         qemu_file_credit_transfer(f, size);
2642     }
2643 }
2644
2645 static uint64_t ram_bytes_total_with_ignored(void)
2646 {
2647     RAMBlock *block;
2648     uint64_t total = 0;
2649
2650     RCU_READ_LOCK_GUARD();
2651
2652     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2653         total += block->used_length;
2654     }
2655     return total;
2656 }
2657
2658 uint64_t ram_bytes_total(void)
2659 {
2660     RAMBlock *block;
2661     uint64_t total = 0;
2662
2663     RCU_READ_LOCK_GUARD();
2664
2665     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2666         total += block->used_length;
2667     }
2668     return total;
2669 }
2670
2671 static void xbzrle_load_setup(void)
2672 {
2673     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2674 }
2675
2676 static void xbzrle_load_cleanup(void)
2677 {
2678     g_free(XBZRLE.decoded_buf);
2679     XBZRLE.decoded_buf = NULL;
2680 }
2681
2682 static void ram_state_cleanup(RAMState **rsp)
2683 {
2684     if (*rsp) {
2685         migration_page_queue_free(*rsp);
2686         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2687         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2688         g_free(*rsp);
2689         *rsp = NULL;
2690     }
2691 }
2692
2693 static void xbzrle_cleanup(void)
2694 {
2695     XBZRLE_cache_lock();
2696     if (XBZRLE.cache) {
2697         cache_fini(XBZRLE.cache);
2698         g_free(XBZRLE.encoded_buf);
2699         g_free(XBZRLE.current_buf);
2700         g_free(XBZRLE.zero_target_page);
2701         XBZRLE.cache = NULL;
2702         XBZRLE.encoded_buf = NULL;
2703         XBZRLE.current_buf = NULL;
2704         XBZRLE.zero_target_page = NULL;
2705     }
2706     XBZRLE_cache_unlock();
2707 }
2708
2709 static void ram_save_cleanup(void *opaque)
2710 {
2711     RAMState **rsp = opaque;
2712     RAMBlock *block;
2713
2714     /* We don't use dirty log with background snapshots */
2715     if (!migrate_background_snapshot()) {
2716         /* caller have hold iothread lock or is in a bh, so there is
2717          * no writing race against the migration bitmap
2718          */
2719         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2720             /*
2721              * do not stop dirty log without starting it, since
2722              * memory_global_dirty_log_stop will assert that
2723              * memory_global_dirty_log_start/stop used in pairs
2724              */
2725             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2726         }
2727     }
2728
2729     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2730         g_free(block->clear_bmap);
2731         block->clear_bmap = NULL;
2732         g_free(block->bmap);
2733         block->bmap = NULL;
2734     }
2735
2736     xbzrle_cleanup();
2737     compress_threads_save_cleanup();
2738     ram_state_cleanup(rsp);
2739     g_free(migration_ops);
2740     migration_ops = NULL;
2741 }
2742
2743 static void ram_state_reset(RAMState *rs)
2744 {
2745     int i;
2746
2747     for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2748         rs->pss[i].last_sent_block = NULL;
2749     }
2750
2751     rs->last_seen_block = NULL;
2752     rs->last_page = 0;
2753     rs->last_version = ram_list.version;
2754     rs->xbzrle_enabled = false;
2755 }
2756
2757 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2758
2759 /* **** functions for postcopy ***** */
2760
2761 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2762 {
2763     struct RAMBlock *block;
2764
2765     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2766         unsigned long *bitmap = block->bmap;
2767         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2768         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2769
2770         while (run_start < range) {
2771             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2772             ram_discard_range(block->idstr,
2773                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2774                               ((ram_addr_t)(run_end - run_start))
2775                                 << TARGET_PAGE_BITS);
2776             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2777         }
2778     }
2779 }
2780
2781 /**
2782  * postcopy_send_discard_bm_ram: discard a RAMBlock
2783  *
2784  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2785  *
2786  * @ms: current migration state
2787  * @block: RAMBlock to discard
2788  */
2789 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2790 {
2791     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2792     unsigned long current;
2793     unsigned long *bitmap = block->bmap;
2794
2795     for (current = 0; current < end; ) {
2796         unsigned long one = find_next_bit(bitmap, end, current);
2797         unsigned long zero, discard_length;
2798
2799         if (one >= end) {
2800             break;
2801         }
2802
2803         zero = find_next_zero_bit(bitmap, end, one + 1);
2804
2805         if (zero >= end) {
2806             discard_length = end - one;
2807         } else {
2808             discard_length = zero - one;
2809         }
2810         postcopy_discard_send_range(ms, one, discard_length);
2811         current = one + discard_length;
2812     }
2813 }
2814
2815 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2816
2817 /**
2818  * postcopy_each_ram_send_discard: discard all RAMBlocks
2819  *
2820  * Utility for the outgoing postcopy code.
2821  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2822  *   passing it bitmap indexes and name.
2823  * (qemu_ram_foreach_block ends up passing unscaled lengths
2824  *  which would mean postcopy code would have to deal with target page)
2825  *
2826  * @ms: current migration state
2827  */
2828 static void postcopy_each_ram_send_discard(MigrationState *ms)
2829 {
2830     struct RAMBlock *block;
2831
2832     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2833         postcopy_discard_send_init(ms, block->idstr);
2834
2835         /*
2836          * Deal with TPS != HPS and huge pages.  It discard any partially sent
2837          * host-page size chunks, mark any partially dirty host-page size
2838          * chunks as all dirty.  In this case the host-page is the host-page
2839          * for the particular RAMBlock, i.e. it might be a huge page.
2840          */
2841         postcopy_chunk_hostpages_pass(ms, block);
2842
2843         /*
2844          * Postcopy sends chunks of bitmap over the wire, but it
2845          * just needs indexes at this point, avoids it having
2846          * target page specific code.
2847          */
2848         postcopy_send_discard_bm_ram(ms, block);
2849         postcopy_discard_send_finish(ms);
2850     }
2851 }
2852
2853 /**
2854  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2855  *
2856  * Helper for postcopy_chunk_hostpages; it's called twice to
2857  * canonicalize the two bitmaps, that are similar, but one is
2858  * inverted.
2859  *
2860  * Postcopy requires that all target pages in a hostpage are dirty or
2861  * clean, not a mix.  This function canonicalizes the bitmaps.
2862  *
2863  * @ms: current migration state
2864  * @block: block that contains the page we want to canonicalize
2865  */
2866 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2867 {
2868     RAMState *rs = ram_state;
2869     unsigned long *bitmap = block->bmap;
2870     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2871     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2872     unsigned long run_start;
2873
2874     if (block->page_size == TARGET_PAGE_SIZE) {
2875         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2876         return;
2877     }
2878
2879     /* Find a dirty page */
2880     run_start = find_next_bit(bitmap, pages, 0);
2881
2882     while (run_start < pages) {
2883
2884         /*
2885          * If the start of this run of pages is in the middle of a host
2886          * page, then we need to fixup this host page.
2887          */
2888         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2889             /* Find the end of this run */
2890             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2891             /*
2892              * If the end isn't at the start of a host page, then the
2893              * run doesn't finish at the end of a host page
2894              * and we need to discard.
2895              */
2896         }
2897
2898         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2899             unsigned long page;
2900             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2901                                                              host_ratio);
2902             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2903
2904             /* Clean up the bitmap */
2905             for (page = fixup_start_addr;
2906                  page < fixup_start_addr + host_ratio; page++) {
2907                 /*
2908                  * Remark them as dirty, updating the count for any pages
2909                  * that weren't previously dirty.
2910                  */
2911                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2912             }
2913         }
2914
2915         /* Find the next dirty page for the next iteration */
2916         run_start = find_next_bit(bitmap, pages, run_start);
2917     }
2918 }
2919
2920 /**
2921  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2922  *
2923  * Transmit the set of pages to be discarded after precopy to the target
2924  * these are pages that:
2925  *     a) Have been previously transmitted but are now dirty again
2926  *     b) Pages that have never been transmitted, this ensures that
2927  *        any pages on the destination that have been mapped by background
2928  *        tasks get discarded (transparent huge pages is the specific concern)
2929  * Hopefully this is pretty sparse
2930  *
2931  * @ms: current migration state
2932  */
2933 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2934 {
2935     RAMState *rs = ram_state;
2936
2937     RCU_READ_LOCK_GUARD();
2938
2939     /* This should be our last sync, the src is now paused */
2940     migration_bitmap_sync(rs);
2941
2942     /* Easiest way to make sure we don't resume in the middle of a host-page */
2943     rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2944     rs->last_seen_block = NULL;
2945     rs->last_page = 0;
2946
2947     postcopy_each_ram_send_discard(ms);
2948
2949     trace_ram_postcopy_send_discard_bitmap();
2950 }
2951
2952 /**
2953  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2954  *
2955  * Returns zero on success
2956  *
2957  * @rbname: name of the RAMBlock of the request. NULL means the
2958  *          same that last one.
2959  * @start: RAMBlock starting page
2960  * @length: RAMBlock size
2961  */
2962 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2963 {
2964     trace_ram_discard_range(rbname, start, length);
2965
2966     RCU_READ_LOCK_GUARD();
2967     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2968
2969     if (!rb) {
2970         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2971         return -1;
2972     }
2973
2974     /*
2975      * On source VM, we don't need to update the received bitmap since
2976      * we don't even have one.
2977      */
2978     if (rb->receivedmap) {
2979         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2980                      length >> qemu_target_page_bits());
2981     }
2982
2983     return ram_block_discard_range(rb, start, length);
2984 }
2985
2986 /*
2987  * For every allocation, we will try not to crash the VM if the
2988  * allocation failed.
2989  */
2990 static int xbzrle_init(void)
2991 {
2992     Error *local_err = NULL;
2993
2994     if (!migrate_use_xbzrle()) {
2995         return 0;
2996     }
2997
2998     XBZRLE_cache_lock();
2999
3000     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3001     if (!XBZRLE.zero_target_page) {
3002         error_report("%s: Error allocating zero page", __func__);
3003         goto err_out;
3004     }
3005
3006     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3007                               TARGET_PAGE_SIZE, &local_err);
3008     if (!XBZRLE.cache) {
3009         error_report_err(local_err);
3010         goto free_zero_page;
3011     }
3012
3013     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3014     if (!XBZRLE.encoded_buf) {
3015         error_report("%s: Error allocating encoded_buf", __func__);
3016         goto free_cache;
3017     }
3018
3019     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3020     if (!XBZRLE.current_buf) {
3021         error_report("%s: Error allocating current_buf", __func__);
3022         goto free_encoded_buf;
3023     }
3024
3025     /* We are all good */
3026     XBZRLE_cache_unlock();
3027     return 0;
3028
3029 free_encoded_buf:
3030     g_free(XBZRLE.encoded_buf);
3031     XBZRLE.encoded_buf = NULL;
3032 free_cache:
3033     cache_fini(XBZRLE.cache);
3034     XBZRLE.cache = NULL;
3035 free_zero_page:
3036     g_free(XBZRLE.zero_target_page);
3037     XBZRLE.zero_target_page = NULL;
3038 err_out:
3039     XBZRLE_cache_unlock();
3040     return -ENOMEM;
3041 }
3042
3043 static int ram_state_init(RAMState **rsp)
3044 {
3045     *rsp = g_try_new0(RAMState, 1);
3046
3047     if (!*rsp) {
3048         error_report("%s: Init ramstate fail", __func__);
3049         return -1;
3050     }
3051
3052     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3053     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3054     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3055     (*rsp)->ram_bytes_total = ram_bytes_total();
3056
3057     /*
3058      * Count the total number of pages used by ram blocks not including any
3059      * gaps due to alignment or unplugs.
3060      * This must match with the initial values of dirty bitmap.
3061      */
3062     (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
3063     ram_state_reset(*rsp);
3064
3065     return 0;
3066 }
3067
3068 static void ram_list_init_bitmaps(void)
3069 {
3070     MigrationState *ms = migrate_get_current();
3071     RAMBlock *block;
3072     unsigned long pages;
3073     uint8_t shift;
3074
3075     /* Skip setting bitmap if there is no RAM */
3076     if (ram_bytes_total()) {
3077         shift = ms->clear_bitmap_shift;
3078         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3079             error_report("clear_bitmap_shift (%u) too big, using "
3080                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3081             shift = CLEAR_BITMAP_SHIFT_MAX;
3082         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3083             error_report("clear_bitmap_shift (%u) too small, using "
3084                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3085             shift = CLEAR_BITMAP_SHIFT_MIN;
3086         }
3087
3088         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3089             pages = block->max_length >> TARGET_PAGE_BITS;
3090             /*
3091              * The initial dirty bitmap for migration must be set with all
3092              * ones to make sure we'll migrate every guest RAM page to
3093              * destination.
3094              * Here we set RAMBlock.bmap all to 1 because when rebegin a
3095              * new migration after a failed migration, ram_list.
3096              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3097              * guest memory.
3098              */
3099             block->bmap = bitmap_new(pages);
3100             bitmap_set(block->bmap, 0, pages);
3101             block->clear_bmap_shift = shift;
3102             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3103         }
3104     }
3105 }
3106
3107 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3108 {
3109     unsigned long pages;
3110     RAMBlock *rb;
3111
3112     RCU_READ_LOCK_GUARD();
3113
3114     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3115             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3116             rs->migration_dirty_pages -= pages;
3117     }
3118 }
3119
3120 static void ram_init_bitmaps(RAMState *rs)
3121 {
3122     /* For memory_global_dirty_log_start below.  */
3123     qemu_mutex_lock_iothread();
3124     qemu_mutex_lock_ramlist();
3125
3126     WITH_RCU_READ_LOCK_GUARD() {
3127         ram_list_init_bitmaps();
3128         /* We don't use dirty log with background snapshots */
3129         if (!migrate_background_snapshot()) {
3130             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3131             migration_bitmap_sync_precopy(rs);
3132         }
3133     }
3134     qemu_mutex_unlock_ramlist();
3135     qemu_mutex_unlock_iothread();
3136
3137     /*
3138      * After an eventual first bitmap sync, fixup the initial bitmap
3139      * containing all 1s to exclude any discarded pages from migration.
3140      */
3141     migration_bitmap_clear_discarded_pages(rs);
3142 }
3143
3144 static int ram_init_all(RAMState **rsp)
3145 {
3146     if (ram_state_init(rsp)) {
3147         return -1;
3148     }
3149
3150     if (xbzrle_init()) {
3151         ram_state_cleanup(rsp);
3152         return -1;
3153     }
3154
3155     ram_init_bitmaps(*rsp);
3156
3157     return 0;
3158 }
3159
3160 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3161 {
3162     RAMBlock *block;
3163     uint64_t pages = 0;
3164
3165     /*
3166      * Postcopy is not using xbzrle/compression, so no need for that.
3167      * Also, since source are already halted, we don't need to care
3168      * about dirty page logging as well.
3169      */
3170
3171     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3172         pages += bitmap_count_one(block->bmap,
3173                                   block->used_length >> TARGET_PAGE_BITS);
3174     }
3175
3176     /* This may not be aligned with current bitmaps. Recalculate. */
3177     rs->migration_dirty_pages = pages;
3178
3179     ram_state_reset(rs);
3180
3181     /* Update RAMState cache of output QEMUFile */
3182     rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
3183
3184     trace_ram_state_resume_prepare(pages);
3185 }
3186
3187 /*
3188  * This function clears bits of the free pages reported by the caller from the
3189  * migration dirty bitmap. @addr is the host address corresponding to the
3190  * start of the continuous guest free pages, and @len is the total bytes of
3191  * those pages.
3192  */
3193 void qemu_guest_free_page_hint(void *addr, size_t len)
3194 {
3195     RAMBlock *block;
3196     ram_addr_t offset;
3197     size_t used_len, start, npages;
3198     MigrationState *s = migrate_get_current();
3199
3200     /* This function is currently expected to be used during live migration */
3201     if (!migration_is_setup_or_active(s->state)) {
3202         return;
3203     }
3204
3205     for (; len > 0; len -= used_len, addr += used_len) {
3206         block = qemu_ram_block_from_host(addr, false, &offset);
3207         if (unlikely(!block || offset >= block->used_length)) {
3208             /*
3209              * The implementation might not support RAMBlock resize during
3210              * live migration, but it could happen in theory with future
3211              * updates. So we add a check here to capture that case.
3212              */
3213             error_report_once("%s unexpected error", __func__);
3214             return;
3215         }
3216
3217         if (len <= block->used_length - offset) {
3218             used_len = len;
3219         } else {
3220             used_len = block->used_length - offset;
3221         }
3222
3223         start = offset >> TARGET_PAGE_BITS;
3224         npages = used_len >> TARGET_PAGE_BITS;
3225
3226         qemu_mutex_lock(&ram_state->bitmap_mutex);
3227         /*
3228          * The skipped free pages are equavalent to be sent from clear_bmap's
3229          * perspective, so clear the bits from the memory region bitmap which
3230          * are initially set. Otherwise those skipped pages will be sent in
3231          * the next round after syncing from the memory region bitmap.
3232          */
3233         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3234         ram_state->migration_dirty_pages -=
3235                       bitmap_count_one_with_offset(block->bmap, start, npages);
3236         bitmap_clear(block->bmap, start, npages);
3237         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3238     }
3239 }
3240
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
3247
3248 /**
3249  * ram_save_setup: Setup RAM for migration
3250  *
3251  * Returns zero to indicate success and negative for error
3252  *
3253  * @f: QEMUFile where to send the data
3254  * @opaque: RAMState pointer
3255  */
3256 static int ram_save_setup(QEMUFile *f, void *opaque)
3257 {
3258     RAMState **rsp = opaque;
3259     RAMBlock *block;
3260     int ret;
3261
3262     if (compress_threads_save_setup()) {
3263         return -1;
3264     }
3265
3266     /* migration has already setup the bitmap, reuse it. */
3267     if (!migration_in_colo_state()) {
3268         if (ram_init_all(rsp) != 0) {
3269             compress_threads_save_cleanup();
3270             return -1;
3271         }
3272     }
3273     (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3274
3275     WITH_RCU_READ_LOCK_GUARD() {
3276         qemu_put_be64(f, ram_bytes_total_with_ignored()
3277                          | RAM_SAVE_FLAG_MEM_SIZE);
3278
3279         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3280             qemu_put_byte(f, strlen(block->idstr));
3281             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3282             qemu_put_be64(f, block->used_length);
3283             if (migrate_postcopy_ram() && block->page_size !=
3284                                           qemu_host_page_size) {
3285                 qemu_put_be64(f, block->page_size);
3286             }
3287             if (migrate_ignore_shared()) {
3288                 qemu_put_be64(f, block->mr->addr);
3289             }
3290         }
3291     }
3292
3293     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3294     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3295
3296     migration_ops = g_malloc0(sizeof(MigrationOps));
3297     migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3298     ret =  multifd_send_sync_main(f);
3299     if (ret < 0) {
3300         return ret;
3301     }
3302
3303     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3304     qemu_fflush(f);
3305
3306     return 0;
3307 }
3308
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Sends dirty pages with ram_find_and_save_block() until one of these
 * happens: the rate limit is reached while no postcopy request is pending,
 * the file reports an error, nothing is left to send, sending fails, or
 * more than MAX_WAIT milliseconds have elapsed.  The round is then closed
 * with a multifd sync and an EOS marker, provided no error occurred and
 * the migration is still in setup or active state.
 *
 * Nothing is sent at all while the bulk phase of block migration is
 * active; only the closing sequence above is executed.
 *
 * Returns 1 if ram_find_and_save_block() reported that nothing is left to
 * send, 0 if the round ended for any other non-error reason, and a
 * negative value on error
 *
 * @f: QEMUFile where to send the data
 * @opaque: pointer to the RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    /*
     * We'll take this lock a little bit long, but it's okay for two reasons.
     * Firstly, the only possible other thread to take it is who calls
     * qemu_guest_free_page_hint(), which should be rare; secondly, see
     * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
     * guarantees that we'll at least release it on a regular basis.
     */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        /* The RAM block list changed since the last round: start over */
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /*
         * ret keeps the last value returned by qemu_file_rate_limit(); a
         * pending postcopy request keeps the loop going even when that
         * value is non-zero.
         */
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
               postcopy_has_request(rs)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            /*
             * A negative page count is recorded as the file error; it is
             * read back with qemu_file_get_error() after the EOS marker.
             */
            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * we want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check every
             * 64 iterations
             */
            if ((i & 63) == 0) {
                /* Elapsed time since the start of the loop, in ms */
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
        if (ret < 0) {
            return ret;
        }

        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        /* Account for the 8 bytes of the EOS marker just written */
        ram_transferred_add(8);

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}
3428
3429 /**
3430  * ram_save_complete: function called to send the remaining amount of ram
3431  *
3432  * Returns zero to indicate success or negative on error
3433  *
3434  * Called with iothread lock
3435  *
3436  * @f: QEMUFile where to send the data
3437  * @opaque: RAMState pointer
3438  */
3439 static int ram_save_complete(QEMUFile *f, void *opaque)
3440 {
3441     RAMState **temp = opaque;
3442     RAMState *rs = *temp;
3443     int ret = 0;
3444
3445     rs->last_stage = !migration_in_colo_state();
3446
3447     WITH_RCU_READ_LOCK_GUARD() {
3448         if (!migration_in_postcopy()) {
3449             migration_bitmap_sync_precopy(rs);
3450         }
3451
3452         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3453
3454         /* try transferring iterative blocks of memory */
3455
3456         /* flush all remaining blocks regardless of rate limiting */
3457         qemu_mutex_lock(&rs->bitmap_mutex);
3458         while (true) {
3459             int pages;
3460
3461             pages = ram_find_and_save_block(rs);
3462             /* no more blocks to sent */
3463             if (pages == 0) {
3464                 break;
3465             }
3466             if (pages < 0) {
3467                 ret = pages;
3468                 break;
3469             }
3470         }
3471         qemu_mutex_unlock(&rs->bitmap_mutex);
3472
3473         flush_compressed_data(rs);
3474         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3475     }
3476
3477     if (ret < 0) {
3478         return ret;
3479     }
3480
3481     ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3482     if (ret < 0) {
3483         return ret;
3484     }
3485
3486     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3487     qemu_fflush(f);
3488
3489     return 0;
3490 }
3491
3492 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3493                                        uint64_t *can_postcopy)
3494 {
3495     RAMState **temp = opaque;
3496     RAMState *rs = *temp;
3497
3498     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3499
3500     if (migrate_postcopy_ram()) {
3501         /* We can do postcopy, and all the data is postcopiable */
3502         *can_postcopy += remaining_size;
3503     } else {
3504         *must_precopy += remaining_size;
3505     }
3506 }
3507
3508 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3509                                     uint64_t *can_postcopy)
3510 {
3511     RAMState **temp = opaque;
3512     RAMState *rs = *temp;
3513
3514     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3515
3516     if (!migration_in_postcopy()) {
3517         qemu_mutex_lock_iothread();
3518         WITH_RCU_READ_LOCK_GUARD() {
3519             migration_bitmap_sync_precopy(rs);
3520         }
3521         qemu_mutex_unlock_iothread();
3522         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3523     }
3524
3525     if (migrate_postcopy_ram()) {
3526         /* We can do postcopy, and all the data is postcopiable */
3527         *can_postcopy += remaining_size;
3528     } else {
3529         *must_precopy += remaining_size;
3530     }
3531 }
3532
3533 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3534 {
3535     unsigned int xh_len;
3536     int xh_flags;
3537     uint8_t *loaded_data;
3538
3539     /* extract RLE header */
3540     xh_flags = qemu_get_byte(f);
3541     xh_len = qemu_get_be16(f);
3542
3543     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3544         error_report("Failed to load XBZRLE page - wrong compression!");
3545         return -1;
3546     }
3547
3548     if (xh_len > TARGET_PAGE_SIZE) {
3549         error_report("Failed to load XBZRLE page - len overflow!");
3550         return -1;
3551     }
3552     loaded_data = XBZRLE.decoded_buf;
3553     /* load data and decode */
3554     /* it can change loaded_data to point to an internal buffer */
3555     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3556
3557     /* decode RLE */
3558     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3559                              TARGET_PAGE_SIZE) == -1) {
3560         error_report("Failed to load XBZRLE page - decode error!");
3561         return -1;
3562     }
3563
3564     return 0;
3565 }
3566
3567 /**
3568  * ram_block_from_stream: read a RAMBlock id from the migration stream
3569  *
3570  * Must be called from within a rcu critical section.
3571  *
3572  * Returns a pointer from within the RCU-protected ram_list.
3573  *
3574  * @mis: the migration incoming state pointer
3575  * @f: QEMUFile where to read the data from
3576  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3577  * @channel: the channel we're using
3578  */
3579 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3580                                               QEMUFile *f, int flags,
3581                                               int channel)
3582 {
3583     RAMBlock *block = mis->last_recv_block[channel];
3584     char id[256];
3585     uint8_t len;
3586
3587     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3588         if (!block) {
3589             error_report("Ack, bad migration stream!");
3590             return NULL;
3591         }
3592         return block;
3593     }
3594
3595     len = qemu_get_byte(f);
3596     qemu_get_buffer(f, (uint8_t *)id, len);
3597     id[len] = 0;
3598
3599     block = qemu_ram_block_by_name(id);
3600     if (!block) {
3601         error_report("Can't find block %s", id);
3602         return NULL;
3603     }
3604
3605     if (ramblock_is_ignored(block)) {
3606         error_report("block %s should not be migrated !", id);
3607         return NULL;
3608     }
3609
3610     mis->last_recv_block[channel] = block;
3611
3612     return block;
3613 }
3614
3615 static inline void *host_from_ram_block_offset(RAMBlock *block,
3616                                                ram_addr_t offset)
3617 {
3618     if (!offset_in_ramblock(block, offset)) {
3619         return NULL;
3620     }
3621
3622     return block->host + offset;
3623 }
3624
3625 static void *host_page_from_ram_block_offset(RAMBlock *block,
3626                                              ram_addr_t offset)
3627 {
3628     /* Note: Explicitly no check against offset_in_ramblock(). */
3629     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3630                                    block->page_size);
3631 }
3632
3633 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3634                                                          ram_addr_t offset)
3635 {
3636     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3637 }
3638
3639 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3640                              ram_addr_t offset, bool record_bitmap)
3641 {
3642     if (!offset_in_ramblock(block, offset)) {
3643         return NULL;
3644     }
3645     if (!block->colo_cache) {
3646         error_report("%s: colo_cache is NULL in block :%s",
3647                      __func__, block->idstr);
3648         return NULL;
3649     }
3650
3651     /*
3652     * During colo checkpoint, we need bitmap of these migrated pages.
3653     * It help us to decide which pages in ram cache should be flushed
3654     * into VM's RAM later.
3655     */
3656     if (record_bitmap &&
3657         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3658         ram_state->migration_dirty_pages++;
3659     }
3660     return block->colo_cache + offset;
3661 }
3662
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill @size bytes at @host with the byte @ch.  If a page (or a whole
 * RDMA chunk) has been determined to be zero and the destination is
 * already all zero, it is left untouched.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && buffer_is_zero(host, size)) {
        /* Already holds the requested content: do not write to it */
        return;
    }

    memset(host, ch, size);
}
3679
3680 /* return the size after decompression, or negative value on error */
3681 static int
3682 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3683                      const uint8_t *source, size_t source_len)
3684 {
3685     int err;
3686
3687     err = inflateReset(stream);
3688     if (err != Z_OK) {
3689         return -1;
3690     }
3691
3692     stream->avail_in = source_len;
3693     stream->next_in = (uint8_t *)source;
3694     stream->avail_out = dest_len;
3695     stream->next_out = dest;
3696
3697     err = inflate(stream, Z_NO_FLUSH);
3698     if (err != Z_STREAM_END) {
3699         return -1;
3700     }
3701
3702     return stream->total_out;
3703 }
3704
3705 static void *do_data_decompress(void *opaque)
3706 {
3707     DecompressParam *param = opaque;
3708     unsigned long pagesize;
3709     uint8_t *des;
3710     int len, ret;
3711
3712     qemu_mutex_lock(&param->mutex);
3713     while (!param->quit) {
3714         if (param->des) {
3715             des = param->des;
3716             len = param->len;
3717             param->des = 0;
3718             qemu_mutex_unlock(&param->mutex);
3719
3720             pagesize = TARGET_PAGE_SIZE;
3721
3722             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3723                                        param->compbuf, len);
3724             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3725                 error_report("decompress data failed");
3726                 qemu_file_set_error(decomp_file, ret);
3727             }
3728
3729             qemu_mutex_lock(&decomp_done_lock);
3730             param->done = true;
3731             qemu_cond_signal(&decomp_done_cond);
3732             qemu_mutex_unlock(&decomp_done_lock);
3733
3734             qemu_mutex_lock(&param->mutex);
3735         } else {
3736             qemu_cond_wait(&param->cond, &param->mutex);
3737         }
3738     }
3739     qemu_mutex_unlock(&param->mutex);
3740
3741     return NULL;
3742 }
3743
3744 static int wait_for_decompress_done(void)
3745 {
3746     int idx, thread_count;
3747
3748     if (!migrate_use_compression()) {
3749         return 0;
3750     }
3751
3752     thread_count = migrate_decompress_threads();
3753     qemu_mutex_lock(&decomp_done_lock);
3754     for (idx = 0; idx < thread_count; idx++) {
3755         while (!decomp_param[idx].done) {
3756             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3757         }
3758     }
3759     qemu_mutex_unlock(&decomp_done_lock);
3760     return qemu_file_get_error(decomp_file);
3761 }
3762
3763 static void compress_threads_load_cleanup(void)
3764 {
3765     int i, thread_count;
3766
3767     if (!migrate_use_compression()) {
3768         return;
3769     }
3770     thread_count = migrate_decompress_threads();
3771     for (i = 0; i < thread_count; i++) {
3772         /*
3773          * we use it as a indicator which shows if the thread is
3774          * properly init'd or not
3775          */
3776         if (!decomp_param[i].compbuf) {
3777             break;
3778         }
3779
3780         qemu_mutex_lock(&decomp_param[i].mutex);
3781         decomp_param[i].quit = true;
3782         qemu_cond_signal(&decomp_param[i].cond);
3783         qemu_mutex_unlock(&decomp_param[i].mutex);
3784     }
3785     for (i = 0; i < thread_count; i++) {
3786         if (!decomp_param[i].compbuf) {
3787             break;
3788         }
3789
3790         qemu_thread_join(decompress_threads + i);
3791         qemu_mutex_destroy(&decomp_param[i].mutex);
3792         qemu_cond_destroy(&decomp_param[i].cond);
3793         inflateEnd(&decomp_param[i].stream);
3794         g_free(decomp_param[i].compbuf);
3795         decomp_param[i].compbuf = NULL;
3796     }
3797     g_free(decompress_threads);
3798     g_free(decomp_param);
3799     decompress_threads = NULL;
3800     decomp_param = NULL;
3801     decomp_file = NULL;
3802 }
3803
3804 static int compress_threads_load_setup(QEMUFile *f)
3805 {
3806     int i, thread_count;
3807
3808     if (!migrate_use_compression()) {
3809         return 0;
3810     }
3811
3812     thread_count = migrate_decompress_threads();
3813     decompress_threads = g_new0(QemuThread, thread_count);
3814     decomp_param = g_new0(DecompressParam, thread_count);
3815     qemu_mutex_init(&decomp_done_lock);
3816     qemu_cond_init(&decomp_done_cond);
3817     decomp_file = f;
3818     for (i = 0; i < thread_count; i++) {
3819         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3820             goto exit;
3821         }
3822
3823         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3824         qemu_mutex_init(&decomp_param[i].mutex);
3825         qemu_cond_init(&decomp_param[i].cond);
3826         decomp_param[i].done = true;
3827         decomp_param[i].quit = false;
3828         qemu_thread_create(decompress_threads + i, "decompress",
3829                            do_data_decompress, decomp_param + i,
3830                            QEMU_THREAD_JOINABLE);
3831     }
3832     return 0;
3833 exit:
3834     compress_threads_load_cleanup();
3835     return -1;
3836 }
3837
3838 static void decompress_data_with_multi_threads(QEMUFile *f,
3839                                                void *host, int len)
3840 {
3841     int idx, thread_count;
3842
3843     thread_count = migrate_decompress_threads();
3844     QEMU_LOCK_GUARD(&decomp_done_lock);
3845     while (true) {
3846         for (idx = 0; idx < thread_count; idx++) {
3847             if (decomp_param[idx].done) {
3848                 decomp_param[idx].done = false;
3849                 qemu_mutex_lock(&decomp_param[idx].mutex);
3850                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3851                 decomp_param[idx].des = host;
3852                 decomp_param[idx].len = len;
3853                 qemu_cond_signal(&decomp_param[idx].cond);
3854                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3855                 break;
3856             }
3857         }
3858         if (idx < thread_count) {
3859             break;
3860         } else {
3861             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3862         }
3863     }
3864 }
3865
3866 static void colo_init_ram_state(void)
3867 {
3868     ram_state_init(&ram_state);
3869 }
3870
3871 /*
3872  * colo cache: this is for secondary VM, we cache the whole
3873  * memory of the secondary VM, it is need to hold the global lock
3874  * to call this helper.
3875  */
3876 int colo_init_ram_cache(void)
3877 {
3878     RAMBlock *block;
3879
3880     WITH_RCU_READ_LOCK_GUARD() {
3881         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3882             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3883                                                     NULL, false, false);
3884             if (!block->colo_cache) {
3885                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3886                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3887                              block->used_length);
3888                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3889                     if (block->colo_cache) {
3890                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3891                         block->colo_cache = NULL;
3892                     }
3893                 }
3894                 return -errno;
3895             }
3896             if (!machine_dump_guest_core(current_machine)) {
3897                 qemu_madvise(block->colo_cache, block->used_length,
3898                              QEMU_MADV_DONTDUMP);
3899             }
3900         }
3901     }
3902
3903     /*
3904     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3905     * with to decide which page in cache should be flushed into SVM's RAM. Here
3906     * we use the same name 'ram_bitmap' as for migration.
3907     */
3908     if (ram_bytes_total()) {
3909         RAMBlock *block;
3910
3911         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3912             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3913             block->bmap = bitmap_new(pages);
3914         }
3915     }
3916
3917     colo_init_ram_state();
3918     return 0;
3919 }
3920
3921 /* TODO: duplicated with ram_init_bitmaps */
3922 void colo_incoming_start_dirty_log(void)
3923 {
3924     RAMBlock *block = NULL;
3925     /* For memory_global_dirty_log_start below. */
3926     qemu_mutex_lock_iothread();
3927     qemu_mutex_lock_ramlist();
3928
3929     memory_global_dirty_log_sync();
3930     WITH_RCU_READ_LOCK_GUARD() {
3931         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3932             ramblock_sync_dirty_bitmap(ram_state, block);
3933             /* Discard this dirty bitmap record */
3934             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3935         }
3936         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3937     }
3938     ram_state->migration_dirty_pages = 0;
3939     qemu_mutex_unlock_ramlist();
3940     qemu_mutex_unlock_iothread();
3941 }
3942
3943 /* It is need to hold the global lock to call this helper */
3944 void colo_release_ram_cache(void)
3945 {
3946     RAMBlock *block;
3947
3948     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3949     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3950         g_free(block->bmap);
3951         block->bmap = NULL;
3952     }
3953
3954     WITH_RCU_READ_LOCK_GUARD() {
3955         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3956             if (block->colo_cache) {
3957                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3958                 block->colo_cache = NULL;
3959             }
3960         }
3961     }
3962     ram_state_cleanup(&ram_state);
3963 }
3964
3965 /**
3966  * ram_load_setup: Setup RAM for migration incoming side
3967  *
3968  * Returns zero to indicate success and negative for error
3969  *
3970  * @f: QEMUFile where to receive the data
3971  * @opaque: RAMState pointer
3972  */
3973 static int ram_load_setup(QEMUFile *f, void *opaque)
3974 {
3975     if (compress_threads_load_setup(f)) {
3976         return -1;
3977     }
3978
3979     xbzrle_load_setup();
3980     ramblock_recv_map_init();
3981
3982     return 0;
3983 }
3984
3985 static int ram_load_cleanup(void *opaque)
3986 {
3987     RAMBlock *rb;
3988
3989     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3990         qemu_ram_block_writeback(rb);
3991     }
3992
3993     xbzrle_load_cleanup();
3994     compress_threads_load_cleanup();
3995
3996     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3997         g_free(rb->receivedmap);
3998         rb->receivedmap = NULL;
3999     }
4000
4001     return 0;
4002 }
4003
4004 /**
4005  * ram_postcopy_incoming_init: allocate postcopy data structures
4006  *
4007  * Returns 0 for success and negative if there was one error
4008  *
4009  * @mis: current migration incoming state
4010  *
4011  * Allocate data structures etc needed by incoming migration with
4012  * postcopy-ram. postcopy-ram's similarly names
4013  * postcopy_ram_incoming_init does the work.
4014  */
4015 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4016 {
4017     return postcopy_ram_incoming_init(mis);
4018 }
4019
4020 /**
4021  * ram_load_postcopy: load a page in postcopy case
4022  *
4023  * Returns 0 for success or -errno in case of error
4024  *
4025  * Called in postcopy mode by ram_load().
4026  * rcu_read_lock is taken prior to this being called.
4027  *
4028  * @f: QEMUFile where to send the data
4029  * @channel: the channel to use for loading
4030  */
4031 int ram_load_postcopy(QEMUFile *f, int channel)
4032 {
4033     int flags = 0, ret = 0;
4034     bool place_needed = false;
4035     bool matches_target_page_size = false;
4036     MigrationIncomingState *mis = migration_incoming_get_current();
4037     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
4038
4039     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4040         ram_addr_t addr;
4041         void *page_buffer = NULL;
4042         void *place_source = NULL;
4043         RAMBlock *block = NULL;
4044         uint8_t ch;
4045         int len;
4046
4047         addr = qemu_get_be64(f);
4048
4049         /*
4050          * If qemu file error, we should stop here, and then "addr"
4051          * may be invalid
4052          */
4053         ret = qemu_file_get_error(f);
4054         if (ret) {
4055             break;
4056         }
4057
4058         flags = addr & ~TARGET_PAGE_MASK;
4059         addr &= TARGET_PAGE_MASK;
4060
4061         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4062         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4063                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4064             block = ram_block_from_stream(mis, f, flags, channel);
4065             if (!block) {
4066                 ret = -EINVAL;
4067                 break;
4068             }
4069
4070             /*
4071              * Relying on used_length is racy and can result in false positives.
4072              * We might place pages beyond used_length in case RAM was shrunk
4073              * while in postcopy, which is fine - trying to place via
4074              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4075              */
4076             if (!block->host || addr >= block->postcopy_length) {
4077                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4078                 ret = -EINVAL;
4079                 break;
4080             }
4081             tmp_page->target_pages++;
4082             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4083             /*
4084              * Postcopy requires that we place whole host pages atomically;
4085              * these may be huge pages for RAMBlocks that are backed by
4086              * hugetlbfs.
4087              * To make it atomic, the data is read into a temporary page
4088              * that's moved into place later.
4089              * The migration protocol uses,  possibly smaller, target-pages
4090              * however the source ensures it always sends all the components
4091              * of a host page in one chunk.
4092              */
4093             page_buffer = tmp_page->tmp_huge_page +
4094                           host_page_offset_from_ram_block_offset(block, addr);
4095             /* If all TP are zero then we can optimise the place */
4096             if (tmp_page->target_pages == 1) {
4097                 tmp_page->host_addr =
4098                     host_page_from_ram_block_offset(block, addr);
4099             } else if (tmp_page->host_addr !=
4100                        host_page_from_ram_block_offset(block, addr)) {
4101                 /* not the 1st TP within the HP */
4102                 error_report("Non-same host page detected on channel %d: "
4103                              "Target host page %p, received host page %p "
4104                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4105                              channel, tmp_page->host_addr,
4106                              host_page_from_ram_block_offset(block, addr),
4107                              block->idstr, addr, tmp_page->target_pages);
4108                 ret = -EINVAL;
4109                 break;
4110             }
4111
4112             /*
4113              * If it's the last part of a host page then we place the host
4114              * page
4115              */
4116             if (tmp_page->target_pages ==
4117                 (block->page_size / TARGET_PAGE_SIZE)) {
4118                 place_needed = true;
4119             }
4120             place_source = tmp_page->tmp_huge_page;
4121         }
4122
4123         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4124         case RAM_SAVE_FLAG_ZERO:
4125             ch = qemu_get_byte(f);
4126             /*
4127              * Can skip to set page_buffer when
4128              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4129              */
4130             if (ch || !matches_target_page_size) {
4131                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4132             }
4133             if (ch) {
4134                 tmp_page->all_zero = false;
4135             }
4136             break;
4137
4138         case RAM_SAVE_FLAG_PAGE:
4139             tmp_page->all_zero = false;
4140             if (!matches_target_page_size) {
4141                 /* For huge pages, we always use temporary buffer */
4142                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4143             } else {
4144                 /*
4145                  * For small pages that matches target page size, we
4146                  * avoid the qemu_file copy.  Instead we directly use
4147                  * the buffer of QEMUFile to place the page.  Note: we
4148                  * cannot do any QEMUFile operation before using that
4149                  * buffer to make sure the buffer is valid when
4150                  * placing the page.
4151                  */
4152                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4153                                          TARGET_PAGE_SIZE);
4154             }
4155             break;
4156         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4157             tmp_page->all_zero = false;
4158             len = qemu_get_be32(f);
4159             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4160                 error_report("Invalid compressed data length: %d", len);
4161                 ret = -EINVAL;
4162                 break;
4163             }
4164             decompress_data_with_multi_threads(f, page_buffer, len);
4165             break;
4166
4167         case RAM_SAVE_FLAG_EOS:
4168             /* normal exit */
4169             multifd_recv_sync_main();
4170             break;
4171         default:
4172             error_report("Unknown combination of migration flags: 0x%x"
4173                          " (postcopy mode)", flags);
4174             ret = -EINVAL;
4175             break;
4176         }
4177
4178         /* Got the whole host page, wait for decompress before placing. */
4179         if (place_needed) {
4180             ret |= wait_for_decompress_done();
4181         }
4182
4183         /* Detect for any possible file errors */
4184         if (!ret && qemu_file_get_error(f)) {
4185             ret = qemu_file_get_error(f);
4186         }
4187
4188         if (!ret && place_needed) {
4189             if (tmp_page->all_zero) {
4190                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4191             } else {
4192                 ret = postcopy_place_page(mis, tmp_page->host_addr,
4193                                           place_source, block);
4194             }
4195             place_needed = false;
4196             postcopy_temp_page_reset(tmp_page);
4197         }
4198     }
4199
4200     return ret;
4201 }
4202
4203 static bool postcopy_is_running(void)
4204 {
4205     PostcopyState ps = postcopy_state_get();
4206     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4207 }
4208
4209 /*
4210  * Flush content of RAM cache into SVM's memory.
4211  * Only flush the pages that be dirtied by PVM or SVM or both.
4212  */
4213 void colo_flush_ram_cache(void)
4214 {
4215     RAMBlock *block = NULL;
4216     void *dst_host;
4217     void *src_host;
4218     unsigned long offset = 0;
4219
4220     memory_global_dirty_log_sync();
4221     WITH_RCU_READ_LOCK_GUARD() {
4222         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4223             ramblock_sync_dirty_bitmap(ram_state, block);
4224         }
4225     }
4226
4227     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4228     WITH_RCU_READ_LOCK_GUARD() {
4229         block = QLIST_FIRST_RCU(&ram_list.blocks);
4230
4231         while (block) {
4232             unsigned long num = 0;
4233
4234             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4235             if (!offset_in_ramblock(block,
4236                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4237                 offset = 0;
4238                 num = 0;
4239                 block = QLIST_NEXT_RCU(block, next);
4240             } else {
4241                 unsigned long i = 0;
4242
4243                 for (i = 0; i < num; i++) {
4244                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
4245                 }
4246                 dst_host = block->host
4247                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4248                 src_host = block->colo_cache
4249                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4250                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4251                 offset += num;
4252             }
4253         }
4254     }
4255     trace_colo_flush_ram_cache_end();
4256 }
4257
4258 /**
4259  * ram_load_precopy: load pages in precopy case
4260  *
4261  * Returns 0 for success or -errno in case of error
4262  *
4263  * Called in precopy mode by ram_load().
4264  * rcu_read_lock is taken prior to this being called.
4265  *
4266  * @f: QEMUFile where to send the data
4267  */
4268 static int ram_load_precopy(QEMUFile *f)
4269 {
4270     MigrationIncomingState *mis = migration_incoming_get_current();
4271     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4272     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4273     bool postcopy_advised = migration_incoming_postcopy_advised();
4274     if (!migrate_use_compression()) {
4275         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4276     }
4277
4278     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4279         ram_addr_t addr, total_ram_bytes;
4280         void *host = NULL, *host_bak = NULL;
4281         uint8_t ch;
4282
4283         /*
4284          * Yield periodically to let main loop run, but an iteration of
4285          * the main loop is expensive, so do it each some iterations
4286          */
4287         if ((i & 32767) == 0 && qemu_in_coroutine()) {
4288             aio_co_schedule(qemu_get_current_aio_context(),
4289                             qemu_coroutine_self());
4290             qemu_coroutine_yield();
4291         }
4292         i++;
4293
4294         addr = qemu_get_be64(f);
4295         flags = addr & ~TARGET_PAGE_MASK;
4296         addr &= TARGET_PAGE_MASK;
4297
4298         if (flags & invalid_flags) {
4299             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4300                 error_report("Received an unexpected compressed page");
4301             }
4302
4303             ret = -EINVAL;
4304             break;
4305         }
4306
4307         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4308                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4309             RAMBlock *block = ram_block_from_stream(mis, f, flags,
4310                                                     RAM_CHANNEL_PRECOPY);
4311
4312             host = host_from_ram_block_offset(block, addr);
4313             /*
4314              * After going into COLO stage, we should not load the page
4315              * into SVM's memory directly, we put them into colo_cache firstly.
4316              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
4317              * Previously, we copied all these memory in preparing stage of COLO
4318              * while we need to stop VM, which is a time-consuming process.
4319              * Here we optimize it by a trick, back-up every page while in
4320              * migration process while COLO is enabled, though it affects the
4321              * speed of the migration, but it obviously reduce the downtime of
4322              * back-up all SVM'S memory in COLO preparing stage.
4323              */
4324             if (migration_incoming_colo_enabled()) {
4325                 if (migration_incoming_in_colo_state()) {
4326                     /* In COLO stage, put all pages into cache temporarily */
4327                     host = colo_cache_from_block_offset(block, addr, true);
4328                 } else {
4329                    /*
4330                     * In migration stage but before COLO stage,
4331                     * Put all pages into both cache and SVM's memory.
4332                     */
4333                     host_bak = colo_cache_from_block_offset(block, addr, false);
4334                 }
4335             }
4336             if (!host) {
4337                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4338                 ret = -EINVAL;
4339                 break;
4340             }
4341             if (!migration_incoming_in_colo_state()) {
4342                 ramblock_recv_bitmap_set(block, host);
4343             }
4344
4345             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4346         }
4347
4348         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4349         case RAM_SAVE_FLAG_MEM_SIZE:
4350             /* Synchronize RAM block list */
4351             total_ram_bytes = addr;
4352             while (!ret && total_ram_bytes) {
4353                 RAMBlock *block;
4354                 char id[256];
4355                 ram_addr_t length;
4356
4357                 len = qemu_get_byte(f);
4358                 qemu_get_buffer(f, (uint8_t *)id, len);
4359                 id[len] = 0;
4360                 length = qemu_get_be64(f);
4361
4362                 block = qemu_ram_block_by_name(id);
4363                 if (block && !qemu_ram_is_migratable(block)) {
4364                     error_report("block %s should not be migrated !", id);
4365                     ret = -EINVAL;
4366                 } else if (block) {
4367                     if (length != block->used_length) {
4368                         Error *local_err = NULL;
4369
4370                         ret = qemu_ram_resize(block, length,
4371                                               &local_err);
4372                         if (local_err) {
4373                             error_report_err(local_err);
4374                         }
4375                     }
4376                     /* For postcopy we need to check hugepage sizes match */
4377                     if (postcopy_advised && migrate_postcopy_ram() &&
4378                         block->page_size != qemu_host_page_size) {
4379                         uint64_t remote_page_size = qemu_get_be64(f);
4380                         if (remote_page_size != block->page_size) {
4381                             error_report("Mismatched RAM page size %s "
4382                                          "(local) %zd != %" PRId64,
4383                                          id, block->page_size,
4384                                          remote_page_size);
4385                             ret = -EINVAL;
4386                         }
4387                     }
4388                     if (migrate_ignore_shared()) {
4389                         hwaddr addr = qemu_get_be64(f);
4390                         if (ramblock_is_ignored(block) &&
4391                             block->mr->addr != addr) {
4392                             error_report("Mismatched GPAs for block %s "
4393                                          "%" PRId64 "!= %" PRId64,
4394                                          id, (uint64_t)addr,
4395                                          (uint64_t)block->mr->addr);
4396                             ret = -EINVAL;
4397                         }
4398                     }
4399                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4400                                           block->idstr);
4401                 } else {
4402                     error_report("Unknown ramblock \"%s\", cannot "
4403                                  "accept migration", id);
4404                     ret = -EINVAL;
4405                 }
4406
4407                 total_ram_bytes -= length;
4408             }
4409             break;
4410
4411         case RAM_SAVE_FLAG_ZERO:
4412             ch = qemu_get_byte(f);
4413             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4414             break;
4415
4416         case RAM_SAVE_FLAG_PAGE:
4417             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4418             break;
4419
4420         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4421             len = qemu_get_be32(f);
4422             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4423                 error_report("Invalid compressed data length: %d", len);
4424                 ret = -EINVAL;
4425                 break;
4426             }
4427             decompress_data_with_multi_threads(f, host, len);
4428             break;
4429
4430         case RAM_SAVE_FLAG_XBZRLE:
4431             if (load_xbzrle(f, addr, host) < 0) {
4432                 error_report("Failed to decompress XBZRLE page at "
4433                              RAM_ADDR_FMT, addr);
4434                 ret = -EINVAL;
4435                 break;
4436             }
4437             break;
4438         case RAM_SAVE_FLAG_EOS:
4439             /* normal exit */
4440             multifd_recv_sync_main();
4441             break;
4442         default:
4443             if (flags & RAM_SAVE_FLAG_HOOK) {
4444                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4445             } else {
4446                 error_report("Unknown combination of migration flags: 0x%x",
4447                              flags);
4448                 ret = -EINVAL;
4449             }
4450         }
4451         if (!ret) {
4452             ret = qemu_file_get_error(f);
4453         }
4454         if (!ret && host_bak) {
4455             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4456         }
4457     }
4458
4459     ret |= wait_for_decompress_done();
4460     return ret;
4461 }
4462
4463 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4464 {
4465     int ret = 0;
4466     static uint64_t seq_iter;
4467     /*
4468      * If system is running in postcopy mode, page inserts to host memory must
4469      * be atomic
4470      */
4471     bool postcopy_running = postcopy_is_running();
4472
4473     seq_iter++;
4474
4475     if (version_id != 4) {
4476         return -EINVAL;
4477     }
4478
4479     /*
4480      * This RCU critical section can be very long running.
4481      * When RCU reclaims in the code start to become numerous,
4482      * it will be necessary to reduce the granularity of this
4483      * critical section.
4484      */
4485     WITH_RCU_READ_LOCK_GUARD() {
4486         if (postcopy_running) {
4487             /*
4488              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4489              * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4490              * service fast page faults.
4491              */
4492             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4493         } else {
4494             ret = ram_load_precopy(f);
4495         }
4496     }
4497     trace_ram_load_complete(ret, seq_iter);
4498
4499     return ret;
4500 }
4501
4502 static bool ram_has_postcopy(void *opaque)
4503 {
4504     RAMBlock *rb;
4505     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4506         if (ramblock_is_pmem(rb)) {
4507             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4508                          "is not supported now!", rb->idstr, rb->host);
4509             return false;
4510         }
4511     }
4512
4513     return migrate_postcopy_ram();
4514 }
4515
4516 /* Sync all the dirty bitmap with destination VM.  */
4517 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4518 {
4519     RAMBlock *block;
4520     QEMUFile *file = s->to_dst_file;
4521     int ramblock_count = 0;
4522
4523     trace_ram_dirty_bitmap_sync_start();
4524
4525     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4526         qemu_savevm_send_recv_bitmap(file, block->idstr);
4527         trace_ram_dirty_bitmap_request(block->idstr);
4528         ramblock_count++;
4529     }
4530
4531     trace_ram_dirty_bitmap_sync_wait();
4532
4533     /* Wait until all the ramblocks' dirty bitmap synced */
4534     while (ramblock_count--) {
4535         qemu_sem_wait(&s->rp_state.rp_sem);
4536     }
4537
4538     trace_ram_dirty_bitmap_sync_complete();
4539
4540     return 0;
4541 }
4542
4543 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4544 {
4545     qemu_sem_post(&s->rp_state.rp_sem);
4546 }
4547
4548 /*
4549  * Read the received bitmap, revert it as the initial dirty bitmap.
4550  * This is only used when the postcopy migration is paused but wants
4551  * to resume from a middle point.
4552  */
4553 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4554 {
4555     int ret = -EINVAL;
4556     /* from_dst_file is always valid because we're within rp_thread */
4557     QEMUFile *file = s->rp_state.from_dst_file;
4558     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4559     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4560     uint64_t size, end_mark;
4561
4562     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4563
4564     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4565         error_report("%s: incorrect state %s", __func__,
4566                      MigrationStatus_str(s->state));
4567         return -EINVAL;
4568     }
4569
4570     /*
4571      * Note: see comments in ramblock_recv_bitmap_send() on why we
4572      * need the endianness conversion, and the paddings.
4573      */
4574     local_size = ROUND_UP(local_size, 8);
4575
4576     /* Add paddings */
4577     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4578
4579     size = qemu_get_be64(file);
4580
4581     /* The size of the bitmap should match with our ramblock */
4582     if (size != local_size) {
4583         error_report("%s: ramblock '%s' bitmap size mismatch "
4584                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4585                      block->idstr, size, local_size);
4586         ret = -EINVAL;
4587         goto out;
4588     }
4589
4590     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4591     end_mark = qemu_get_be64(file);
4592
4593     ret = qemu_file_get_error(file);
4594     if (ret || size != local_size) {
4595         error_report("%s: read bitmap failed for ramblock '%s': %d"
4596                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4597                      __func__, block->idstr, ret, local_size, size);
4598         ret = -EIO;
4599         goto out;
4600     }
4601
4602     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4603         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4604                      __func__, block->idstr, end_mark);
4605         ret = -EINVAL;
4606         goto out;
4607     }
4608
4609     /*
4610      * Endianness conversion. We are during postcopy (though paused).
4611      * The dirty bitmap won't change. We can directly modify it.
4612      */
4613     bitmap_from_le(block->bmap, le_bitmap, nbits);
4614
4615     /*
4616      * What we received is "received bitmap". Revert it as the initial
4617      * dirty bitmap for this ramblock.
4618      */
4619     bitmap_complement(block->bmap, block->bmap, nbits);
4620
4621     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4622     ramblock_dirty_bitmap_clear_discarded_pages(block);
4623
4624     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4625     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4626
4627     /*
4628      * We succeeded to sync bitmap for current ramblock. If this is
4629      * the last one to sync, we need to notify the main send thread.
4630      */
4631     ram_dirty_bitmap_reload_notify(s);
4632
4633     ret = 0;
4634 out:
4635     g_free(le_bitmap);
4636     return ret;
4637 }
4638
4639 static int ram_resume_prepare(MigrationState *s, void *opaque)
4640 {
4641     RAMState *rs = *(RAMState **)opaque;
4642     int ret;
4643
4644     ret = ram_dirty_bitmap_sync_all(s, rs);
4645     if (ret) {
4646         return ret;
4647     }
4648
4649     ram_state_resume_prepare(rs, s->to_dst_file);
4650
4651     return 0;
4652 }
4653
4654 void postcopy_preempt_shutdown_file(MigrationState *s)
4655 {
4656     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4657     qemu_fflush(s->postcopy_qemufile_src);
4658 }
4659
4660 static SaveVMHandlers savevm_ram_handlers = {
4661     .save_setup = ram_save_setup,
4662     .save_live_iterate = ram_save_iterate,
4663     .save_live_complete_postcopy = ram_save_complete,
4664     .save_live_complete_precopy = ram_save_complete,
4665     .has_postcopy = ram_has_postcopy,
4666     .state_pending_exact = ram_state_pending_exact,
4667     .state_pending_estimate = ram_state_pending_estimate,
4668     .load_state = ram_load,
4669     .save_cleanup = ram_save_cleanup,
4670     .load_setup = ram_load_setup,
4671     .load_cleanup = ram_load_cleanup,
4672     .resume_prepare = ram_resume_prepare,
4673 };
4674
4675 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4676                                       size_t old_size, size_t new_size)
4677 {
4678     PostcopyState ps = postcopy_state_get();
4679     ram_addr_t offset;
4680     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4681     Error *err = NULL;
4682
4683     if (ramblock_is_ignored(rb)) {
4684         return;
4685     }
4686
4687     if (!migration_is_idle()) {
4688         /*
4689          * Precopy code on the source cannot deal with the size of RAM blocks
4690          * changing at random points in time - especially after sending the
4691          * RAM block sizes in the migration stream, they must no longer change.
4692          * Abort and indicate a proper reason.
4693          */
4694         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4695         migration_cancel(err);
4696         error_free(err);
4697     }
4698
4699     switch (ps) {
4700     case POSTCOPY_INCOMING_ADVISE:
4701         /*
4702          * Update what ram_postcopy_incoming_init()->init_range() does at the
4703          * time postcopy was advised. Syncing RAM blocks with the source will
4704          * result in RAM resizes.
4705          */
4706         if (old_size < new_size) {
4707             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4708                 error_report("RAM block '%s' discard of resized RAM failed",
4709                              rb->idstr);
4710             }
4711         }
4712         rb->postcopy_length = new_size;
4713         break;
4714     case POSTCOPY_INCOMING_NONE:
4715     case POSTCOPY_INCOMING_RUNNING:
4716     case POSTCOPY_INCOMING_END:
4717         /*
4718          * Once our guest is running, postcopy does no longer care about
4719          * resizes. When growing, the new memory was not available on the
4720          * source, no handler needed.
4721          */
4722         break;
4723     default:
4724         error_report("RAM block '%s' resized during postcopy state: %d",
4725                      rb->idstr, ps);
4726         exit(-1);
4727     }
4728 }
4729
4730 static RAMBlockNotifier ram_mig_ram_notifier = {
4731     .ram_block_resized = ram_mig_ram_block_resized,
4732 };
4733
4734 void ram_mig_init(void)
4735 {
4736     qemu_mutex_init(&XBZRLE.lock);
4737     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4738     ram_block_notifier_add(&ram_mig_ram_notifier);
4739 }